## 1. Data Generation & Dataset Creation

### 1.1. Data Loading and Preprocessing

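Make sure to upload your Markdown (`.md`) and PDF (`.pdf`) files into the `data/` directory before running the next cell (the loader reads the relative path `data`).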

In [None]:
import os
import re
from PyPDF2 import PdfReader

def load_and_preprocess_data(data_dir):
    """Load and clean every Markdown and PDF file found directly in ``data_dir``.

    Markdown files have leading ``#`` characters and whitespace stripped from
    the start of every line (which also removes blank lines). PDF files have
    the text of all pages joined, runs of newlines collapsed to one newline
    and runs of spaces collapsed to one space. Extensions are matched
    case-insensitively; anything that is not a regular ``.md``/``.pdf`` file
    is ignored. A PDF that cannot be read is reported with ``print`` and
    skipped (best effort).

    Args:
        data_dir: Path of the directory to scan (not recursive).

    Returns:
        A list of ``{"filename": str, "text": str}`` dicts, ordered by
        filename.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(data_dir)):
        filepath = os.path.join(data_dir, filename)
        # A directory called "notes.md" must not be passed to open().
        if not os.path.isfile(filepath):
            continue

        lowered = filename.lower()
        if lowered.endswith(".md"):
            with open(filepath, "r", encoding="utf-8") as f:
                text = f.read()
            text = re.sub(r"^[#\s]+", "", text, flags=re.MULTILINE)
            documents.append({"filename": filename, "text": text})
        elif lowered.endswith(".pdf"):
            try:
                with open(filepath, "rb") as f:
                    pdf_reader = PdfReader(f)
                    # extract_text() may yield nothing for image-only pages;
                    # treat that as an empty page instead of failing the file.
                    page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
            except Exception as e:
                # Best effort: one broken PDF must not abort the whole load.
                print(f"Error reading PDF {filename}: {e}")
                continue

            # Join with a newline so the last word of one page is not fused
            # with the first word of the next.
            text = "\n".join(page_texts)
            text = re.sub(r"\n+", "\n", text)
            text = re.sub(r" +", " ", text)
            documents.append({"filename": filename, "text": text})
    return documents


# Directory (relative to the current working directory) that is scanned for
# .md and .pdf input files.
data_dir = "data"
# Load and clean every supported file; the QA-generation cell reads this list.
documents = load_and_preprocess_data(data_dir)
# Uncomment to inspect the first document:
# print(documents[0]['text'][:500])

### 1.2. Synthetic Data Generation (using Qwen2.5-3B-Instruct itself, initially)

In [None]:
from transformers import pipeline, AutoTokenizer

def generate_qa_pairs(documents, model_name="Qwen/Qwen2.5-3B-Instruct", num_questions_per_doc=5):
    """Generate question/answer pairs from documents with a text-generation model.

    Each document's text is cut into 4096-character chunks and the model is
    prompted once per chunk, so ``num_questions_per_doc`` is the number of
    pairs requested per chunk. The completion is parsed for
    ``Q: ... / A: ...`` pairs.

    Args:
        documents: Iterable of dicts with ``"text"`` and ``"filename"`` keys.
        model_name: Hugging Face model id passed to ``pipeline``.
        num_questions_per_doc: Number of pairs requested in each prompt.

    Returns:
        A list of ``{"question", "answer", "source"}`` dicts, where
        ``source`` is the originating document's filename.
    """
    import torch  # local import: only needed here to choose the device

    # Honour "GPU if available": device=0 raises on a CPU-only machine.
    device = 0 if torch.cuda.is_available() else -1
    generator = pipeline('text-generation', model=model_name, device=device)
    # The pipeline already owns a tokenizer; no need to load a second copy.
    eos_token_id = generator.tokenizer.eos_token_id
    qa_pattern = re.compile(r"Q: (.*?)\nA: (.*?)(?=\nQ:|\Z)", re.DOTALL)

    qa_pairs = []
    for doc in documents:

        text_chunks = [doc['text'][i:i+4096] for i in range(0, len(doc['text']), 4096)]

        for chunk in text_chunks:
            # Nothing to ask about in a whitespace-only chunk.
            if not chunk.strip():
                continue

            prompt = f"""
            Context:
            {chunk}

            Based on the above context, generate {num_questions_per_doc} question and answer pairs.
            Format them strictly as follows:

            Q: [Question 1]
            A: [Answer 1]

            Q: [Question 2]
            A: [Answer 2]

            ...
            """

            generated_text = generator(
                prompt,
                # max_length would count the (long) prompt too and could leave
                # no room to generate; budget new tokens only.
                max_new_tokens=1024,
                # Return only the completion. Otherwise the regex below also
                # matches the "Q: [Question 1]" template inside the prompt.
                return_full_text=False,
                num_return_sequences=1,
                pad_token_id=eos_token_id,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                temperature=0.7
            )[0]['generated_text']

            for question, answer in qa_pattern.findall(generated_text):
                question, answer = question.strip(), answer.strip()
                # Drop malformed pairs where either side came out empty.
                if question and answer:
                    qa_pairs.append({"question": question, "answer": answer, "source": doc['filename']})
    return qa_pairs


# Request 3 pairs per prompt instead of the function's default of 5; the
# result feeds the augmentation cell below.
initial_qa_pairs = generate_qa_pairs(documents, num_questions_per_doc=3)
# Uncomment to inspect the first few generated pairs:
# print(initial_qa_pairs[:5])

### 1.3. Data Augmentation and Refinement

In [None]:
from transformers import pipeline

def paraphrase_qa(qa_pairs, model_name="google/flan-t5-base"):
    """Paraphrase each question and answer to augment the dataset.

    Every pair is sent twice through a ``text2text-generation`` pipeline
    (once for the question, once for the answer) using a ``paraphrase:``
    prompt prefix with sampling enabled.

    Args:
        qa_pairs: Iterable of dicts with ``"question"``, ``"answer"`` and
            ``"source"`` keys.
        model_name: Hugging Face model id passed to ``pipeline``.

    Returns:
        A new list with one dict per input pair containing the paraphrased
        ``"question"``/``"answer"``, the untouched ``"original_question"``/
        ``"original_answer"``, and the ``"source"``.
    """
    import torch  # local import: only needed here to choose the device

    # Use the first GPU when there is one; device=0 raises on CPU-only hosts.
    device = 0 if torch.cuda.is_available() else -1
    paraphraser = pipeline("text2text-generation", model=model_name, device=device)
    # Both calls share the same sampling settings; only max_length differs.
    sampling = {"do_sample": True, "top_k": 50, "top_p": 0.95, "temperature": 0.7}

    augmented_qa_pairs = []
    for pair in qa_pairs:
        prompt_q = f"paraphrase: {pair['question']}"
        paraphrased_question = paraphraser(prompt_q, max_length=128, **sampling)[0]['generated_text']

        prompt_a = f"paraphrase: {pair['answer']}"
        paraphrased_answer = paraphraser(prompt_a, max_length=256, **sampling)[0]['generated_text']

        augmented_qa_pairs.append({
            "question": paraphrased_question.strip(),
            "answer": paraphrased_answer.strip(),
            "original_question": pair['question'],
            "original_answer": pair['answer'],
            "source": pair['source']
        })
    return augmented_qa_pairs

# Paraphrase every generated pair, then pool the originals and their
# paraphrases into a single list (originals first).
# NOTE(review): because both versions of a pair end up in the same pool, a
# later random split can place an original and its paraphrase in different
# splits -- confirm this is acceptable for evaluation.
augmented_pairs = paraphrase_qa(initial_qa_pairs)
all_qa_pairs = [*initial_qa_pairs, *augmented_pairs]

### 1.4. Dataset Splitting

In [None]:
import random

def split_dataset(qa_pairs, train_ratio=0.8, val_ratio=0.1, seed=None):
    """Shuffle ``qa_pairs`` and split them into train, validation and test lists.

    The input list is left untouched; a shuffled copy is split instead.

    Args:
        qa_pairs: Sequence of examples to split.
        train_ratio: Fraction of examples for the training set.
        val_ratio: Fraction of examples for the validation set. Whatever
            remains after train and validation becomes the test set.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call is still honoured.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple of lists.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    # Shuffle a copy: shuffling in place would reorder the caller's list.
    shuffled = list(qa_pairs)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    train_size = int(len(shuffled) * train_ratio)
    val_size = int(len(shuffled) * val_ratio)
    train_data = shuffled[:train_size]
    val_data = shuffled[train_size:train_size + val_size]
    test_data = shuffled[train_size + val_size:]
    return train_data, val_data, test_data

# Split the pooled pairs with the default 80/10/10 ratios and report sizes.
splits = split_dataset(all_qa_pairs)
train_data, val_data, test_data = splits
print(f"Train size: {len(train_data)}, Val size: {len(val_data)}, Test size: {len(test_data)}")

### 1.5. Dataset Formatting (JSONL)

In [None]:
import json

def create_jsonl_dataset(qa_pairs, filename):
    """Write ``qa_pairs`` to ``filename`` as JSON Lines (one object per line).

    Each output object has a ``"prompt"`` built from the pair's question
    (plus its context, when the pair has a ``"context"`` key) and a
    ``"response"`` holding the pair's answer. The file is overwritten.

    Args:
        qa_pairs: Iterable of dicts with ``"question"`` and ``"answer"``
            keys and an optional ``"context"`` key.
        filename: Path of the JSONL file to create.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        for record in qa_pairs:
            # Assemble the prompt from its parts; the context is optional.
            prompt_parts = [f"Question: {record['question']}\n"]
            if 'context' in record:
                prompt_parts.append(f"Context: {record['context']}\n")

            line = json.dumps({
                "prompt": "".join(prompt_parts),
                "response": record['answer'],
            })
            out_file.write(line + "\n")

# Write each split to its own JSONL file in the working directory.
for split_rows, out_path in (
    (train_data, "train.jsonl"),
    (val_data, "val.jsonl"),
    (test_data, "test.jsonl"),
):
    create_jsonl_dataset(split_rows, out_path)

## 2. Model Selection and Preparation

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Hugging Face id of the base model; also read by the fine-tuning cell.
model_name = "Qwen/Qwen2.5-3B-Instruct"

# Quantization configuration: 4-bit NF4 weights with double quantization,
# computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized model and its tokenizer.
# NOTE(review): the fine-tuning cell rebinds both `model` and `tokenizer` via
# FastLanguageModel.from_pretrained, so this load looks redundant and may hold
# a second copy of the weights until it is released -- confirm it is needed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # let the loader decide device placement
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): left padding is the usual choice for batched generation;
# confirm it is also what is wanted if this tokenizer is used for training.
tokenizer.padding_side = "left"

## 3. Efficient Fine-tuning (QLoRA)

In [None]:
# Notebook-only cell: these lines are shell/magic commands, not Python.
# NOTE(review): they run after earlier cells have already imported
# `transformers`; a runtime restart may be needed for the upgrade to take
# effect -- consider moving this cell to the top of the notebook.
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
pip install --upgrade transformers

In [None]:
from unsloth import FastLanguageModel
from trl import SFTTrainer

# TrainingArguments is used below but was not imported anywhere in this file,
# so the cell failed with NameError before training could start.
from transformers import TrainingArguments

# One sequence-length limit shared by model loading and the trainer.
MAX_SEQ_LENGTH = 2048

# Load the base model in 4-bit through Unsloth; this rebinds the `model` and
# `tokenizer` names created in the previous section.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = torch.bfloat16,
    load_in_4bit = True,
)


# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    lora_dropout = 0.05,
    bias = "none",
)


from datasets import load_dataset


def _with_training_text(example):
    """Join prompt and response (plus EOS) into the single field the trainer reads.

    Training on the "prompt" column alone would only ever show the model the
    questions; the answers live in the separate "response" column.
    """
    return {"text": example["prompt"] + example["response"] + tokenizer.eos_token}


train_dataset = load_dataset("json", data_files="train.jsonl", split="train").map(_with_training_text)
val_dataset = load_dataset("json", data_files="val.jsonl", split="train").map(_with_training_text)

# Training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; the pip cell upgrades transformers, so confirm which
# spelling the installed version accepts.
training_args = TrainingArguments(
    output_dir="./qwen-finetuned",  # Output directory
    per_device_train_batch_size=4,  # Batch size per GPU
    gradient_accumulation_steps=4,   # Accumulate gradients over several steps
    learning_rate=2e-4,             # Learning rate
    fp16=False,                     # fp16 disabled in favour of bf16 below
    bf16=True,                      # Train in bfloat16
    logging_steps=10,             # Log training information
    save_steps=100,               # Checkpoint interval
    evaluation_strategy="steps",      # Evaluate during training
    eval_steps=100,                # Evaluation interval
    num_train_epochs=3,          # Number of training epochs (adjust as needed)
    warmup_steps=100,              # Warmup steps for learning rate scheduler
    lr_scheduler_type="cosine",      # Learning rate scheduler
    # NOTE(review): keeps the raw string columns in the dataset; confirm the
    # installed trl version's collator tolerates that.
    remove_unused_columns=False,
    report_to="tensorboard",
)



trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",  # prompt + response, built by _with_training_text
    packing=False,
    max_seq_length=MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
)


# Train the model
trainer.train()

# Save the LoRA adapters
trainer.save_model("./qwen-finetuned-adapters")