In [23]:
!pip uninstall -y fastai timm textblob peft colbert colbert-ai unsloth unsloth_zoo
!pip uninstall -y torch torchvision torchaudio xformers nltk trl transformers accelerate datasets sentence-transformers bitsandbytes rank_bm25 llama-cpp-python
!pip cache purge

print("\n[INFO] Installing required packages...please wait.")
!pip install --quiet --no-cache-dir bitsandbytes
!pip install --quiet --no-cache-dir --upgrade git+https://github.com/unslothai/unsloth.git
!pip install --quiet --no-cache-dir unsloth_zoo
!pip install --quiet --no-cache-dir PyPDF2
!pip install --quiet --no-cache-dir faiss-cpu sentence-transformers
!pip install --quiet --no-cache-dir rank_bm25
!pip install --quiet --no-cache-dir llama-cpp-python
!pip install --quiet --no-cache-dir trl datasets xformers accelerate huggingface_hub
!pip install --quiet --no-cache-dir transformers
!pip install --quiet --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install --quiet --no-cache-dir nltk
!pip install --quiet --no-cache-dir peft numpy scipy pandas matplotlib sentencepiece pyyaml
!pip install --quiet --no-cache-dir rouge-score evaluate
!pip install --quiet --no-cache-dir --upgrade torch torchvision torchaudio transformers

print("[INFO] Downloading NLTK data to avoid LookupError.")
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
print("\n✅ All libraries installed & NLTK resources downloaded. Please ensure GPU is enabled. Restart if needed.")


Found existing installation: fastai 2.7.18
Uninstalling fastai-2.7.18:
  Successfully uninstalled fastai-2.7.18
Found existing installation: timm 1.0.15
Uninstalling timm-1.0.15:
  Successfully uninstalled timm-1.0.15
Found existing installation: textblob 0.19.0
Uninstalling textblob-0.19.0:
  Successfully uninstalled textblob-0.19.0
Found existing installation: peft 0.14.0
Uninstalling peft-0.14.0:
  Successfully uninstalled peft-0.14.0
[0mFound existing installation: unsloth 2025.3.9
Uninstalling unsloth-2025.3.9:
  Successfully uninstalled unsloth-2025.3.9
Found existing installation: unsloth_zoo 2025.3.8
Uninstalling unsloth_zoo-2025.3.8:
  Successfully uninstalled unsloth_zoo-2025.3.8
Found existing installation: torch 2.6.0
Uninstalling torch-2.6.0:
  Successfully uninstalled torch-2.6.0
Found existing installation: torchvision 0.21.0
Uninstalling torchvision-0.21.0:
  Successfully uninstalled torchvision-0.21.0
Found existing installation: torchaudio 2.5.1+cu124
Uninstalling to

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...



✅ All libraries installed & NLTK resources downloaded. Please ensure GPU is enabled. Restart if needed.


[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [2]:
# =======================
# 3) File Upload (Colab)
# =======================
from google.colab import files
uploaded = files.upload()  # Upload your PDF/MD files if needed

Saving 2501.12948v1.pdf to 2501.12948v1.pdf
Saving dataset.md to dataset.md
Saving deepseekv3-cost-explained.md to deepseekv3-cost-explained.md
Saving deepseekv3-explained.md to deepseekv3-explained.md
Saving design-notes-3fs.md to design-notes-3fs.md
Saving open-source-week.md to open-source-week.md


In [24]:
# ===============
# 2) Imports
# ===============
import os
import PyPDF2
import nltk
import time
import logging
import torch
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, TextStreamer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, load_dataset, concatenate_datasets
from rank_bm25 import BM25Okapi
from llama_cpp import Llama

# Unsloth imports
import unsloth  # Must come before 'transformers', 'peft', etc.
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, train_on_responses_only
from unsloth import is_bfloat16_supported
from trl import SFTTrainer
from trl import ORPOTrainer

logging.getLogger("transformers").setLevel(logging.ERROR)

print("✅ Imports loaded successfully! No ColBERT references remain.")

✅ Imports loaded successfully! No ColBERT references remain.


In [25]:
# =======================
# 3) File Upload (Colab)
# =======================
uploaded = files.upload()  # Upload your PDF/MD files if needed

In [26]:
# ======================================
# (D) EXTRACT TEXT & CHUNKING
# ======================================
def extract_text_from_files(paths):
    """Extract raw text from PDFs & .md files."""
    all_text = []
    for path in paths:
        if path.endswith(".pdf"):
            with open(path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                text = ""
                for page in reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
                all_text.append(text)
        else:
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
                all_text.append(text)
    return all_text

def sliding_window_chunking(text, chunk_size=300, stride=150):
    """Overlapping chunks with sentence-level tokenization."""
    from nltk import sent_tokenize
    sentences = sent_tokenize(text)
    chunks = []
    start_idx = 0
    while start_idx < len(sentences):
        current_chunk = []
        word_count = 0
        idx = start_idx
        while idx < len(sentences) and (word_count + len(sentences[idx].split()) <= chunk_size):
            current_chunk.append(sentences[idx])
            word_count += len(sentences[idx].split())
            idx += 1
        chunks.append(" ".join(current_chunk))
        start_idx = max(idx - (chunk_size // stride), start_idx + 1)
    return chunks

# Example files (replace or remove if not needed)
file_paths = [
    "/content/2501.12948v1.pdf",
    "/content/dataset.md",
    "/content/deepseekv3-cost-explained.md",
    "/content/deepseekv3-explained.md",
    "/content/design-notes-3fs.md",
    "/content/open-source-week.md",
]

raw_texts = extract_text_from_files(file_paths)
all_chunks = []
for text in raw_texts:
    these_chunks = sliding_window_chunking(text, chunk_size=300, stride=150)
    all_chunks.extend(these_chunks)

if not all_chunks:
    print("[WARNING] No real files found or empty. Using placeholder chunks.")
    all_chunks = [
        "DeepSeek R1 is a model specialized for AI research tasks, excelling in MMLU and GPQA benchmarks.",
        "BM25 is a sparse retrieval method based on term frequency and inverse document frequency.",
        "FAISS is a fast nearest-neighbor search library for embeddings."
    ]
print(f"[INFO] {len(all_chunks)} total chunks of text prepared.")

[INFO] 88 total chunks of text prepared.


In [27]:
# ======================================
# (E) FAISS + BM25 RETRIEVAL SETUP
# ======================================
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
d = 384
faiss_index = faiss.IndexFlatL2(d)
chunk_mapping = {}

for i, chunk in enumerate(all_chunks):
    emb = embedding_model.encode(chunk, convert_to_numpy=True)
    faiss_index.add(np.array([emb]))
    chunk_mapping[i] = chunk

print(f"[INFO] FAISS Index has {faiss_index.ntotal} chunks.")

tokenized_chunks = [nltk.word_tokenize(ch.lower()) for ch in all_chunks]
bm25 = BM25Okapi(tokenized_chunks)
print("[INFO] BM25 index built.")

def retrieve_faiss(query, top_k=3):
    q_vec = embedding_model.encode(query, convert_to_numpy=True)
    _, indices = faiss_index.search(np.array([q_vec]), top_k)
    return [chunk_mapping[idx] for idx in indices[0] if idx in chunk_mapping]

def retrieve_bm25(query, top_k=3):
    toks = nltk.word_tokenize(query.lower())
    scores = bm25.get_scores(toks)
    idxs = np.argsort(scores)[::-1][:top_k]
    return [all_chunks[i] for i in idxs]

def retrieve_hybrid(query, top_k=3):
    faiss_res = retrieve_faiss(query, top_k)
    bm25_res  = retrieve_bm25(query, top_k)
    return list(set(faiss_res + bm25_res))

[INFO] FAISS Index has 88 chunks.
[INFO] BM25 index built.


In [28]:
# ======================================
# (F) SUMMARIZATION / CHAIN-OF-THOUGHT
# ======================================
try:
    summarizer = pipeline("summarization", model="google/pegasus-xsum", device=0)
    print("[INFO] Summarizer on GPU.")
except:
    summarizer = pipeline("summarization", model="google/pegasus-xsum", device=-1)
    print("[INFO] Summarizer on CPU.")

def chain_of_thought(query, top_k=3):
    ret = retrieve_hybrid(query, top_k)
    if not ret:
        return "No relevant info found."
    ctx = " ".join(ret)
    prompt = f"Step-by-step, explain:\n{query}\nContext:\n{ctx}"
    out = summarizer(prompt, max_length=256, min_length=50, do_sample=False, truncation=True)
    return f"Chain-of-Thought:\n{out[0]['summary_text']}"

[INFO] Summarizer on GPU.


In [30]:
# ======================================
# (G) LOAD HOTPOTQA & SCIQA
# ======================================
print("\n[INFO] (Optional) Loading HotpotQA, SciQA for multi-hop reasoning & AI QAs...")

try:
    hotpot_qa = load_dataset("hotpot_qa", "fullwiki", split="train[:500]", trust_remote_code=True)
    print("HotpotQA sample:", hotpot_qa[0])
except Exception as e:
    print(f"Could not load HotpotQA. Error: {e}")
    hotpot_qa = None

def format_hotpot_qa(ex):
    question = ex["question"]
    answer = ex["answer"]
    sfs = ex.get("supporting_facts", [])
    sfs_str = " -> ".join(str(x) for x in sfs)
    return {"text": f"Q: {question}\nStep-by-step reasoning: {sfs_str}\nFinal Answer: {answer}"}

if hotpot_qa is not None:
    hotpot_qa = hotpot_qa.map(format_hotpot_qa)

try:
    sciqa = load_dataset("allenai/sciqa", split="train[:500]", trust_remote_code=True)
    print("SciQA sample:", sciqa[0])
except Exception as e:
    print(f"Could not load SciQA. Error: {e}")
    sciqa = None

def format_sciqa(ex):
    q = ex["question"]
    a = ex["answer"]
    c = ex["context"]
    return {"text": f"Q: {q}\nContext: {c}\nA: {a}"}

if sciqa is not None:
    sciqa = sciqa.map(format_sciqa)


[INFO] (Optional) Loading HotpotQA, SciQA for multi-hop reasoning & AI QAs...
HotpotQA sample: {'id': '5a7a06935542990198eaf050', 'question': "Which magazine was started first Arthur's Magazine or First for Women?", 'answer': "Arthur's Magazine", 'type': 'comparison', 'level': 'medium', 'supporting_facts': {'title': ["Arthur's Magazine", 'First for Women'], 'sent_id': [0, 0]}, 'context': {'title': ['Radio City (Indian radio station)', 'History of Albanian football', 'Echosmith', "Women's colleges in the Southern United States", 'First Arthur County Courthouse and Jail', "Arthur's Magazine", '2014–15 Ukrainian Hockey Championship', 'First for Women', 'Freeway Complex Fire', 'William Rast'], 'sentences': [["Radio City is India's first private FM radio station and was started on 3 July 2001.", ' It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).', ' It plays Hindi

In [31]:
# ======================================
# (H) SYNTHETIC QA FROM YOUR CHUNKS
# ======================================
print("\n[INFO] Generating synthetic QA from your chunks (demo)...")

synthetic_data = []
for i, ch in enumerate(all_chunks):
    print(f"[DEBUG] Summarizing chunk {i+1}/{len(all_chunks)}")
    question = "What is the main idea of this text?\n\n" + ch
    try:
        su = summarizer(ch, max_length=256, min_length=50, do_sample=False, truncation=True)
        summary_text = su[0]['summary_text']
    except:
        summary_text = "No summary"
    answer = "This text is about: " + summary_text
    synthetic_data.append({"conversations": [
        {"role": "system", "content": "You are Qwen, an AI model."},
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer}
    ]})

synthetic_dataset = Dataset.from_list(synthetic_data)
print("Synthetic dataset size:", len(synthetic_dataset))


[INFO] Generating synthetic QA from your chunks (demo)...
[DEBUG] Summarizing chunk 1/88
[DEBUG] Summarizing chunk 2/88
[DEBUG] Summarizing chunk 3/88
[DEBUG] Summarizing chunk 4/88
[DEBUG] Summarizing chunk 5/88
[DEBUG] Summarizing chunk 6/88
[DEBUG] Summarizing chunk 7/88
[DEBUG] Summarizing chunk 8/88
[DEBUG] Summarizing chunk 9/88
[DEBUG] Summarizing chunk 10/88
[DEBUG] Summarizing chunk 11/88
[DEBUG] Summarizing chunk 12/88
[DEBUG] Summarizing chunk 13/88
[DEBUG] Summarizing chunk 14/88
[DEBUG] Summarizing chunk 15/88
[DEBUG] Summarizing chunk 16/88
[DEBUG] Summarizing chunk 17/88
[DEBUG] Summarizing chunk 18/88
[DEBUG] Summarizing chunk 19/88
[DEBUG] Summarizing chunk 20/88
[DEBUG] Summarizing chunk 21/88
[DEBUG] Summarizing chunk 22/88
[DEBUG] Summarizing chunk 23/88
[DEBUG] Summarizing chunk 24/88
[DEBUG] Summarizing chunk 25/88
[DEBUG] Summarizing chunk 26/88
[DEBUG] Summarizing chunk 27/88
[DEBUG] Summarizing chunk 28/88
[DEBUG] Summarizing chunk 29/88
[DEBUG] Summarizing ch

In [10]:
# ======================================
# (I) COMBINE ALL DATASETS
# ======================================
all_datasets = [synthetic_dataset]
if hotpot_qa is not None:
    all_datasets.append(hotpot_qa)
if sciqa is not None:
    all_datasets.append(sciqa)

if len(all_datasets) > 1:
    print("[INFO] Merging synthetic, Hotpot, SciQA datasets...")
    combined_dataset = concatenate_datasets(all_datasets)
else:
    print("[INFO] Only synthetic dataset used.")
    combined_dataset = all_datasets[0]

print("Final combined dataset length:", len(combined_dataset))


[INFO] Generating synthetic QA from your chunks (demo)...
[DEBUG] Summarizing chunk 1/88
[DEBUG] Summarizing chunk 2/88
[DEBUG] Summarizing chunk 3/88
[DEBUG] Summarizing chunk 4/88
[DEBUG] Summarizing chunk 5/88
[DEBUG] Summarizing chunk 6/88
[DEBUG] Summarizing chunk 7/88
[DEBUG] Summarizing chunk 8/88
[DEBUG] Summarizing chunk 9/88
[DEBUG] Summarizing chunk 10/88
[DEBUG] Summarizing chunk 11/88
[DEBUG] Summarizing chunk 12/88
[DEBUG] Summarizing chunk 13/88
[DEBUG] Summarizing chunk 14/88
[DEBUG] Summarizing chunk 15/88
[DEBUG] Summarizing chunk 16/88
[DEBUG] Summarizing chunk 17/88
[DEBUG] Summarizing chunk 18/88
[DEBUG] Summarizing chunk 19/88
[DEBUG] Summarizing chunk 20/88
[DEBUG] Summarizing chunk 21/88
[DEBUG] Summarizing chunk 22/88
[DEBUG] Summarizing chunk 23/88
[DEBUG] Summarizing chunk 24/88
[DEBUG] Summarizing chunk 25/88
[DEBUG] Summarizing chunk 26/88
[DEBUG] Summarizing chunk 27/88
[DEBUG] Summarizing chunk 28/88
[DEBUG] Summarizing chunk 29/88
[DEBUG] Summarizing ch

In [32]:
# ======================================
# (J) TRAIN / VAL / TEST SPLIT
# ======================================
split_1 = combined_dataset.train_test_split(test_size=0.2, seed=42)
train_val_data = split_1["train"]
test_data      = split_1["test"]

split_2 = train_val_data.train_test_split(test_size=0.25, seed=42)
train_data = split_2["train"]
val_data   = split_2["test"]

print("Train size:", len(train_data))
print("Val size:", len(val_data))
print("Test size:", len(test_data))

Train size: 352
Val size: 118
Test size: 118


In [34]:
# ======================================
# (K) LOAD QWEN2.5 & FINE-TUNE with Unsloth
# ======================================
print("\n[INFO] Loading Qwen2.5-3B-Instruct model (4-bit) via Unsloth...")

model_name = "Qwen/Qwen2.5-3B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
print("✅ Qwen model loaded successfully.")

# Apply chat template
tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")

# Updated safe tokenization: if both "question" and "answer" exist, build a conversation.
def safe_format_qwen(example):
    if "question" in example and "answer" in example:
        conversation = (
            "<|im_start|>user\n" + example["question"] + "\n<|im_end|>\n" +
            "<|im_start|>assistant\n" + example["answer"] + "\n<|im_end|>"
        )
        return {"text": conversation}
    elif "text" in example and example["text"] is not None:
        # If already a conversation, use the provided text.
        return {"text": tokenizer.apply_chat_template(example["text"], tokenize=False, add_generation_prompt=False)}
    else:
        return {"text": ""}

# Filter and map tokenization function remains the same.
def filter_and_map(dataset):
    if dataset is None or len(dataset) == 0:
        return dataset
    ds_filtered = dataset.filter(lambda x: ("text" in x and x["text"] is not None) or ("question" in x and "answer" in x))
    ds_mapped   = ds_filtered.map(safe_format_qwen)
    return ds_mapped

train_data = filter_and_map(train_data)
val_data   = filter_and_map(val_data)
test_data  = filter_and_map(test_data)

print("\n✅ Tokenization applied successfully!")
print("🔍 Sample train_data before fine-tuning:")
if train_data is not None and len(train_data) > 0:
    print(train_data[0])

# Get the PEFT model with LoRA configuration.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    dataset_text_field="text",
    max_seq_length=2048,
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        max_steps=200,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs_qwen3b",
        report_to="none",
    ),
)

print("\n✅ Applying `train_on_responses_only`...")
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part="<|im_start|>user\n",
    response_part="<|im_start|>assistant\n",
)

print("[INFO] Starting SFT fine-tuning ...")
try:
    trainer.train()
except ZeroDivisionError as e:
    print("\n❌ ZeroDivisionError: Possibly all tokens are masked. Check your dataset format.\n", e)




[INFO] Loading Qwen2.5-3B-Instruct model (4-bit) via Unsloth...
==((====))==  Unsloth 2025.3.9: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ Qwen model loaded successfully.


Filter:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Filter:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Filter:   0%|          | 0/102 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]


✅ Tokenization applied successfully!
🔍 Sample train_data before fine-tuning:
{'conversations': None, 'id': '5a72a00d5542991f9a20c53c', 'question': 'Who developed the prototype pacemaker used by the 34th President of the USA?', 'answer': 'R Adams Cowley', 'type': 'bridge', 'level': 'medium', 'supporting_facts': {'title': ['R Adams Cowley', 'R Adams Cowley', 'Dwight D. Eisenhower'], 'sent_id': [0, 7, 0]}, 'context': {'title': ['Dwight D. Eisenhower', '34th Battalion (Australia)', "2013 America's Cup", '34th Armoured Brigade (United Kingdom)', 'Andrés Avelino Cáceres', 'R Adams Cowley', 'David Eisenhower', 'Black Bat Squadron', '34th Street (IRT Second Avenue Line)', 'East 34th – Campus (RTA Rapid Transit station)'], 'sentences': [['Dwight David "Ike" Eisenhower ( ; October 14, 1890 – March 28, 1969) was an American politician and Army general who served as the 34th President of the United States from 1953 until 1961.', ' During World War II, he was a five-star general in the United Stat



Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/98 [00:00<?, ? examples/s]


✅ Applying `train_on_responses_only`...


  super().__init__(


Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/98 [00:00<?, ? examples/s]

[INFO] Starting SFT fine-tuning ...
{'loss': 7.5771, 'grad_norm': 7.522629737854004, 'learning_rate': 0.000195, 'epoch': 0.13333333333333333}
{'loss': 1.7484, 'grad_norm': 4.5851898193359375, 'learning_rate': 0.00018500000000000002, 'epoch': 0.26666666666666666}
{'loss': 1.1359, 'grad_norm': 3.611201047897339, 'learning_rate': 0.000175, 'epoch': 0.4}
{'loss': 1.0436, 'grad_norm': 3.4510090351104736, 'learning_rate': 0.000165, 'epoch': 0.5333333333333333}
{'loss': 1.0289, 'grad_norm': 2.5248961448669434, 'learning_rate': 0.000155, 'epoch': 0.6666666666666666}
{'loss': 1.0333, 'grad_norm': 3.28485369682312, 'learning_rate': 0.000145, 'epoch': 0.8}
{'loss': 1.2719, 'grad_norm': 8.654840469360352, 'learning_rate': 0.00013500000000000003, 'epoch': 0.9333333333333333}
{'loss': 0.8522, 'grad_norm': 1.945728063583374, 'learning_rate': 0.000125, 'epoch': 1.0666666666666667}
{'loss': 0.8589, 'grad_norm': 1.4891570806503296, 'learning_rate': 0.00011499999999999999, 'epoch': 1.2}
{'loss': 0.6675, 

In [45]:
# ======================================
# (M) TEST MODEL with RAG
# ======================================
def generate_answer_with_rag(query, top_k=3):
    """
    Retrieve top-K chunks from FAISS + BM25, summarize them with chain-of-thought.
    """
    ret = retrieve_hybrid(query, top_k)
    if not ret:
        return "No relevant info found."
    ctx = " ".join(ret)
    prompt = f"Step-by-step, explain the following:\n{query}\nContext:\n{ctx}"
    out = summarizer(prompt, max_length=256, min_length=50, do_sample=False, truncation=True)
    return f"The text is about: {out[0]['summary_text']}"

test_query = "How does DeepSeek R1 perform on MMLU and GPQADiamond benchmarks?"
rag_answer = generate_answer_with_rag(test_query, top_k=3)
print(f"\n[FINAL RAG ANSWER for '{test_query}']:\n{rag_answer}")



[FINAL RAG ANSWER for 'How does DeepSeek R1 perform on MMLU and GPQADiamond benchmarks?']:
The text is about: In this paper, we show how DeepSeek-R1 can outperform reasoning-focused models on reasoning-related benchmarks such as MMLU, GPQADiamond, LiveCodeBench, SWE Verified, and QwQ-32B-Preview.


In [46]:
# ======================================
# (N) SAVE & EXPORT 4-BIT
# ======================================
merged_16bit_dir = "qwen3b_finetuned_16bit"
try:
    model.save_pretrained_merged(
        merged_16bit_dir,
        tokenizer,
        save_method="merged_16bit",
    )
    print("[INFO] 16-bit merged. Now exporting 4-bit...")

    model.save_pretrained_gguf(
        merged_16bit_dir,
        tokenizer=tokenizer,
        quantization_method="q4_k_m",
    )
    print("✅ 4-bit GGUF exported!")
except Exception as e:
    print(f"❌ Could not export the model: {e}")


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.26 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 36/36 [00:01<00:00, 31.57it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving qwen3b_finetuned_16bit/pytorch_model-00001-of-00002.bin...
Unsloth: Saving qwen3b_finetuned_16bit/pytorch_model-00002-of-00002.bin...
Done.
[INFO] 16-bit merged. Now exporting 4-bit...
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.67 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 36/36 [00:01<00:00, 25.88it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving qwen3b_finetuned_16bit/pytorch_model-00001-of-00002.bin...
Unsloth: Saving qwen3b_finetuned_16bit/pytorch_model-00002-of-00002.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at qwen3b_finetuned_16bit into f16 GGUF format.
The output location will be /content/qwen3b_finetuned_16bit/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: qwen3b_finetuned_16bit
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model

In [55]:
# ======================================
# (O) Llama-CPP Inference
# ======================================
model_path = "/content/qwen3b_finetuned_16bit/unsloth.Q4_K_M.gguf"
try:
    llm = Llama(model_path=model_path, n_ctx=2048)
    prompt = "Tell me about DeepSeek-R1, we directly fine-tuned open-source models like Qwen (Qwen, 2024b) and Llama."
    output = llm(prompt, max_tokens=200, stop=["\n"], echo=True)
    print("\n[LLAMA-CPP OUTPUT]")
    print(output["choices"][0]["text"])
except Exception as e:
    print(f"❌ Could not run inference with llama-cpp: {e}")

print("""
✅ All done!
This code uses only FAISS + BM25 for retrieval, no ColBERT references.
Enjoy your Qwen2.5 3B pipeline with multi-hop QA, SciQA, ORPO RL,
and 4-bit export!
""")

llama_model_loader: loaded meta data with 26 key-value pairs and 434 tensors from /content/qwen3b_finetuned_16bit/unsloth.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3b Instruct Unsloth Bnb 4bit
llama_model_loader: - kv   3:                       general.organization str              = Unsloth
llama_model_loader: - kv   4:                           general.finetune str              = instruct-unsloth-bnb-4bit
llama_model_loader: - kv   5:                           general.basename str              = qwen2.5
llama_model_loader: - kv   6:                         general.size_label str              = 3B
lla


[LLAMA-CPP OUTPUT]
Tell me about DeepSeek-R1, we directly fine-tuned open-source models like Qwen (Qwen, 2024b) and Llama. The DeepSeek-R1 model is a language model that was fine-tuned on the DeepSeek dataset, which is a large-scale dataset of Chinese text. The model was developed by a team of researchers at the DeepSeek research institute. 

✅ All done!
This code uses only FAISS + BM25 for retrieval, no ColBERT references.
Enjoy your Qwen2.5 3B pipeline with multi-hop QA, SciQA, ORPO RL,
and 4-bit export!

