# 1. Setup Environment


In [None]:
! pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
! pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

## 1.1 Load and Prepare the Training Model


In [None]:
from unsloth import FastLanguageModel
import torch
from google.colab import userdata

# Settings consumed by the model-loading cell.
max_seq_length = 2048  # sequence-length limit for the model (presumably in tokens)
dtype = None  # forwarded as `dtype=`; presumably None lets the loader choose — verify against the Unsloth docs
load_in_4bit=True  # forwarded as `load_in_4bit=` to request 4-bit loading

In [None]:
# Load the base model and its tokenizer through Unsloth.
# max_seq_length is passed explicitly so the value configured in the settings
# cell is actually applied instead of the loader's built-in default.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Qwen/Qwen2.5-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # The Hugging Face token is read from the Colab secrets store.
    token=userdata.get('HUGGINGFACE_TOKEN'),
)

# 2. Prepare the data set

In [None]:
from IPython.display import clear_output

In [None]:
# Prompt for documents and open Colab's upload widget. The result is kept in
# `uploaded`; the cells below treat it as a dict keyed by file name.
print("Please upload research documents (.md/.pdf):")
from google.colab import files
uploaded = files.upload()

In [None]:
# Summarise the upload instead of printing the raw dict: printing `uploaded`
# directly would dump the full contents of every file into the cell output.
for uploaded_name, uploaded_content in uploaded.items():
    print(f"{uploaded_name}: {len(uploaded_content)} bytes")

In [None]:
! pip install -qU langchain_community pypdf

In [None]:
! pip install -qU langchain-text-splitters

In [None]:
! pip install -q pypdf langchain
! pip install -U langchain-community

In [None]:
from langchain_community.document_loaders import PyPDFLoader,TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
def process_uploaded_files(uploaded_files):
    """Load every named file and return all of its chunks in one flat list.

    Names ending in '.pdf' are read with PyPDFLoader, everything else with
    TextLoader. The documents of each file are split with a
    RecursiveCharacterTextSplitter (chunk_size=2000, chunk_overlap=200).
    """
    chunks = []
    for path in uploaded_files:
        loader_cls = PyPDFLoader if path.endswith('.pdf') else TextLoader
        loaded = loader_cls(path).load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
        chunks += splitter.split_documents(loaded)
    return chunks

processed_docs = process_uploaded_files(uploaded.keys())

In [None]:
print(processed_docs)

In [None]:
import re
import json
from transformers import pipeline
from datasets import Dataset

# Import loaders and splitter.
# The loaders live in `langchain_community` and the splitter in
# `langchain_text_splitters` (the packages the pip cells install). The older
# `langchain.document_loaders` / `langchain.text_splitter` paths are deprecated
# and are not available in newer LangChain releases.
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def clean_markdown(text):
    """
    Remove common markdown formatting from text.

    Steps, in order:
      1. drop fenced code blocks (``` ... ```) together with their content;
      2. unwrap inline code (`code` -> code);
      3. strip heading markers (1-6 '#' at the start of a line);
      4. drop table rows (lines that start and end with '|');
      5. strip '-' / '*' bullet markers at the start of a line;
      6. collapse runs of blank lines into a single newline.

    Args:
        text: Markdown source as a string.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    # Fenced blocks must go first: otherwise the inline-code pattern below
    # consumes the backticks of the fence and the block's content survives.
    text = re.sub(r'```[\s\S]*?```', '', text)
    # Remove inline code formatting (`code`), keeping the code text itself.
    text = re.sub(r'`([^`]*)`', r'\1', text)
    # Heading markers only count at the start of a line, so a '#' inside a
    # sentence (e.g. "C# is ...") is left untouched.
    text = re.sub(r'^[ \t]*#{1,6}[ \t]+', '', text, flags=re.MULTILINE)
    # Table rows, including the |---|---| separator. The row may be the very
    # last line of the text, hence the "newline or end of string" terminator.
    text = re.sub(r'^[ \t]*\|.*\|[ \t]*(?:\n|\Z)', '', text, flags=re.MULTILINE)
    # Remove bullet markers (-, *) but keep the item text.
    text = re.sub(r'^[ \t]*[-*][ \t]+', '', text, flags=re.MULTILINE)
    # Collapse last, so blank lines left behind by the removals above vanish too.
    text = re.sub(r'\n{2,}', '\n', text)
    return text.strip()

def process_uploaded_files(uploaded_files):
    """
    Load documents from an iterable of file names, clean markdown content if
    needed, and split them into smaller chunks.

    Args:
        uploaded_files: Iterable of file paths. Names ending in '.pdf' are read
            with PyPDFLoader, all others with TextLoader; names ending in '.md'
            are additionally passed through clean_markdown. Extension checks
            are case-insensitive.

    Returns:
        A flat list of the chunks returned by the splitter for every file, in
        input order.
    """
    # The splitter is configured identically for every file, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200
    )

    documents = []
    for filename in uploaded_files:
        # Compare extensions case-insensitively so "Paper.PDF" and "NOTES.MD"
        # are handled like their lowercase counterparts.
        lowered = filename.lower()

        # Choose the appropriate loader based on file extension
        if lowered.endswith('.pdf'):
            loader = PyPDFLoader(filename)
        else:
            loader = TextLoader(filename)
        docs = loader.load()

        # If file is markdown, clean its content before splitting
        if lowered.endswith('.md'):
            for doc in docs:
                doc.page_content = clean_markdown(doc.page_content)

        # Split documents into smaller chunks for processing
        documents.extend(text_splitter.split_documents(docs))
    return documents

# 'uploaded' is assumed to be a dictionary-like object with filenames as keys.
processed_docs = process_uploaded_files(uploaded.keys())
print(processed_docs)


In [None]:
from datasets import Dataset
import random
from sklearn.model_selection import train_test_split

def generate_qa_pairs(text_chunks):
    """Return placeholder (question, answer) tuples, one fixed set per chunk.

    The content of the chunks is not inspected: every chunk contributes the
    same two hard-coded pairs, so the result holds two entries per chunk.
    """
    # Simple synthetic Q&A generation (enhance with LLM in production)
    template_pairs = (
        ("What is the main purpose of 3FS?", "3FS is a high-performance distributed file system designed for AI workloads."),
        ("How does DeepSeek-V3 achieve efficient training?", "Through DualPipe pipeline parallelism and FP8 mixed precision training."),
    )
    return [pair for _ in text_chunks for pair in template_pairs]

# Load and parse provided documents
text_chunks = []
for file in ["design-notes-3fs.md", "deepseekv3-explained.md"]:
    # Explicit encoding so reading does not depend on the platform default.
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()
    # Simple chunking on blank lines. Whitespace-only pieces are skipped so
    # they do not turn into training rows.
    text_chunks.extend(piece for piece in text.split("\n\n") if piece.strip())

# Generate synthetic dataset
qa_pairs = generate_qa_pairs(text_chunks)
dataset = Dataset.from_dict({
    "question": [q for q, a in qa_pairs],
    "answer": [a for q, a in qa_pairs]
})
# Fixed seed so the train/validation split is reproducible between runs.
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_data = train_test["train"]
val_data = train_test["test"]

In [None]:
print(val_data.to_pandas())

# 3. Fine-tune with QLoRA

In [None]:
! pip install -q peft
! pip install -q bitsandbytes
! pip install -q transformers accelerate

In [None]:
import torch

# Check that a CUDA GPU is available before continuing.
# The model is deliberately not moved with .to("cuda"): it was loaded with
# load_in_4bit=True, and some transformers releases reject .to() on
# bitsandbytes-quantized models. NOTE(review): the loader is expected to have
# placed the weights on the GPU already — the print below shows where they are.
if not torch.cuda.is_available():
    raise EnvironmentError("CUDA is not available. Please ensure you have a CUDA-enabled GPU.")
print(f"CUDA is available; model parameters are on {next(model.parameters()).device}.")

# torch.set_default_tensor_type(torch.cuda.FloatTensor)

In [None]:
! export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
! pip install --upgrade transformers


In [None]:
import os

# Set PYTORCH_CUDA_ALLOC_CONF for this process, then read it back and print
# it as a quick confirmation.
_ALLOC_CONF_VAR = "PYTORCH_CUDA_ALLOC_CONF"
os.environ[_ALLOC_CONF_VAR] = "expandable_segments:True"
x = os.getenv(_ALLOC_CONF_VAR)
print(x)

expandable_segments:True


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, DataCollatorForLanguageModeling, Trainer
from peft import LoraConfig, get_peft_model
import torch

# Base checkpoint: tokenizer plus float16 weights, placed via device_map="auto".
model_id = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# LoRA Configuration: rank-8 adapters on the attention and MLP projections.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)
# Wrap the base model so that the LoRA adapters are attached to it.
model = get_peft_model(model, peft_config)

# Training arguments: batch size 2 per device with 4 gradient-accumulation
# steps, fp16, evaluation every 50 steps, a checkpoint per epoch, fixed seeds.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    num_train_epochs=3,
    fp16=True,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="epoch",
    remove_unused_columns=False,
    seed=42,           # Explicit seeding
    data_seed=42,      # Data-specific seeding
    dataloader_pin_memory=False,  # Turns off pinned host memory in the data loaders; it does not control generation during evaluation
)

# Data Collator (mlm=False selects causal-LM rather than masked-LM collation)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

def format_qa_prompt(example, max_length=None):
    """Render one Q&A record as a ChatML prompt and tokenize it for causal-LM training.

    Args:
        example: Mapping with 'question' and 'answer' strings.
        max_length: Length (in tokens) every example is truncated/padded to.
            When None, the smaller of tokenizer.model_max_length and 2048 is
            used.

    Returns:
        The tokenizer's encoding with an added 'labels' entry: a copy of
        'input_ids' in which padding positions are replaced by -100.
    """
    if max_length is None:
        # tokenizer.model_max_length is the tokenizer's upper limit and can be
        # very large for long-context checkpoints; padding every short Q&A
        # example to it would waste most of the memory on padding.
        max_length = min(tokenizer.model_max_length, 2048)

    prompt = f"<|im_start|>user\n{example['question']}<|im_end|>\n<|im_start|>assistant\n{example['answer']}<|im_end|>"
    # Tokenize the prompt to get input_ids and attention_mask
    encoding = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    # For causal LM training the labels mirror input_ids; positions the
    # attention mask marks as padding get -100 so the loss ignores them.
    encoding["labels"] = [
        token_id if is_real_token else -100
        for token_id, is_real_token in zip(encoding["input_ids"], encoding["attention_mask"])
    ]
    return encoding


# Tokenize both splits; the raw question/answer columns are dropped afterwards.
train_dataset = train_data.map(format_qa_prompt, remove_columns=train_data.column_names)
val_dataset = val_data.map(format_qa_prompt, remove_columns=val_data.column_names)

# Collect the Trainer inputs in one place before constructing it.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)

# Refuse to start training when no CUDA device is available.
if not torch.cuda.is_available():
    raise EnvironmentError("CUDA is not available. Please check your GPU configuration.")
trainer.train()


In [None]:
from transformers import pipeline
qa_generator = pipeline('text2text-generation', model='microsoft/prophetnet-large-uncased')

def generate_qa_pairs(text):
    """Ask the module-level `qa_generator` pipeline for two generations about *text*.

    Returns whatever the pipeline call returns.

    NOTE(review): this reuses the name of the chunk-based generate_qa_pairs
    defined earlier in the notebook and replaces it once this cell runs.
    NOTE(review): num_return_sequences=2 presumably requires beam search or
    sampling to be enabled for the model — confirm its generation settings.
    """
    instruction = f"Generate a question and answer from this text: {text}"
    generation_options = {"max_length": 256, "num_return_sequences": 2}
    return qa_generator(instruction, **generation_options)


In [None]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
! pip uninstall -y bitsandbytes
! pip install -U bitsandbytes

In [None]:
# Imported in this cell because BitsAndBytesConfig is used here, before the
# notebook's later import cell has been executed.
from transformers import BitsAndBytesConfig

model_id = "Qwen/Qwen2.5-3B-Instruct"
# 4-bit NF4 quantization with double quantization and float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True
)

In [None]:
# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
! pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl'

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig
from trl import SFTTrainer
import torch
from transformers import BitsAndBytesConfig

In [None]:
# LoRA adapter configuration for the quantized model.
# target_modules must name layers that exist in the model being trained. The
# Qwen2.5 checkpoint used here exposes q_proj/k_proj/v_proj/o_proj (attention)
# and gate_proj/up_proj/down_proj (MLP), the same list the earlier LoRA cell
# uses. The previous names (c_attn, c_proj, w1, w2) belong to the
# first-generation Qwen architecture and are not present in this model.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

# Load model
# `model_id` comes from the earlier cell that defines it.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# 4-bit NF4 quantization settings with float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True  # Added nested quantization
)

# Load the checkpoint with the quantization config applied at load time.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True
)

# Training setup
# Batch size 2 per device with 4 gradient-accumulation steps, 2 epochs, fp16,
# gradient clipping at 0.3, paged 8-bit AdamW, and no external reporting.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    max_grad_norm=0.3,
    num_train_epochs=2,
    fp16=True,
    logging_steps=10,
    optim="paged_adamw_8bit",
    report_to="none"
)

def _format_chatml(batch):
    """Render question/answer rows as ChatML training strings.

    Accepts either a single row (string fields) or a batch (list fields) and
    always returns a list of strings, because the SFTTrainer release pinned by
    this notebook (trl<0.9.0) calls the formatting function on batches and
    rejects a return value that is not a list.
    """
    questions, answers = batch["question"], batch["answer"]
    if isinstance(questions, str):
        questions, answers = [questions], [answers]
    return [
        f"<|im_start|>user\n{q}<|im_end|>\n<|im_start|>assistant\n{a}<|im_end|>"
        for q, a in zip(questions, answers)
    ]


# `train_data` is the training split created by the dataset cell. `dataset` is
# a plain Dataset, so indexing it with "train" would look up a column of that
# name rather than a split.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    peft_config=peft_config,
    formatting_func=_format_chatml
)

# Start training
trainer.train()
trainer.save_model("/content/finetuned_qwen")

In [None]:
from evaluate import load
import numpy as np

rouge = load("rouge")

def evaluate(model_path):
    """Score model answers on the validation set with ROUGE-L and exact match.

    Every question in `val_data` is sent through `query_model`; the answers
    are compared with the reference answers from the same rows.

    NOTE(review): `model_path` is accepted but never used — `load_model()` is
    called without arguments. Confirm which model is actually evaluated.

    Returns:
        A dict with "rougeL" (rounded to 3 decimals) and "exact_match" (mean
        of per-example equality after stripping whitespace).
    """
    llm = load_model()
    test_questions = [row["question"] for row in val_data]
    predictions = [query_model(question, llm) for question in test_questions]
    references = [row["answer"] for row in val_data]

    scores = rouge.compute(
        predictions=predictions,
        references=references,
        rouge_types=["rougeL"],
    )

    rouge_l = round(scores["rougeL"], 3)
    is_exact = [pred.strip() == ref.strip() for pred, ref in zip(predictions, references)]
    return {"rougeL": rouge_l, "exact_match": np.mean(is_exact)}

evaluation_results = evaluate("./qwen-3b-finetuned-Q4.gguf")
print(f"Evaluation Results: {evaluation_results}")