In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B-Instruct", 
                                           device_map="auto",
                                           trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct", 
                                        trust_remote_code=True)


Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


In [10]:
import os

# Define the path to your database folder
database_folder = "q3_dataset"  # Change this to your actual folder path

# Check if the directory exists
if not os.path.exists(database_folder):
    os.makedirs(database_folder)
    print(f"Created directory: {database_folder}")
else:
    print(f"Using existing directory: {database_folder}")

# Get list of files in the directory
uploaded = {}
for file in os.listdir(database_folder):
    file_path = os.path.join(database_folder, file)
    if os.path.isfile(file_path):
        # Use file name as key and path as value
        uploaded[file] = file_path
        
print(f"Found {len(uploaded)} files in the database folder")


Using existing directory: q3_dataset
Found 6 files in the database folder


In [11]:
from markdown_it import MarkdownIt

# Function to read Markdown files
def read_md(file_name):
    with open(file_name, 'r', encoding='utf-8') as f:
        content = f.read()
    md = MarkdownIt()
    tokens = md.parse(content)
    return content

# Read all Markdown files
md_files = [f for f in uploaded.keys() if f.endswith('.md')]
md_texts = [read_md(uploaded[file]) for file in md_files]


In [12]:
%pip install pypdf -q

from pypdf import PdfReader

# Function to read PDF files
def read_pdf(file_name):
    reader = PdfReader(file_name)
    text = ''
    for page in reader.pages:
        text += page.extract_text() or ''
    return text

# Read all PDF files
pdf_files = [f for f in uploaded.keys() if f.endswith('.pdf')]
pdf_texts = [read_pdf(uploaded[file]) for file in pdf_files]


Note: you may need to restart the kernel to use updated packages.


In [13]:
all_text = "\n\n".join(md_texts + pdf_texts)

In [14]:
import re

# Split by headings starting with '#'
data = re.split(r'#\s+', all_text)
qa_pairs = []

# Skip the first empty split (if any)
for i in range(1, len(data) - 1, 2):
    question = data[i].strip()
    answer = data[i + 1].strip()
    qa_pairs.append({'question': question, 'answer': answer})
    
print("qa_pairs:", qa_pairs)

qa_pairs: [{'question': 'DualPipe\nDualPipe is an innovative bidirectional pipeline parallelism algorithm introduced in the DeepSeek-V3 Technical Report. It achieves full overlap of forward and backward computation-communication phases, also reducing pipeline bubbles. For detailed information on computation-communication overlap, please refer to the profile data.\n\nPipeline Bubbles and Memory Usage Comparison\n\n| Method    | Bubble                  | Parameter | Activation |\n|:---------:|:-----------------------:|:---------:|:----------:|\n| 1F1B      | (PP-1)(𝐹+𝐵)            | 1×        | PP         |\n| ZB1P      | (PP-1)(𝐹+𝐵-2𝑊)         | 1×        | PP         |\n| DualPipe  | (PP/2-1)(𝐹&𝐵+𝐵-3𝑊)     | 2×        | PP+1       |\n\n𝐹 denotes the execution time of a forward chunk, 𝐵 denotes the execution time of a full backward chunk, 𝑊 denotes the execution time of a "backward for weights" chunk, and 𝐹&𝐵 denotes the execution time of two mutually overlapped forward and backward chu

In [15]:
from datasets import Dataset

dataset = Dataset.from_list(qa_pairs)

print(dataset[0])

{'question': 'DualPipe\nDualPipe is an innovative bidirectional pipeline parallelism algorithm introduced in the DeepSeek-V3 Technical Report. It achieves full overlap of forward and backward computation-communication phases, also reducing pipeline bubbles. For detailed information on computation-communication overlap, please refer to the profile data.\n\nPipeline Bubbles and Memory Usage Comparison\n\n| Method    | Bubble                  | Parameter | Activation |\n|:---------:|:-----------------------:|:---------:|:----------:|\n| 1F1B      | (PP-1)(𝐹+𝐵)            | 1×        | PP         |\n| ZB1P      | (PP-1)(𝐹+𝐵-2𝑊)         | 1×        | PP         |\n| DualPipe  | (PP/2-1)(𝐹&𝐵+𝐵-3𝑊)     | 2×        | PP+1       |\n\n𝐹 denotes the execution time of a forward chunk, 𝐵 denotes the execution time of a full backward chunk, 𝑊 denotes the execution time of a "backward for weights" chunk, and 𝐹&𝐵 denotes the execution time of two mutually overlapped forward and backward chunks.\n\n##'

In [16]:
data = dataset.train_test_split(test_size=0.1)
train_data = data['train']
test_data = data['test']

In [17]:
def format_data(example):
    question = example['question']
    answer = example['answer']
    inputs = tokenizer(question, padding="max_length", truncation=True, max_length=512)
    labels = tokenizer(answer, padding="max_length", truncation=True, max_length=512)['input_ids']

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels
    }

train_data = train_data.map(format_data)
test_data = test_data.map(format_data)

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [18]:
# Define a tokenization function
def tokenize_function(examples):
    return tokenizer(examples['question'], padding="max_length", truncation=True)

# Apply tokenization to the entire dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Print the tokenized dataset columns
print(tokenized_datasets.column_names)

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

['question', 'answer', 'input_ids', 'attention_mask']


In [19]:
# Title: Custom LoRA Implementation for Transformer Models with Training and Saving LoRA Weights

# ---------------------------------------------------------
# Part 1: Import Libraries
# ---------------------------------------------------------
import torch
import torch.nn as nn
import copy
import math
from collections import OrderedDict
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader
from huggingface_hub import login

# ---------------------------------------------------------
# Part 2: Define the LoRA Layer Class (SimpleLoRALayer)
# ---------------------------------------------------------
class SimpleLoRALayer(nn.Module):
    """Ultra-minimal LoRA implementation avoiding complex dependencies."""
    
    def __init__(self, original_module, rank=8, alpha=16, dropout=0.1):
        super().__init__()
        self.original_module = original_module
        self.rank = rank
        self.alpha = alpha
        
        # Get dimensions
        in_features = original_module.in_features
        out_features = original_module.out_features
        
        # Create A and B matrices
        self.lora_A = nn.Linear(in_features, rank, bias=False)
        self.lora_B = nn.Linear(rank, out_features, bias=False)
        self.dropout = nn.Dropout(dropout)
        
        # Initialize weights
        nn.init.normal_(self.lora_A.weight, std=1/rank)
        nn.init.zeros_(self.lora_B.weight)
        
        # Freeze original weights
        original_module.weight.requires_grad = False
        if hasattr(original_module, 'bias') and original_module.bias is not None:
            original_module.bias.requires_grad = False
        
        # Move LoRA layers to the same device as the original module
        device = original_module.weight.device
        self.lora_A = self.lora_A.to(device)
        self.lora_B = self.lora_B.to(device)
    
    def forward(self, x):
        # Original forward pass
        original_output = self.original_module(x)
        
        # LoRA path - ensure same device
        lora_output = self.lora_B(self.lora_A(self.dropout(x)))
        
        # Combine outputs
        return original_output + (lora_output * (self.alpha / self.rank))


In [20]:
# ---------------------------------------------------------
# Part 3: Function to Add LoRA Layers to a Model
# ---------------------------------------------------------
def add_lora_to_model(model, target_module_names=None):
    """Apply LoRA layers to target modules."""
    if target_module_names is None:
        # Default targets if none provided
        target_module_names = ["q_proj", "v_proj"]
    
    # Keep track of replaced modules
    modified_modules = {}
    
    # Process each module in the model
    for name, module in list(model.named_modules()):
        # Check if this module should be modified
        if isinstance(module, nn.Linear) and any(target in name for target in target_module_names):
            # Find parent module to replace this submodule
            parent_name = name.rsplit('.', 1)[0] if '.' in name else ''
            child_name = name.rsplit('.', 1)[1] if '.' in name else name
            
            # Get parent module
            parent_module = model
            if parent_name:
                for part in parent_name.split('.'):
                    parent_module = getattr(parent_module, part)
            
            # Create LoRA layer and replace
            lora_layer = SimpleLoRALayer(module, rank=16, alpha=32, dropout=0.1)
            setattr(parent_module, child_name, lora_layer)
            
            # Store for tracking
            modified_modules[name] = lora_layer
            print(f"Added LoRA to: {name}")
    
    # Print summary
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Modified {len(modified_modules)} modules")
    print(f"Trainable parameters: {trainable_params} ({trainable_params/total_params*100:.2f}%)")
    
    return model


In [21]:
# ---------------------------------------------------------
# Part 4: Custom Dataset Class for the Quotes Dataset
# ---------------------------------------------------------
class QuotesDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset, tokenizer, max_length=128):
        self.dataset = hf_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        text = item["quote"]
        
        # Tokenize
        tokens = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        # Prepare inputs and labels for causal LM
        input_ids = tokens.input_ids.squeeze()
        labels = input_ids.clone()
        
        # For CLM training, we need to prepare inputs and targets
        # Typically, we shift the labels by 1 so that the model predicts the next token
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": tokens.attention_mask.squeeze()
        }


In [22]:
# ---------------------------------------------------------
# Part 5: Training Function Using Adam Optimizer
# ---------------------------------------------------------
def train_with_sgd(model, dataloader, device="cuda", epochs=1, lr=0.0001, gradient_accumulation_steps=1):
    """
    Train the model using SGD with gradient accumulation to save memory.
    
    Args:
        model: The model to train
        dataloader: DataLoader containing the training data
        device: Device to train on
        epochs: Number of epochs to train for
        lr: Learning rate
        gradient_accumulation_steps: Number of steps to accumulate gradients over
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    model.train()
    
    total_steps = len(dataloader) * epochs
    print(f"Total training steps: {total_steps}")
    
    for epoch in range(epochs):
        total_loss = 0
        optimizer.zero_grad()  # Zero gradients at the beginning of each epoch
        
        for step, batch in enumerate(dataloader):
            # Move batch to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device) if "labels" in batch else input_ids.clone()
            
            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            # Scale the loss according to gradient accumulation steps
            loss = outputs.loss / gradient_accumulation_steps
            total_loss += loss.item() * gradient_accumulation_steps
            
            # Backward pass
            loss.backward()
            
            # Update weights only after accumulating gradients for specified steps
            if (step + 1) % gradient_accumulation_steps == 0 or step == len(dataloader) - 1:
                optimizer.step()
                optimizer.zero_grad()
                
                # Print progress
                if (step + 1) % (gradient_accumulation_steps * 10) == 0:
                    avg_loss = total_loss / (step + 1)
                    print(f"Epoch {epoch+1}/{epochs} | Step {step+1}/{len(dataloader)} | Loss: {avg_loss:.4f}")
        
        # Print epoch results
        avg_epoch_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{epochs} completed | Avg Loss: {avg_epoch_loss:.4f}")
    
    return model

In [23]:
# ---------------------------------------------------------
# Part 6: Function to Save Only the LoRA Parameters
# ---------------------------------------------------------
def save_lora_state(model, path):
    lora_state = OrderedDict()
    for name, module in model.named_modules():
        if isinstance(module, SimpleLoRALayer):
            # Save A and B matrices
            lora_state[f"{name}.lora_A.weight"] = module.lora_A.weight.data
            lora_state[f"{name}.lora_B.weight"] = module.lora_B.weight.data
    
    torch.save(lora_state, path)
    print(f"Saved LoRA weights to {path}")


In [24]:
# ---------------------------------------------------------
# Part 7: Main Execution Function
# ---------------------------------------------------------
import torch

# Clear the GPU memory cache
torch.cuda.empty_cache()
import os
# Set environment variable for better memory management
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

def main():
    # Authenticate with Hugging Face
    login(token="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")
    
    # Initialize model and tokenizer
    model_id = "Qwen/Qwen2.5-3B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token

    # Load model with memory optimizations
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Use half precision to save memory
    dtype = torch.float32  # Changed from float32 to float32
    
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,  # Added for memory optimization
    ).to(device)
    
    # Enable gradient checkpointing to save memory
    if hasattr(model, "gradient_checkpointing_enable"):
        model.gradient_checkpointing_enable()

    # Apply custom LoRA to the model
    lora_model = add_lora_to_model(
        model,
        target_module_names=["q_proj", "k_proj", "v_proj", "out_proj"]
    )

    # Load dataset - use an even smaller subset
    dataset = load_dataset("Abirate/english_quotes")
    train_data = dataset["train"].train_test_split(test_size=0.95)["train"]  # Using only 5% of data
    val_data = dataset["train"].train_test_split(test_size=0.1)["test"].select(range(min(50, len(dataset["train"].train_test_split(test_size=0.1)["test"]))))
    
    # Create custom datasets with smaller sequence length
    train_dataset = QuotesDataset(train_data, tokenizer, max_length=64)  # Reduced from 128
    val_dataset = QuotesDataset(val_data, tokenizer, max_length=64)
    
    # Create dataloaders with smaller batch size
    train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)  # Reduced from 4
    val_dataloader = DataLoader(val_dataset, batch_size=1)
    
    # Train with our modified function that supports gradient accumulation
    print(f"Training on {device}...")
    train_with_sgd(
        lora_model, 
        train_dataloader, 
        device=device, 
        epochs=1, 
        lr=0.0001,
        gradient_accumulation_steps=4  # Accumulate gradients over 4 steps (effective batch size = 4)
    )
    
    # Save LoRA weights
    save_lora_state(lora_model, "lora_weights.pt")


if __name__ == "__main__":
    main()

Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.77 GiB is allocated by PyTorch, and 1.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
"""
Step 1: Merge your LoRA weights with the base model
This script merges your trained LoRA weights with the base model
"""

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
from collections import OrderedDict

def apply_lora_weights(base_model, lora_weights_path):
    """Applies the saved LoRA weights to the base model"""
    print(f"Loading LoRA weights from {lora_weights_path}")
    lora_state = torch.load(lora_weights_path, map_location='cpu')
    
    # Track which modules have A and B matrices
    lora_modules = {}
    
    # First pass: identify all modules with LoRA weights
    for lora_key in lora_state.keys():
        module_path = lora_key.rsplit(".", 2)[0]  # Get module path without lora_A/B.weight
        if module_path not in lora_modules:
            lora_modules[module_path] = {'A': None, 'B': None}
            
        if ".lora_A.weight" in lora_key:
            lora_modules[module_path]['A'] = lora_state[lora_key]
        elif ".lora_B.weight" in lora_key:
            lora_modules[module_path]['B'] = lora_state[lora_key]
    
    # Second pass: apply merged weights
    for module_path, matrices in lora_modules.items():
        if matrices['A'] is None or matrices['B'] is None:
            print(f"Incomplete LoRA matrices for {module_path}")
            continue
        
        # Get the base module name 
        base_module_name = module_path
        
        # Navigate to the module
        try:
            parts = base_module_name.split('.')
            module = base_model
            for part in parts:
                if part:
                    module = getattr(module, part)
            
            # Access the original_module for SimpleLoRALayer
            if hasattr(module, 'original_module'):
                module = module.original_module
                
            # This is now the linear layer we want to modify
            if hasattr(module, 'weight'):
                # Compute LoRA weight adjustment: B·A
                a_matrix = matrices['A']
                b_matrix = matrices['B']
                scaling = 16 / 8  # alpha/rank
                
                # Compute the delta and add to base weights
                delta = torch.matmul(b_matrix, a_matrix) * scaling
                
                # Add to the model weights
                with torch.no_grad():
                    module.weight.copy_(module.weight + delta)
                
                print(f"Applied LoRA weights to {base_module_name}")
        except (AttributeError, KeyError) as e:
            print(f"Error applying LoRA weights to {base_module_name}: {e}")
    
    return base_model

def export_to_huggingface_format(model, tokenizer, output_dir):
    """Export the model to HuggingFace format for conversion"""
    os.makedirs(output_dir, exist_ok=True)
    
    print(f"Saving model to {output_dir}")
    model.save_pretrained(output_dir, safe_serialization=True)
    tokenizer.save_pretrained(output_dir)
    
    return output_dir

def main():
    """Merge LoRA weights and export model"""
    # Authenticate with Hugging Face if needed
    # login(token="your_token_here")  # Uncomment if needed
    
    # Load the base model
    model_id = "Qwen/Qwen2.5-1B-Instruct"
    print(f"Loading base model {model_id}")
    
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,  # Load in float16 for efficiency
        device_map="auto"
    )
    
    # Apply LoRA weights
    lora_weights_path = "lora_weights.pt"
    if os.path.exists(lora_weights_path):
        model = apply_lora_weights(model, lora_weights_path)
    else:
        print(f"LoRA weights file {lora_weights_path} not found. Using base model.")
    
    # Export to HuggingFace format
    output_dir = "merged_model"
    export_to_huggingface_format(model, tokenizer, output_dir)
    
    print("Model successfully merged and exported to 'merged_model' directory.")
    print("Now follow the manual conversion steps in the instructions.")

if __name__ == "__main__":
    main()

Loading base model Qwen/Qwen2.5-1B-Instruct


OSError: Qwen/Qwen2.5-1B-Instruct is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
from huggingface_hub import login

def load_lora_state(model, path):
    lora_state = torch.load(path)
    
    # Keep track of successfully loaded parameters
    loaded_params = 0
    total_params = len(lora_state)
    
    for name, param in lora_state.items():
        # Split the parameter name to find the corresponding module
        parts = name.split('.')
        module_name = '.'.join(parts[:-2])  # Get parent module name
        matrix_type = parts[-2]  # 'lora_A' or 'lora_B'
        
        # Find the module in the model
        module = model
        found = True
        for part in module_name.split('.'):
            if hasattr(module, part):
                module = getattr(module, part)
            else:
                print(f"Warning: Could not find part '{part}' in module path '{module_name}'")
                found = False
                break
        
        if not found:
            continue
            
        # Check if this module has LoRA attributes
        if hasattr(module, 'lora_A') and hasattr(module, 'lora_B'):
            if matrix_type == 'lora_A':
                module.lora_A.weight.data = param
                loaded_params += 1
            elif matrix_type == 'lora_B':
                module.lora_B.weight.data = param
                loaded_params += 1
        else:
            print(f"Warning: Module '{module_name}' doesn't have proper LoRA attributes")
    
    print(f"Successfully loaded {loaded_params}/{total_params} LoRA parameters")
    
    # Make sure LoRA modules are in eval mode
    model.eval()
    return model

def generate_answer(model, tokenizer, question, max_length=50):
    model.eval()
    
    # Format the input with a prompt
    prompt = f"Question: {question}\nAnswer:"
    
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=False,  # Changed from max_length padding
        truncation=True
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_length,
            temperature=0.7,
            do_sample=True,
            top_k=50,
            top_p=0.95,  # Add nucleus sampling
            repetition_penalty=1.2,  # Add repetition penalty to avoid loops
            no_repeat_ngram_size=3,  # Prevent repeating 3-grams
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    # Get only the newly generated tokens
    answer_tokens = outputs[0][inputs.input_ids.shape[1]:]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    
    return answer.strip()

# Example usage
def test_model():
    # Login to Hugging Face Hub with your token
    login(token="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")
    
    # Load base model with the token
    model_id = "facebook/opt-350m"
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
    model = AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=True).to("cuda")
    
    # Add LoRA layers
    lora_model = add_lora_to_model(model)
    
    # Load trained LoRA weights
    load_lora_state(lora_model, "lora_weights.pt")
    
    # Test questions
    questions = [
        "post-training has emerged as an important component of?"
    ]
    
    for question in questions:
        answer = generate_answer(lora_model, tokenizer, question)
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")

# Add this to the main execution
if __name__ == "__main__":
    # main()  # Comment this out after training
    test_model()

HTTPError: Invalid user token. If you didn't pass a user token, make sure you are properly logged in by executing `huggingface-cli login`, and if you did pass a user token, double-check it's correct.