# Install All Required Dependencies

In [1]:
!pip install transformers bitsandbytes accelerate peft pandas torch scikit-learn numpy tqdm -q
!pip install -U -q bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m89.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependen

#  Experiment 1: Tune LoRA for Higher Capacity
This experiment uses cross-validation to fine-tune the model with an increased LoRA rank and alpha.

In [2]:
%%writefile train_s1_lora_capacity.py

# STRATEGY 1: Fine-tuning with a higher LoRA capacity.
# - r = 16, lora_alpha = 32
# - LoRA targets all linear layers.

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Imports for Multi-GPU Distributed Data Parallel (DDP)
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group

from transformers import (
    AutoModelForCausalLM, AutoTokenizer, AutoConfig, get_cosine_schedule_with_warmup, BitsAndBytesConfig
)
from peft import get_peft_model, LoraConfig, TaskType

# Disable the Hugging Face tokenizers library's own parallelism; set before the
# tokenizer below is created.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Constants ---
# Hub id handed to the tokenizer, config and model loaders.
MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# Number of StratifiedKFold splits; main() trains one model per fold.
NUM_FOLDS = 3
# Passes over each fold's training data.
NUM_EPOCHS = 5
# Batch size of each process's training DataLoader (validation uses BATCH_SIZE * 2).
BATCH_SIZE = 4
# Batches whose gradients are accumulated before one optimizer step.
GRAD_ACCUM_STEPS = 4
# Every prompt is padded/truncated to this many tokens.
MAX_LEN = 512
# Seeds torch, numpy and the fold split.
SEED = 42

# --- Tokenizer Setup (Global) ---
# Module-level so train_process can use it without receiving it as an argument.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Fall back to the EOS token for padding when the checkpoint defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the left: Net.forward classifies from the hidden state at the last
# sequence position, so that position must hold a real token, not padding.
tokenizer.padding_side = 'left'

# --- Dataset Class ---
class MathDataset(Dataset):
    """Map-style dataset pairing each prompt with its class label."""

    def __init__(self, prompts, targets):
        # Parallel sequences: targets[i] is the label of prompts[i].
        self.prompts, self.targets = prompts, targets

    def __len__(self):
        """Sample count, taken from the label sequence."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the ``(prompt, target)`` tuple stored at ``idx``."""
        prompt = self.prompts[idx]
        target = self.targets[idx]
        return prompt, target

# --- Model Definition ---
class Net(nn.Module):
    """4-bit quantized causal LM with LoRA adapters and an 8-way linear head.

    Args:
        model_path: Checkpoint id/path for the config and the backbone.
        rank: Device index the backbone is loaded onto (passed as ``device_map``).
    """

    def __init__(self, model_path, rank):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_path)

        # NF4 weights with double quantization; matmuls are computed in bfloat16.
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        base_lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=quant_cfg,
            device_map=rank,
            torch_dtype=torch.bfloat16,
            use_cache=False,
        )

        # OPTIMIZATION: higher-capacity LoRA (r=16, alpha=32) on every attention
        # and MLP projection.
        lora_cfg = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            bias='none',
            inference_mode=False,
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
        )
        self.backbone = get_peft_model(base_lm, lora_cfg)
        # Classification head mapping one hidden vector to 8 class logits.
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return class logits computed from the last position of the final layer.

        ``x`` is the tokenizer output, unpacked as keyword arguments to the backbone.
        """
        backbone_out = self.backbone(**x, output_hidden_states=True)
        final_layer = backbone_out.hidden_states[-1]
        return self.head(final_layer[:, -1, :])

# --- DDP Setup function ---
def ddp_setup(rank, world_size):
    """Join this process to the NCCL process group and select its GPU.

    Args:
        rank: Index of this process; also used as the CUDA device index.
        world_size: Total number of processes in the group.
    """
    # Single-node rendezvous point shared by every worker.
    rendezvous = {'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'}
    os.environ.update(rendezvous)
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

# --- Main Training Function (for each process) ---
def train_process(rank, world_size, fold, train_index, val_index, all_prompts, all_targets):
    """Train and validate one cross-validation fold inside a single DDP worker.

    Args:
        rank: Process index; also used as the CUDA device index.
        world_size: Number of processes in the group.
        fold: Zero-based fold number (used in log lines and checkpoint names).
        train_index: Positions in ``all_prompts``/``all_targets`` to train on.
        val_index: Positions held out for validation.
        all_prompts: All prompt strings.
        all_targets: Class label of every prompt.

    Side effects:
        Joins and later destroys the process group. On rank 0, whenever the
        validation micro-F1 improves, saves the backbone to
        ``qwen2_s1_backbone_fold_{fold}_best`` and the head weights to
        ``qwen2_s1_head_fold_{fold}_best.pt``.
    """
    # Function-scope import: only needed for the gradient-sync switch below.
    from contextlib import nullcontext

    ddp_setup(rank, world_size)
    if rank == 0:
        print(f"--- Starting Fold {fold+1}/{NUM_FOLDS} on {world_size} GPUs ---")

    train_prompts = [all_prompts[i] for i in train_index]
    val_prompts = [all_prompts[i] for i in val_index]
    train_targets = [all_targets[i] for i in train_index]
    val_targets = [all_targets[i] for i in val_index]

    # Inverse class frequency; np.unique returns counts in sorted label order.
    # NOTE(review): assumes every class the head predicts occurs in this fold.
    class_weights = 1 / (np.unique(train_targets, return_counts=True)[1] / len(train_targets))
    class_weights = torch.tensor(class_weights, dtype=torch.float32).to(rank)

    train_dataset = MathDataset(train_prompts, train_targets)
    val_dataset = MathDataset(val_prompts, val_targets)

    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)

    # Validation runs on rank 0 only, so it uses a plain (unsharded) loader.
    val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)

    model = Net(MODEL_PATH, rank).to(rank)
    if rank == 0:
        # Print model size
        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Trainable model parameters: {total_params/1_000_000:.2f}M")

    model = DDP(model, device_ids=[rank])

    # An epoch need not divide evenly into accumulation groups. The trailing
    # partial group gets its own optimizer step, so its gradients are neither
    # dropped nor carried over into the next epoch's first update.
    num_batches = len(train_loader)
    tail = num_batches % GRAD_ACCUM_STEPS
    updates_per_epoch = num_batches // GRAD_ACCUM_STEPS + (1 if tail else 0)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer=optimizer, num_warmup_steps=0,
        num_training_steps=updates_per_epoch * NUM_EPOCHS
    )
    scaler = torch.amp.GradScaler('cuda')
    best_f1 = 0.0

    for epoch in range(NUM_EPOCHS):
        # Reseed the sampler so each epoch sees a different shuffle.
        train_sampler.set_epoch(epoch)
        model.train()

        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader

        for step, (batch_prompts, batch_targets) in enumerate(pbar):
            # Size of the accumulation group this batch belongs to.
            in_tail = step >= num_batches - tail
            group_size = tail if in_tail else GRAD_ACCUM_STEPS
            is_update_step = (step + 1) % GRAD_ACCUM_STEPS == 0 or step + 1 == num_batches

            encodings = tokenizer(
                batch_prompts, return_tensors='pt', padding='max_length',
                truncation=True, max_length=MAX_LEN
            ).to(rank)
            batch_targets = batch_targets.long().to(rank)

            # Gradients only need to be all-reduced on the batch that triggers
            # the optimizer step; earlier batches accumulate locally.
            sync_ctx = nullcontext() if is_update_step else model.no_sync()
            with sync_ctx:
                with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                    logits = model(encodings)
                    loss = nn.functional.cross_entropy(logits, batch_targets, weight=class_weights)
                    loss = loss / group_size

                scaler.scale(loss).backward()

            if is_update_step:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

        if rank == 0:
            model.eval()
            all_preds, all_labels = [], []
            with torch.no_grad():
                for batch_prompts, batch_targets in tqdm(val_loader, desc="Validating"):
                    encodings = tokenizer(
                        batch_prompts, return_tensors='pt', padding='max_length',
                        truncation=True, max_length=MAX_LEN
                    ).to(rank)
                    with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                        # Only rank 0 runs this branch, so call the wrapped
                        # module directly rather than the DDP wrapper.
                        logits = model.module(encodings)
                        preds = torch.argmax(logits, dim=1).cpu().tolist()
                    all_preds.extend(preds)
                    all_labels.extend(batch_targets.tolist())

            f1 = f1_score(all_labels, all_preds, average='micro')
            print(f'  Fold {fold+1} | Epoch {epoch+1} | Validation F1-micro: {f1:.4f}')

            if f1 > best_f1:
                best_f1 = f1
                print(f"  New best F1 score: {best_f1:.4f}. Saving model...")
                model.module.backbone.save_pretrained(f'qwen2_s1_backbone_fold_{fold}_best')
                torch.save(model.module.head.state_dict(), f'qwen2_s1_head_fold_{fold}_best.pt')

    destroy_process_group()

# --- Launcher Function ---
def main():
    """Run every cross-validation fold, spawning one DDP worker per visible GPU.

    Reads the competition training CSV, builds one chat-formatted prompt per
    problem, creates the stratified folds and launches ``train_process`` for
    each fold. Returns early, without training, when no GPU is visible.
    """
    print("PyTorch version:", torch.__version__)
    world_size = torch.cuda.device_count()
    print(f"Using {world_size} GPUs.")
    if world_size == 0:
        # Training uses the NCCL backend and one process per GPU; without a GPU
        # nothing can be trained, so stop before the success banner is reached.
        print("No GPUs found. Please run on a GPU-enabled instance.")
        return

    torch.manual_seed(SEED)
    np.random.seed(SEED)

    df = pd.read_csv('/kaggle/input/classification-of-math-problems-by-kasut-academy/train.csv')
    # Normalise the header: first column is the problem text, second the label.
    df.columns = ['problem', 'target']

    prompts = [
        f"""<|im_start|>user
Your task is to classify each Math problem into one of these eight topics using a machine learning or NLP-based approach.
0: Algebra
1: Geometry and Trigonometry
2: Calculus and Analysis
3: Probability and Statistics
4: Number Theory
5: Combinatorics and Discrete Math
6: Linear Algebra
7: Abstract Algebra and Topology

Your answer should be an integer that assigns the most appropriate topic category to the given Math problem based on its content and required reasoning.

Math Problem: {p.strip()}
<|im_end|>
<|im_start|>assistant"""
        for p in df['problem']
    ]
    targets = df['target'].tolist()

    # Materialise all splits up front so every fold uses the same seeded partition.
    skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
    splits = list(skf.split(prompts, targets))

    for fold in range(NUM_FOLDS):
        train_index, val_index = splits[fold]
        # mp.spawn prepends the process rank to these arguments.
        args = (world_size, fold, train_index, val_index, prompts, targets)
        mp.spawn(train_process, args=args, nprocs=world_size)

    print("\n--- All folds completed successfully! ---")

if __name__ == '__main__':
    main()

Writing train_s1_lora_capacity.py


In [3]:
!python train_s1_lora_capacity.py

2025-06-21 21:18:05.745022: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750540685.971664      57 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750540686.041554      57 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
tokenizer_config.json: 100%|███████████████| 1.29k/1.29k [00:00<00:00, 9.96MB/s]
vocab.json: 100%|██████████████████████████| 2.78M/2.78M [00:00<00:00, 38.9MB/s]
merges.txt: 100%|██████████████████████████| 1.67M/1.67M [00:00<00:00, 8.56MB/s]
tokenizer.json: 100%|██████████████████████| 7.03M/7.03M [00:00<00:00, 27.0MB/s]
PyTorch version: 2.5.1+cu124
Using 4 GPUs.
2025-06-21 21:18:26.628939: E external/local_xla/xla/stream_e

# Experiment 2: Full Train Dataset on 4-bit Model
This experiment trains on the entire dataset without cross-validation using the best hyperparameters found previously.

In [4]:
%%writefile train_final_model.py

# This script trains the final model on the entire dataset using the best-performing hyperparameters.
# - LoRA r=16, lora_alpha=32, targeting all linear layers.
# - No cross-validation; uses 100% of the data for training.
# - Designed for a multi-GPU (DDP) setup.

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score

# Imports for Multi-GPU Distributed Data Parallel (DDP)
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group

from transformers import (
    AutoModelForCausalLM, AutoTokenizer, AutoConfig, get_cosine_schedule_with_warmup, BitsAndBytesConfig
)
from peft import get_peft_model, LoraConfig, TaskType

# Disable the Hugging Face tokenizers library's own parallelism; set before the
# tokenizer below is created.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Constants ---
# Hub id handed to the tokenizer, config and model loaders.
MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# We train for 5 epochs as this appeared to be the sweet spot in our experiments
NUM_EPOCHS = 5
# Batch size of each process's training DataLoader.
BATCH_SIZE = 4
# Batches whose gradients are accumulated before one optimizer step.
GRAD_ACCUM_STEPS = 4
# Every prompt is padded/truncated to this many tokens.
MAX_LEN = 512
# Seeds torch and numpy in main().
SEED = 42

# --- Tokenizer Setup (Global) ---
# Module-level so train_process can use it without receiving it as an argument.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Fall back to the EOS token for padding when the checkpoint defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the left: Net.forward classifies from the hidden state at the last
# sequence position, so that position must hold a real token, not padding.
tokenizer.padding_side = 'left'

# --- Dataset Class ---
class MathDataset(Dataset):
    """Indexable collection of ``(prompt, target)`` training pairs."""

    def __init__(self, prompts, targets):
        # The two sequences are aligned by position.
        self.targets = targets
        self.prompts = prompts

    def __len__(self):
        """Dataset size, measured on the target sequence."""
        n_samples = len(self.targets)
        return n_samples

    def __getitem__(self, idx):
        """Fetch the prompt and its label at position ``idx`` as a tuple."""
        return (self.prompts[idx], self.targets[idx])

# --- Model Definition (Using the winning LoRA configuration) ---
class Net(nn.Module):
    """Classifier: 4-bit quantized causal LM + LoRA adapters + 8-way linear head.

    Args:
        model_path: Checkpoint id/path for the config and the backbone.
        rank: Device index the backbone is loaded onto (passed as ``device_map``).
    """

    def __init__(self, model_path, rank):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_path)

        # NF4 4-bit weights, double quantization, bfloat16 compute.
        four_bit = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        quantized_lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=four_bit,
            device_map=rank,
            torch_dtype=torch.bfloat16,
            use_cache=False,
        )

        # Using the best-performing LoRA configuration: adapters on all
        # attention and MLP projections.
        lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
        adapter_cfg = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=lora_targets,
            bias='none',
            inference_mode=False,
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
        )
        self.backbone = get_peft_model(quantized_lm, adapter_cfg)
        # Linear map from one hidden vector to the 8 class logits.
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Compute class logits from the final layer's last sequence position.

        ``x`` is the tokenizer output, unpacked as keyword arguments to the backbone.
        """
        result = self.backbone(**x, output_hidden_states=True)
        last_token_state = result.hidden_states[-1][:, -1, :]
        logits = self.head(last_token_state)
        return logits

# --- DDP Setup function ---
def ddp_setup(rank, world_size):
    """Initialise the NCCL process group for this worker and bind it to GPU ``rank``.

    Args:
        rank: Index of this process; also used as the CUDA device index.
        world_size: Total number of processes in the group.
    """
    # Fixed single-node rendezvous address and port.
    for key, value in (('MASTER_ADDR', 'localhost'), ('MASTER_PORT', '12355')):
        os.environ[key] = value
    init_process_group(world_size=world_size, rank=rank, backend="nccl")
    torch.cuda.set_device(rank)

# --- Main Training Function (for each process) ---
def train_process(rank, world_size, all_prompts, all_targets):
    """Train the final model on the full dataset inside a single DDP worker.

    Args:
        rank: Process index; also used as the CUDA device index.
        world_size: Number of processes in the group.
        all_prompts: All prompt strings.
        all_targets: Class label of every prompt.

    Side effects:
        Joins and later destroys the process group. After the last epoch,
        rank 0 saves the backbone to ``final_model_backbone`` and the head
        weights to ``final_model_head.pt``.
    """
    # Function-scope import: only needed for the gradient-sync switch below.
    from contextlib import nullcontext

    ddp_setup(rank, world_size)
    if rank == 0:
        print(f"--- Starting final training on {world_size} GPUs ---")

    # Use the entire dataset for training
    train_dataset = MathDataset(all_prompts, all_targets)

    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)

    model = Net(MODEL_PATH, rank).to(rank)
    if rank == 0:
        # Print model size
        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Trainable model parameters: {total_params/1_000_000:.2f}M")
    model = DDP(model, device_ids=[rank])

    # An epoch need not divide evenly into accumulation groups. The trailing
    # partial group gets its own optimizer step, so its gradients are neither
    # dropped nor carried over into the next epoch's first update.
    num_batches = len(train_loader)
    tail = num_batches % GRAD_ACCUM_STEPS
    updates_per_epoch = num_batches // GRAD_ACCUM_STEPS + (1 if tail else 0)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer=optimizer, num_warmup_steps=0,
        num_training_steps=updates_per_epoch * NUM_EPOCHS
    )
    scaler = torch.amp.GradScaler('cuda')

    for epoch in range(NUM_EPOCHS):
        # Reseed the sampler so each epoch sees a different shuffle.
        train_sampler.set_epoch(epoch)
        model.train()

        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader

        for step, (batch_prompts, batch_targets) in enumerate(pbar):
            # Size of the accumulation group this batch belongs to.
            in_tail = step >= num_batches - tail
            group_size = tail if in_tail else GRAD_ACCUM_STEPS
            is_update_step = (step + 1) % GRAD_ACCUM_STEPS == 0 or step + 1 == num_batches

            encodings = tokenizer(
                batch_prompts, return_tensors='pt', padding='max_length',
                truncation=True, max_length=MAX_LEN
            ).to(rank)
            batch_targets = batch_targets.long().to(rank)

            # Gradients only need to be all-reduced on the batch that triggers
            # the optimizer step; earlier batches accumulate locally.
            sync_ctx = nullcontext() if is_update_step else model.no_sync()
            with sync_ctx:
                with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                    logits = model(encodings)
                    # Unweighted loss: this final run deliberately omits class weights.
                    loss = nn.functional.cross_entropy(logits, batch_targets)
                    loss = loss / group_size

                scaler.scale(loss).backward()

            if is_update_step:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

    # Save the final model only on the main process
    if rank == 0:
        print("Training complete. Saving final model...")
        model.module.backbone.save_pretrained('final_model_backbone')
        torch.save(model.module.head.state_dict(), 'final_model_head.pt')
        print("Final model saved to 'final_model_backbone/' and 'final_model_head.pt'")

    destroy_process_group()

# --- Launcher Function ---
def main():
    """Train the final model on all labelled data, one DDP worker per visible GPU.

    Reads the competition training CSV, builds one chat-formatted prompt per
    problem and launches ``train_process`` once. Returns early, without
    training, when no GPU is visible.
    """
    print("PyTorch version:", torch.__version__)
    world_size = torch.cuda.device_count()
    print(f"Using {world_size} GPUs for final training run.")
    if world_size == 0:
        # Training uses the NCCL backend and one process per GPU; without a GPU
        # nothing can be trained, so stop before the success banner is reached.
        print("No GPUs found. Please run on a GPU-enabled instance.")
        return

    torch.manual_seed(SEED)
    np.random.seed(SEED)

    df = pd.read_csv('/kaggle/input/classification-of-math-problems-by-kasut-academy/train.csv')
    # Normalise the header: first column is the problem text, second the label.
    df.columns = ['problem', 'target']

    prompts = [
        f"""<|im_start|>user
Your task is to classify each Math problem into one of these eight topics using a machine learning or NLP-based approach.
0: Algebra
1: Geometry and Trigonometry
2: Calculus and Analysis
3: Probability and Statistics
4: Number Theory
5: Combinatorics and Discrete Math
6: Linear Algebra
7: Abstract Algebra and Topology

Your answer should be an integer that assigns the most appropriate topic category to the given Math problem based on its content and required reasoning.

Math Problem: {p.strip()}
<|im_end|>
<|im_start|>assistant"""
        for p in df['problem']
    ]
    targets = df['target'].tolist()

    # No cross-validation split, we use all data for training
    # (mp.spawn prepends the process rank to these arguments).
    args = (world_size, prompts, targets)
    mp.spawn(train_process, args=args, nprocs=world_size)

    print("\n--- Final model training completed successfully! ---")

if __name__ == '__main__':
    main()

Writing train_final_model.py


In [5]:
!python train_final_model.py

2025-06-21 22:14:51.918398: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750544091.941072    1348 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750544091.948227    1348 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
PyTorch version: 2.5.1+cu124
Using 4 GPUs for final training run.
2025-06-21 22:15:00.343781: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750544100.365968    1416 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has alre

# Splitting Data for Final Evaluation (90:10)
This step creates the training and validation sets that will be used for the remaining experiments.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the competition training CSV (valid inside the Kaggle notebook).
train_csv_path = '/kaggle/input/classification-of-math-problems-by-kasut-academy/train.csv'

# NOTE: any FileNotFoundError raised inside this block, including one from the
# to_csv calls, is reported by the handler below as a missing input file.
try:
    df = pd.read_csv(train_csv_path)
    # Force known column names regardless of the CSV header
    # (assumes exactly two columns: problem text, then label).
    df.columns = ['problem', 'target']

    # Split the data (90% for training, 10% for validation)
    # stratify=df['target'] ensures both sets have a similar distribution of categories
    # random_state=42 makes the split reproducible.
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df['target'])

    # Written with relative paths, i.e. into the current working directory;
    # the later training scripts read these same file names.
    train_df.to_csv('final_train_set.csv', index=False)
    val_df.to_csv('final_validation_set.csv', index=False)

    print("Successfully created 'final_train_set.csv' and 'final_validation_set.csv'")
    print(f"Final training set size: {len(train_df)} rows")
    print(f"Final validation set size: {len(val_df)} rows")

except FileNotFoundError:
    print(f"Error: Could not find the file at '{train_csv_path}'. Please double-check the path.")

Successfully created 'final_train_set.csv' and 'final_validation_set.csv'
Final training set size: 9170 rows
Final validation set size: 1019 rows


# Experiment 3: Train on 90% and Evaluate on 10% (4-bit)
This script trains on the 90% split and evaluates on the 10% hold-out set to get a final performance metric.

In [7]:
%%writefile train_and_evaluate_final.py

# This script performs the final training and evaluation.
# 1. Trains the best model config (r=16, alpha=32) on 90% of the data.
# 2. Saves the final model.
# 3. Loads the saved model and evaluates it on the 10% hold-out set.

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, classification_report

# Imports for DDP
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group

# Imports from Hugging Face Transformers
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig,
    get_cosine_schedule_with_warmup
)
from peft import PeftModel, get_peft_model, LoraConfig, TaskType

# Disable the Hugging Face tokenizers library's own parallelism; set before the
# tokenizer below is created.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
# Hub id handed to the tokenizer, config and model loaders.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# The 90% / 10% splits written by the data-splitting cell.
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
# Where train_process saves, and evaluate_model reloads, the adapter and head.
# NOTE(review): these resolve to the same locations that train_final_model.py
# writes ('final_model_backbone', 'final_model_head.pt'), so running this
# script overwrites that model's artifacts. Confirm this is intended.
FINAL_BACKBONE_PATH = './final_model_backbone/'
FINAL_HEAD_PATH = './final_model_head.pt'

# Passes over the training split.
NUM_EPOCHS = 5
# Batch size of each process's training DataLoader (evaluation uses BATCH_SIZE * 2).
BATCH_SIZE = 4
# Batches whose gradients are accumulated before one optimizer step.
GRAD_ACCUM_STEPS = 4
# Every prompt is padded/truncated to this many tokens.
MAX_LEN = 512
# Seeds torch and numpy in main().
SEED = 42

# --- Tokenizer Setup ---
# Module-level so train_process and evaluate_model share one instance.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
# Fall back to the EOS token for padding when the checkpoint defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the left: both Net.forward and evaluate_model read the hidden state at
# the last sequence position, so that position must hold a real token.
tokenizer.padding_side = 'left'

# --- Dataset Class ---
class MathDataset(Dataset):
    """Dataset of dict samples: always ``prompt``, plus ``target`` when labels exist."""

    def __init__(self, prompts, targets=None):
        # targets stays None for unlabelled data.
        self.prompts, self.targets = prompts, targets

    def __len__(self):
        """Sample count, taken from the prompt sequence."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Return ``{"prompt": ...}``, extended with ``"target"`` if labels were given."""
        if self.targets is None:
            return {"prompt": self.prompts[idx]}
        return {"prompt": self.prompts[idx], "target": self.targets[idx]}

# --- Model Definition ---
class Net(nn.Module):
    """4-bit quantized causal LM with LoRA adapters and an 8-way linear head.

    Args:
        model_path: Checkpoint id/path for the config and the backbone.
        rank: Device index the backbone is loaded onto (passed as ``device_map``).
    """

    def __init__(self, model_path, rank):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_path)

        # NF4 4-bit weights, double quantization, bfloat16 compute.
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        quantized_lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=quant_cfg,
            device_map=rank,
            torch_dtype=torch.bfloat16,
            use_cache=False,
        )

        # LoRA (r=16, alpha=32) on every attention and MLP projection.
        lora_cfg = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            bias='none',
            inference_mode=False,
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
        )
        self.backbone = get_peft_model(quantized_lm, lora_cfg)
        # Linear map from one hidden vector to the 8 class logits.
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return class logits from the final layer's last sequence position.

        ``x`` is the tokenizer output, unpacked as keyword arguments to the backbone.
        """
        outputs = self.backbone(**x, output_hidden_states=True)
        last_position = outputs.hidden_states[-1][:, -1, :]
        return self.head(last_position)

# --- DDP Setup ---
def ddp_setup(rank, world_size):
    """Register this worker with the NCCL process group and pick its GPU.

    Args:
        rank: Index of this process; also used as the CUDA device index.
        world_size: Total number of processes in the group.
    """
    # Fixed single-node rendezvous address and port.
    env = os.environ
    env['MASTER_ADDR'], env['MASTER_PORT'] = 'localhost', '12355'
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

# --- Training Process ---
def train_process(rank, world_size, train_prompts, train_targets):
    """Train the model on the training split inside a single DDP worker.

    Args:
        rank: Process index; also used as the CUDA device index.
        world_size: Number of processes in the group.
        train_prompts: Prompt strings of the training split.
        train_targets: Class label of every training prompt.

    Side effects:
        Joins and later destroys the process group. After the last epoch,
        rank 0 saves the backbone to ``FINAL_BACKBONE_PATH`` and the head
        weights to ``FINAL_HEAD_PATH``.
    """
    # Function-scope import: only needed for the gradient-sync switch below.
    from contextlib import nullcontext

    ddp_setup(rank, world_size)
    if rank == 0:
        print(f"--- Starting final training on {world_size} GPUs ---")

    train_dataset = MathDataset(train_prompts, train_targets)
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)

    model = Net(BASE_MODEL_PATH, rank).to(rank)
    if rank == 0:
        # Print model size
        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Trainable model parameters: {total_params/1_000_000:.2f}M")
    model = DDP(model, device_ids=[rank])

    # An epoch need not divide evenly into accumulation groups. The trailing
    # partial group gets its own optimizer step, so its gradients are neither
    # dropped nor carried over into the next epoch's first update.
    num_batches = len(train_loader)
    tail = num_batches % GRAD_ACCUM_STEPS
    updates_per_epoch = num_batches // GRAD_ACCUM_STEPS + (1 if tail else 0)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, num_warmup_steps=0,
        num_training_steps=updates_per_epoch * NUM_EPOCHS
    )
    scaler = torch.amp.GradScaler('cuda')

    for epoch in range(NUM_EPOCHS):
        # Reseed the sampler so each epoch sees a different shuffle.
        train_sampler.set_epoch(epoch)
        model.train()
        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader

        for step, batch in enumerate(pbar):
            # Size of the accumulation group this batch belongs to.
            in_tail = step >= num_batches - tail
            group_size = tail if in_tail else GRAD_ACCUM_STEPS
            is_update_step = (step + 1) % GRAD_ACCUM_STEPS == 0 or step + 1 == num_batches

            encodings = tokenizer(
                batch['prompt'], return_tensors='pt', padding='max_length',
                truncation=True, max_length=MAX_LEN
            ).to(rank)
            batch_targets = batch['target'].long().to(rank)

            # Gradients only need to be all-reduced on the batch that triggers
            # the optimizer step; earlier batches accumulate locally.
            sync_ctx = nullcontext() if is_update_step else model.no_sync()
            with sync_ctx:
                with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                    logits = model(encodings)
                    loss = nn.functional.cross_entropy(logits, batch_targets)
                    loss = loss / group_size
                scaler.scale(loss).backward()

            if is_update_step:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

    # Only the main process writes the artifacts.
    if rank == 0:
        print("\nTraining complete. Saving final model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
        print(f"Final model saved to '{FINAL_BACKBONE_PATH}' and '{FINAL_HEAD_PATH}'")

    destroy_process_group()

# --- Evaluation Process ---
def evaluate_model(val_prompts, val_targets):
    """Reload the saved adapter and head, then score them on the hold-out set.

    Args:
        val_prompts: Prompt strings of the validation split.
        val_targets: Integer class ids (0-7), aligned with ``val_prompts``.

    Prints micro/macro F1 and a per-class classification report; returns nothing.
    """
    print("\n--- Starting Evaluation ---")
    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
    # Pin the whole model to GPU 0: evaluation is meant to run on a single GPU,
    # whereas device_map="auto" may spread the layers over every visible GPU.
    base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, quantization_config=bnb_config, device_map={"": 0}, torch_dtype=torch.bfloat16)

    # Print model size
    total_params = sum(p.numel() for p in base_model.parameters())
    print(f"Loaded base model with {total_params/1_000_000_000:.2f}B parameters.")

    model_peft = PeftModel.from_pretrained(base_model, FINAL_BACKBONE_PATH)

    # Print model size after PEFT
    total_params_peft = sum(p.numel() for p in model_peft.parameters() if p.requires_grad)
    print(f"Trainable PEFT model parameters: {total_params_peft/1_000_000:.2f}M")

    target_names = ["0: Alg", "1: Geo", "2: Calc", "3: Prob", "4: Num", "5: Combo", "6: Lin.Alg", "7: Abs.Alg"]
    class_ids = list(range(len(target_names)))

    head = nn.Linear(base_model.config.hidden_size, len(target_names), bias=False)
    # Load onto CPU first (the checkpoint was saved from a CUDA tensor) and
    # restrict unpickling to tensors; the head is moved to the model's device next.
    head_state = torch.load(FINAL_HEAD_PATH, map_location='cpu', weights_only=True)
    head.load_state_dict(head_state)
    head.to(model_peft.device)
    model_peft.eval()
    head.eval()

    val_dataset = MathDataset(val_prompts, val_targets)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)
    all_preds = []

    print("Running inference on validation set...")
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(model_peft.device)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                outputs = model_peft(**encodings, output_hidden_states=True)
                # Same read-out as Net.forward: last position of the final layer.
                logits = head(outputs.hidden_states[-1][:, -1, :])
                preds = torch.argmax(logits, dim=1).cpu().tolist()
            all_preds.extend(preds)

    print("\n--- Final Evaluation Results ---")
    print(f"\nFinal F1 Score (Micro): {f1_score(val_targets, all_preds, average='micro'):.4f}")
    print(f"Final F1 Score (Macro): {f1_score(val_targets, all_preds, average='macro'):.4f}")
    print("\nFull Classification Report:")
    # Passing labels explicitly keeps the report aligned with target_names even
    # when some class is absent from both the targets and the predictions.
    print(classification_report(val_targets, all_preds, labels=class_ids, target_names=target_names))

# --- Launcher Function ---
def main():
    """Train on the 90% split with DDP, then evaluate on the 10% hold-out split.

    Returns early, with a message, when the split CSVs are missing or when no
    GPU is visible.
    """
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    try:
        df_train = pd.read_csv(TRAIN_DATA_PATH)
        df_val = pd.read_csv(VAL_DATA_PATH)
    except FileNotFoundError:
        print(f"Error: Make sure '{TRAIN_DATA_PATH}' and '{VAL_DATA_PATH}' exist.")
        print("You may need to run the data-splitting script first.")
        return

    # Normalise the headers: first column is the problem text, second the label.
    df_train.columns = ['problem', 'target']
    df_val.columns = ['problem', 'target']

    # Full instruction prompt, identical to the one used by the cross-validation
    # and full-dataset scripts. The previous template was elided to "...eight
    # topics..." and never showed the model the label definitions.
    # NOTE(review): any script whose scores are compared with this one should
    # use the same prompt.
    prompt_template = """<|im_start|>user
Your task is to classify each Math problem into one of these eight topics using a machine learning or NLP-based approach.
0: Algebra
1: Geometry and Trigonometry
2: Calculus and Analysis
3: Probability and Statistics
4: Number Theory
5: Combinatorics and Discrete Math
6: Linear Algebra
7: Abstract Algebra and Topology

Your answer should be an integer that assigns the most appropriate topic category to the given Math problem based on its content and required reasoning.

Math Problem: {problem}
<|im_end|>
<|im_start|>assistant"""

    train_prompts = [prompt_template.format(problem=p.strip()) for p in df_train['problem']]
    train_targets = df_train['target'].astype(int).tolist()
    val_prompts = [prompt_template.format(problem=p.strip()) for p in df_val['problem']]
    val_targets = df_val['target'].astype(int).tolist()

    world_size = torch.cuda.device_count()
    if world_size > 0:
        print(f"Using {world_size} GPUs for final training.")
        # mp.spawn prepends the process rank to these arguments.
        args = (world_size, train_prompts, train_targets)
        mp.spawn(train_process, args=args, nprocs=world_size)
        # Evaluation is done on a single GPU after training is complete
        evaluate_model(val_prompts, val_targets)
    else:
        print("No GPUs found. Please run on a GPU-enabled instance.")

if __name__ == '__main__':
    main()

Writing train_and_evaluate_final.py


In [8]:
!python train_and_evaluate_final.py

2025-06-21 22:32:12.575113: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750545132.597746    1885 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750545132.604620    1885 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for final training.
2025-06-21 22:32:20.905693: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750545140.927670    1953 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1

# Experiment 4: Train on 90% and Evaluate on 10% (8-bit)
This experiment repeats the process from Experiment 3 but uses 8-bit quantization instead of 4-bit.

In [9]:
%%writefile train_and_evaluate_final_8bit.py

# This script performs the final training and evaluation for the 8-BIT MODEL.
# 1. Trains the best model config (r=16, alpha=32) on 90% of the data.
# 2. Saves the final model.
# 3. Loads the saved model and evaluates it on the 10% hold-out set.

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, classification_report

# Imports for DDP
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group

from transformers import (
    AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig,
    get_cosine_schedule_with_warmup
)
from peft import PeftModel, get_peft_model, LoraConfig, TaskType

# NOTE(review): presumably set to silence the Hugging Face tokenizers
# parallelism warning once worker processes are spawned — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
# Model id used to load the tokenizer, the model config and the backbone.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# CSV inputs read by main(); their columns are renamed to ['problem', 'target'].
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
# Written by rank 0 at the end of train_process() and read back by evaluate_model().
FINAL_BACKBONE_PATH = './final_model_8bit_backbone/'
FINAL_HEAD_PATH = './final_model_8bit_head.pt'

NUM_EPOCHS = 5
BATCH_SIZE = 4  # DataLoader batch size in each training process; evaluation uses BATCH_SIZE*2
GRAD_ACCUM_STEPS = 4  # micro-batches accumulated before each optimizer step
MAX_LEN = 512  # every prompt is padded/truncated to this many tokens
SEED = 42  # passed to torch.manual_seed and np.random.seed in main()

# --- Tokenizer Setup ---
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Left padding puts the pad tokens in front, so position -1 (the position
# Net.forward() and evaluate_model() read) holds a prompt token, not padding.
tokenizer.padding_side = 'left'

# --- Dataset Class ---
class MathDataset(Dataset):
    """Map-style dataset pairing prompt strings with optional labels.

    Each item is a dict with a ``"prompt"`` key and, when ``targets`` was
    supplied, a ``"target"`` key taken from the same index.
    """

    def __init__(self, prompts, targets=None):
        self.prompts, self.targets = prompts, targets

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        sample = {"prompt": self.prompts[idx]}
        # Unlabeled datasets return the prompt only.
        if self.targets is None:
            return sample
        sample["target"] = self.targets[idx]
        return sample

# --- Model Definition ---
class Net(nn.Module):
    """8-bit quantized causal-LM backbone with LoRA adapters and a linear head.

    The head maps the backbone's hidden size to 8 outputs and has no bias.
    """

    def __init__(self, model_path, rank):
        """Build the model.

        Args:
            model_path: Model id/path given to ``from_pretrained``.
            rank: Passed as ``device_map`` when loading the backbone.
        """
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_path)

        # --- KEY CHANGE: Using 8-bit quantization ---
        quantization = BitsAndBytesConfig(load_in_8bit=True)
        base_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=quantization,
            device_map=rank,
            torch_dtype=torch.bfloat16,
            use_cache=False,
        )

        # LoRA (r=16, alpha=32) on the attention and MLP projection layers.
        lora_settings = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            bias='none',
            inference_mode=False,
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
        )
        self.backbone = get_peft_model(base_model, lora_settings)
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return head outputs computed from the last position of the final hidden state.

        ``x`` is unpacked as keyword arguments to the backbone (the callers
        pass the tokenizer output).
        """
        outputs = self.backbone(**x, output_hidden_states=True)
        final_layer = outputs.hidden_states[-1]
        # Index -1 along dim 1; presumably the sequence axis — confirm.
        last_position = final_layer[:, -1, :]
        return self.head(last_position)

# --- DDP Setup ---
def ddp_setup(rank, world_size):
    """Join the NCCL process group and select CUDA device ``rank``.

    Args:
        rank: Index of this process; also used as the CUDA device index.
        world_size: Total number of processes in the group.
    """
    # Rendezvous address/port are fixed: localhost:12355.
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'})
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

# --- Training Process ---
def train_process(rank, world_size, train_prompts, train_targets):
    """Train the 8-bit LoRA model as one DDP worker and save it from rank 0.

    Used as the ``mp.spawn`` target in ``main()``: each process sets up the
    process group, trains on its ``DistributedSampler`` shard for
    ``NUM_EPOCHS`` epochs with gradient accumulation, and rank 0 writes the
    backbone to ``FINAL_BACKBONE_PATH`` and the head to ``FINAL_HEAD_PATH``.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Total number of worker processes.
        train_prompts: Prompt strings to train on.
        train_targets: Labels aligned with ``train_prompts``.
    """
    ddp_setup(rank, world_size)

    # mp.spawn starts each worker in a fresh interpreter, so the seeds set in
    # main() do not carry over. Seed here so LoRA/head initialisation and
    # dropout are reproducible across runs.
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    if rank == 0:
        print(f"--- Starting final 8-bit training on {world_size} GPUs ---")

    train_dataset = MathDataset(train_prompts, train_targets)
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)

    model = Net(BASE_MODEL_PATH, rank).to(rank)
    if rank == 0:
        # Print model size
        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Trainable model parameters: {total_params/1_000_000:.2f}M")
    model = DDP(model, device_ids=[rank])

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
    # One scheduler step per optimizer step (i.e. per full accumulation window).
    steps_per_epoch = len(train_loader) // GRAD_ACCUM_STEPS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=steps_per_epoch * NUM_EPOCHS)
    scaler = torch.amp.GradScaler('cuda')

    for epoch in range(NUM_EPOCHS):
        train_sampler.set_epoch(epoch)
        model.train()
        # Discard gradients left over from an incomplete accumulation window at
        # the end of the previous epoch so they do not leak into this epoch's
        # first optimizer step.
        optimizer.zero_grad()

        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader

        for step, batch in enumerate(pbar):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Scale so the accumulated gradient is the mean over the window.
                loss = loss / GRAD_ACCUM_STEPS
            scaler.scale(loss).backward()
            if (step + 1) % GRAD_ACCUM_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

    if rank == 0:
        print("\nTraining complete. Saving final 8-bit model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
        print(f"Final 8-bit model saved to '{FINAL_BACKBONE_PATH}' and '{FINAL_HEAD_PATH}'")

    destroy_process_group()

# --- Evaluation Process ---
def evaluate_model(val_prompts, val_targets):
    """Load the saved 8-bit model and print metrics on the validation set.

    Loads the base model in 8-bit, applies the adapters stored at
    ``FINAL_BACKBONE_PATH``, restores the head from ``FINAL_HEAD_PATH``,
    predicts a class for every prompt and prints micro/macro F1 plus a
    per-class classification report.

    Args:
        val_prompts: Prompt strings to classify.
        val_targets: True labels aligned with ``val_prompts``.
    """
    print("\n--- Starting Evaluation of 8-bit Model ---")

    # --- KEY CHANGE: Using 8-bit quantization for evaluation model ---
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, quantization_config=bnb_config, device_map="auto", torch_dtype=torch.bfloat16)

    # Print model size
    total_params = sum(p.numel() for p in base_model.parameters())
    print(f"Loaded base model with {total_params/1_000_000_000:.2f}B parameters.")

    model_peft = PeftModel.from_pretrained(base_model, FINAL_BACKBONE_PATH)

    # Print model size after PEFT
    total_params_peft = sum(p.numel() for p in model_peft.parameters() if p.requires_grad)
    print(f"Trainable PEFT model parameters: {total_params_peft/1_000_000:.2f}M")

    head = nn.Linear(AutoConfig.from_pretrained(BASE_MODEL_PATH).hidden_size, 8, bias=False)
    head.load_state_dict(torch.load(FINAL_HEAD_PATH))
    head.to(model_peft.device)
    model_peft.eval()
    head.eval()

    val_dataset = MathDataset(val_prompts, val_targets)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)
    all_preds = []

    print("Running inference on validation set...")
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(model_peft.device)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                outputs = model_peft(**encodings, output_hidden_states=True)
                # Same read-out as Net.forward(): last position of the final hidden state.
                logits = head(outputs.hidden_states[-1][:, -1, :])
                preds = torch.argmax(logits, dim=1).cpu().tolist()
            all_preds.extend(preds)

    print("\n--- Final 8-bit Evaluation Results ---")
    target_names = ["0: Alg", "1: Geo", "2: Calc", "3: Prob", "4: Num", "5: Combo", "6: Lin.Alg", "7: Abs.Alg"]
    print(f"\nFinal F1 Score (Micro): {f1_score(val_targets, all_preds, average='micro'):.4f}")
    print(f"Final F1 Score (Macro): {f1_score(val_targets, all_preds, average='macro'):.4f}")
    print("\nFull Classification Report:")
    # Pass the label ids explicitly: without `labels`, classification_report
    # raises ValueError when a class is missing from both the targets and the
    # predictions, because the inferred label count no longer matches
    # target_names.
    class_ids = list(range(len(target_names)))
    print(classification_report(val_targets, all_preds, labels=class_ids, target_names=target_names, zero_division=0))

# --- Launcher Function ---
def main():
    """Load the CSVs, build prompts, train on every visible GPU, then evaluate.

    Returns early, after printing a message, when either CSV is missing or
    when no CUDA device is available.
    """
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    try:
        df_train = pd.read_csv(TRAIN_DATA_PATH)
        df_val = pd.read_csv(VAL_DATA_PATH)
    except FileNotFoundError:
        print(f"Error: Make sure '{TRAIN_DATA_PATH}' and '{VAL_DATA_PATH}' exist.")
        print("You may need to run the data-splitting script first.")
        return

    # Both frames get the same two column names, whatever the CSV header said.
    for frame in (df_train, df_val):
        frame.columns = ['problem', 'target']

    prompt_template = """<|im_start|>user
Your task is to classify each Math problem...
Math Problem: {problem}
<|im_end|>
<|im_start|>assistant"""

    def as_prompts(frame):
        return [prompt_template.format(problem=text.strip()) for text in frame['problem']]

    def as_labels(frame):
        return frame['target'].astype(int).tolist()

    train_prompts, train_targets = as_prompts(df_train), as_labels(df_train)
    val_prompts, val_targets = as_prompts(df_val), as_labels(df_val)

    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("No GPUs found. Please run on a GPU-enabled instance.")
        return

    print(f"Using {world_size} GPUs for final training.")
    # One training process per GPU; evaluation then runs in this parent process.
    mp.spawn(train_process, args=(world_size, train_prompts, train_targets), nprocs=world_size)
    evaluate_model(val_prompts, val_targets)

if __name__ == '__main__':
    main()

Writing train_and_evaluate_final_8bit.py


In [10]:
!python train_and_evaluate_final_8bit.py

2025-06-21 22:48:19.307976: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750546099.330198    2388 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750546099.336952    2388 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for final training.
2025-06-21 22:48:27.689737: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750546107.711649    2456 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1

# Experiment 5: Max Performance (Unquantized)
This experiment aims for maximum performance by not using any quantization, increasing LoRA capacity, and using differential learning rates.

In [11]:
%%writefile train_max_performance_0.5b.py

# This script combines multiple strategies to push the 0.5B model to its performance limit.
# 1. No Quantization: The model is loaded and trained in unquantized bfloat16 (no 4-bit/8-bit weights).
# 2. Max LoRA Capacity: r=32, lora_alpha=64, targeting all linear layers.
# 3. More Epochs: Trains for 5 epochs.
# 4. Differential LR: Uses a lower learning rate for the backbone (5e-5) than for the classification head (3e-4).

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, classification_report

# DDP Imports
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group

from transformers import (
    AutoModelForCausalLM, AutoTokenizer, AutoConfig,
    get_cosine_schedule_with_warmup
)
from peft import PeftModel, get_peft_model, LoraConfig, TaskType

# NOTE(review): presumably set to silence the Hugging Face tokenizers
# parallelism warning once worker processes are spawned — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
# Model id used to load the tokenizer, the model config and the backbone.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# CSV inputs read by main(); their columns are renamed to ['problem', 'target'].
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
# Written by rank 0 at the end of train_process() and read back by evaluate_model().
FINAL_BACKBONE_PATH = './max_perf_unquantized_backbone/'
FINAL_HEAD_PATH = './max_perf_unquantized_head.pt'

NUM_EPOCHS = 5 # STRATEGY: Increased to 5
BATCH_SIZE = 4  # DataLoader batch size in each training process; evaluation uses BATCH_SIZE*2
GRAD_ACCUM_STEPS = 4  # micro-batches accumulated before each optimizer step
MAX_LEN = 512  # every prompt is padded/truncated to this many tokens
SEED = 42  # passed to torch.manual_seed and np.random.seed in main()

# --- Tokenizer Setup ---
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Left padding puts the pad tokens in front, so position -1 (the position
# Net.forward() and evaluate_model() read) holds a prompt token, not padding.
tokenizer.padding_side = 'left'

# --- Dataset Class ---
class MathDataset(Dataset):
    """Map-style dataset pairing prompt strings with optional labels.

    Each item is a dict with a ``"prompt"`` key and, when ``targets`` was
    supplied, a ``"target"`` key taken from the same index.
    """

    def __init__(self, prompts, targets=None):
        self.prompts, self.targets = prompts, targets

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        sample = {"prompt": self.prompts[idx]}
        # Unlabeled datasets return the prompt only.
        if self.targets is None:
            return sample
        sample["target"] = self.targets[idx]
        return sample

# --- Model Definition ---
class Net(nn.Module):
    """Unquantized bfloat16 causal-LM backbone with LoRA adapters and a linear head.

    The head maps the backbone's hidden size to 8 outputs and has no bias.
    """

    def __init__(self, model_path, rank):
        """Build the model.

        Args:
            model_path: Model id/path given to ``from_pretrained``.
            rank: Passed as ``device_map`` when loading the backbone.
        """
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_path)

        # STRATEGY: No quantization; weights are loaded as bfloat16.
        base_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map=rank,
            torch_dtype=torch.bfloat16,
            use_cache=False,
        )

        # STRATEGY: Max LoRA capacity (r=32, alpha=64) on the attention and
        # MLP projection layers.
        lora_settings = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            bias='none',
            inference_mode=False,
            r=32,
            lora_alpha=64,
            lora_dropout=0.05,
        )
        self.backbone = get_peft_model(base_model, lora_settings)
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return head outputs computed from the last position of the final hidden state.

        ``x`` is unpacked as keyword arguments to the backbone (the callers
        pass the tokenizer output).
        """
        outputs = self.backbone(**x, output_hidden_states=True)
        final_layer = outputs.hidden_states[-1]
        # Index -1 along dim 1; presumably the sequence axis — confirm.
        last_position = final_layer[:, -1, :]
        return self.head(last_position)

# --- DDP Setup ---
def ddp_setup(rank, world_size):
    """Join the NCCL process group and select CUDA device ``rank``.

    Args:
        rank: Index of this process; also used as the CUDA device index.
        world_size: Total number of processes in the group.
    """
    # Rendezvous address/port are fixed: localhost:12355.
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'})
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

# --- Training Process ---
def train_process(rank, world_size, train_prompts, train_targets):
    """Train the unquantized LoRA model as one DDP worker and save it from rank 0.

    Used as the ``mp.spawn`` target in ``main()``: each process sets up the
    process group, trains on its ``DistributedSampler`` shard for
    ``NUM_EPOCHS`` epochs with gradient accumulation, and rank 0 writes the
    backbone to ``FINAL_BACKBONE_PATH`` and the head to ``FINAL_HEAD_PATH``.
    The backbone and the head use different learning rates.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Total number of worker processes.
        train_prompts: Prompt strings to train on.
        train_targets: Labels aligned with ``train_prompts``.
    """
    ddp_setup(rank, world_size)

    # mp.spawn starts each worker in a fresh interpreter, so the seeds set in
    # main() do not carry over. Seed here so LoRA/head initialisation and
    # dropout are reproducible across runs.
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    if rank == 0:
        print(f"--- Starting Max Performance training on {world_size} GPUs ---")

    train_dataset = MathDataset(train_prompts, train_targets)
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)

    model = Net(BASE_MODEL_PATH, rank).to(rank)
    if rank == 0:
        # Print model size
        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Trainable model parameters: {total_params/1_000_000:.2f}M")
    model = DDP(model, device_ids=[rank])

    # STRATEGY: Differential learning rates.
    # Match on the top-level attribute prefix rather than a substring: a
    # backbone parameter whose name contains "head" (e.g. an `lm_head`) would
    # otherwise fall into both groups, which the optimizer rejects.
    named_params = list(model.module.named_parameters())
    optimizer_grouped_parameters = [
        {"params": [p for n, p in named_params if n.startswith("backbone.")], "lr": 5e-5},
        {"params": [p for n, p in named_params if n.startswith("head.")], "lr": 3e-4},
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters)

    # One scheduler step per optimizer step (i.e. per full accumulation window).
    steps_per_epoch = len(train_loader) // GRAD_ACCUM_STEPS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=steps_per_epoch * NUM_EPOCHS)
    scaler = torch.amp.GradScaler('cuda')

    for epoch in range(NUM_EPOCHS):
        train_sampler.set_epoch(epoch)
        model.train()
        # Discard gradients left over from an incomplete accumulation window at
        # the end of the previous epoch so they do not leak into this epoch's
        # first optimizer step.
        optimizer.zero_grad()

        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader

        for step, batch in enumerate(pbar):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Scale so the accumulated gradient is the mean over the window.
                loss = loss / GRAD_ACCUM_STEPS
            scaler.scale(loss).backward()
            if (step + 1) % GRAD_ACCUM_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

    if rank == 0:
        print("\nTraining complete. Saving final model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
        print(f"Final model saved to '{FINAL_BACKBONE_PATH}' and '{FINAL_HEAD_PATH}'")

    destroy_process_group()

# --- Evaluation Process ---
def evaluate_model(val_prompts, val_targets):
    """Load the saved unquantized model and print metrics on the validation set.

    Loads the base model in bfloat16, applies the adapters stored at
    ``FINAL_BACKBONE_PATH``, restores the head from ``FINAL_HEAD_PATH``,
    predicts a class for every prompt and prints micro/macro F1 plus a
    per-class classification report.

    Args:
        val_prompts: Prompt strings to classify.
        val_targets: True labels aligned with ``val_prompts``.
    """
    print("\n--- Starting Evaluation of Unquantized Model ---")

    # Load the unquantized base model and apply trained adapters
    base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)

    # Print model size
    total_params = sum(p.numel() for p in base_model.parameters())
    print(f"Loaded base model with {total_params/1_000_000_000:.2f}B parameters.")

    model_peft = PeftModel.from_pretrained(base_model, FINAL_BACKBONE_PATH)

    # Print model size after PEFT
    total_params_peft = sum(p.numel() for p in model_peft.parameters() if p.requires_grad)
    print(f"Trainable PEFT model parameters: {total_params_peft/1_000_000:.2f}M")

    head = nn.Linear(AutoConfig.from_pretrained(BASE_MODEL_PATH).hidden_size, 8, bias=False)
    head.load_state_dict(torch.load(FINAL_HEAD_PATH))
    head.to(model_peft.device)
    model_peft.eval()
    head.eval()

    val_dataset = MathDataset(val_prompts, val_targets)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)
    all_preds = []

    print("Running inference on validation set...")
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(model_peft.device)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                outputs = model_peft(**encodings, output_hidden_states=True)
                # Same read-out as Net.forward(): last position of the final hidden state.
                logits = head(outputs.hidden_states[-1][:, -1, :])
                preds = torch.argmax(logits, dim=1).cpu().tolist()
            all_preds.extend(preds)

    print("\n--- Final Evaluation Results ---")
    target_names = ["0: Alg", "1: Geo", "2: Calc", "3: Prob", "4: Num", "5: Combo", "6: Lin.Alg", "7: Abs.Alg"]
    print(f"\nFinal F1 Score (Micro): {f1_score(val_targets, all_preds, average='micro'):.4f}")
    print(f"Final F1 Score (Macro): {f1_score(val_targets, all_preds, average='macro'):.4f}")
    print("\nFull Classification Report:")
    # Pass the label ids explicitly: without `labels`, classification_report
    # raises ValueError when a class is missing from both the targets and the
    # predictions, because the inferred label count no longer matches
    # target_names.
    class_ids = list(range(len(target_names)))
    print(classification_report(val_targets, all_preds, labels=class_ids, target_names=target_names, zero_division=0))

# --- Launcher Function ---
def main():
    """Load the CSVs, build prompts, train on every visible GPU, then evaluate.

    Returns early, after printing a message, when either CSV is missing or
    when no CUDA device is available.
    """
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    try:
        df_train = pd.read_csv(TRAIN_DATA_PATH)
        df_val = pd.read_csv(VAL_DATA_PATH)
    except FileNotFoundError:
        print(f"Error: Make sure '{TRAIN_DATA_PATH}' and '{VAL_DATA_PATH}' exist.")
        return

    # Both frames get the same two column names, whatever the CSV header said.
    for frame in (df_train, df_val):
        frame.columns = ['problem', 'target']

    prompt_template = """<|im_start|>user
Your task is to classify each Math problem...
Math Problem: {problem}
<|im_end|>
<|im_start|>assistant"""

    def as_prompts(frame):
        return [prompt_template.format(problem=text.strip()) for text in frame['problem']]

    def as_labels(frame):
        return frame['target'].astype(int).tolist()

    train_prompts, train_targets = as_prompts(df_train), as_labels(df_train)
    val_prompts, val_targets = as_prompts(df_val), as_labels(df_val)

    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("No GPUs found. Please run on a GPU-enabled instance.")
        return

    print(f"Using {world_size} GPUs for final training.")
    # One training process per GPU; evaluation then runs in this parent process.
    mp.spawn(train_process, args=(world_size, train_prompts, train_targets), nprocs=world_size)
    evaluate_model(val_prompts, val_targets)

if __name__ == '__main__':
    main()

Writing train_max_performance_0.5b.py


In [12]:
!python train_max_performance_0.5b.py

2025-06-21 23:09:32.684389: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750547372.706225    2891 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750547372.712873    2891 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for final training.
2025-06-21 23:09:41.024457: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750547381.046964    2959 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1

# Experiment 6: "Low and Slow" Maximum Performance (4-bit)
This approach uses a slower learning rate and higher LoRA capacity with 4-bit quantization to potentially achieve better results.

In [13]:
%%writefile train_low_and_slow_4bit.py

# STRATEGY: Maximize 4-bit model performance with a "low and slow" approach.
# 1. Quantization: 4-bit
# 2. Max LoRA Capacity: r=32, lora_alpha=64.
# 3. More Epochs: Trains for 5 epochs.
# 4. Lower Learning Rate: Uses a smaller learning rate of 5e-5.

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, classification_report

# DDP Imports
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group

from transformers import (
    AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig,
    get_cosine_schedule_with_warmup
)
from peft import PeftModel, get_peft_model, LoraConfig, TaskType

# NOTE(review): presumably set to silence the Hugging Face tokenizers
# parallelism warning once worker processes are spawned — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
# Model id used to load the tokenizer, the model config and the backbone.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# CSV inputs read by main(); their columns are renamed to ['problem', 'target'].
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
# Written by rank 0 at the end of train_process() and read back by evaluate_model().
FINAL_BACKBONE_PATH = './low_slow_4bit_backbone/'
FINAL_HEAD_PATH = './low_slow_4bit_head.pt'

NUM_EPOCHS = 5
BATCH_SIZE = 4  # DataLoader batch size in each training process; evaluation uses BATCH_SIZE*2
LEARNING_RATE = 5e-5  # AdamW learning rate used in train_process()
GRAD_ACCUM_STEPS = 4  # micro-batches accumulated before each optimizer step
MAX_LEN = 512  # every prompt is padded/truncated to this many tokens
SEED = 42  # passed to torch.manual_seed and np.random.seed in main()

# --- Tokenizer Setup ---
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Left padding puts the pad tokens in front, so position -1 (the position
# Net.forward() and evaluate_model() read) holds a prompt token, not padding.
tokenizer.padding_side = 'left'

# --- Dataset Class ---
class MathDataset(Dataset):
    """Map-style dataset pairing prompt strings with optional labels.

    Each item is a dict with a ``"prompt"`` key and, when ``targets`` was
    supplied, a ``"target"`` key taken from the same index.
    """

    def __init__(self, prompts, targets=None):
        self.prompts, self.targets = prompts, targets

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        sample = {"prompt": self.prompts[idx]}
        # Unlabeled datasets return the prompt only.
        if self.targets is None:
            return sample
        sample["target"] = self.targets[idx]
        return sample

# --- Model Definition ---
class Net(nn.Module):
    """4-bit (NF4) quantized causal-LM backbone with LoRA adapters and a linear head.

    The head maps the backbone's hidden size to 8 outputs and has no bias.
    """

    def __init__(self, model_path, rank):
        """Build the model.

        Args:
            model_path: Model id/path given to ``from_pretrained``.
            rank: Passed as ``device_map`` when loading the backbone.
        """
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_path)

        # 4-bit NF4 with double quantization; compute dtype is bfloat16.
        quantization = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        base_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=quantization,
            device_map=rank,
            torch_dtype=torch.bfloat16,
            use_cache=False,
        )

        # LoRA (r=32, alpha=64) on the attention and MLP projection layers.
        lora_settings = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            bias='none',
            inference_mode=False,
            r=32,
            lora_alpha=64,
            lora_dropout=0.05,
        )
        self.backbone = get_peft_model(base_model, lora_settings)
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return head outputs computed from the last position of the final hidden state.

        ``x`` is unpacked as keyword arguments to the backbone (the callers
        pass the tokenizer output).
        """
        outputs = self.backbone(**x, output_hidden_states=True)
        final_layer = outputs.hidden_states[-1]
        # Index -1 along dim 1; presumably the sequence axis — confirm.
        last_position = final_layer[:, -1, :]
        return self.head(last_position)

# --- DDP Setup ---
def ddp_setup(rank, world_size):
    """Join the NCCL process group and select CUDA device ``rank``.

    Args:
        rank: Index of this process; also used as the CUDA device index.
        world_size: Total number of processes in the group.
    """
    # Rendezvous address/port are fixed: localhost:12355.
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'})
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

# --- Training Process ---
def train_process(rank, world_size, train_prompts, train_targets):
    """Train the 4-bit "low and slow" LoRA model as one DDP worker and save it from rank 0.

    Used as the ``mp.spawn`` target in ``main()``: each process sets up the
    process group, trains on its ``DistributedSampler`` shard for
    ``NUM_EPOCHS`` epochs with gradient accumulation, and rank 0 writes the
    backbone to ``FINAL_BACKBONE_PATH`` and the head to ``FINAL_HEAD_PATH``.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Total number of worker processes.
        train_prompts: Prompt strings to train on.
        train_targets: Labels aligned with ``train_prompts``.
    """
    ddp_setup(rank, world_size)

    # mp.spawn starts each worker in a fresh interpreter, so the seeds set in
    # main() do not carry over. Seed here so LoRA/head initialisation and
    # dropout are reproducible across runs.
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    if rank == 0:
        print(f"--- Starting 'Low and Slow' 4-bit training on {world_size} GPUs ---")

    train_dataset = MathDataset(train_prompts, train_targets)
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)

    model = Net(BASE_MODEL_PATH, rank).to(rank)
    if rank == 0:
        # Print model size
        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Trainable model parameters: {total_params/1_000_000:.2f}M")
    model = DDP(model, device_ids=[rank])

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    # One scheduler step per optimizer step (i.e. per full accumulation window).
    steps_per_epoch = len(train_loader) // GRAD_ACCUM_STEPS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=steps_per_epoch * NUM_EPOCHS)
    scaler = torch.amp.GradScaler('cuda')

    for epoch in range(NUM_EPOCHS):
        train_sampler.set_epoch(epoch)
        model.train()
        # Discard gradients left over from an incomplete accumulation window at
        # the end of the previous epoch so they do not leak into this epoch's
        # first optimizer step.
        optimizer.zero_grad()

        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader

        for step, batch in enumerate(pbar):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Scale so the accumulated gradient is the mean over the window.
                loss = loss / GRAD_ACCUM_STEPS
            scaler.scale(loss).backward()
            if (step + 1) % GRAD_ACCUM_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

    if rank == 0:
        print("\nTraining complete. Saving final 4-bit model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
        print(f"Final 4-bit model saved to '{FINAL_BACKBONE_PATH}' and '{FINAL_HEAD_PATH}'")

    destroy_process_group()

# --- Evaluation Process ---
def evaluate_model(val_prompts, val_targets):
    """Load the saved 4-bit model and print metrics on the validation set.

    Loads the base model in 4-bit (NF4), applies the adapters stored at
    ``FINAL_BACKBONE_PATH``, restores the head from ``FINAL_HEAD_PATH``,
    predicts a class for every prompt and prints micro/macro F1 plus a
    per-class classification report.

    Args:
        val_prompts: Prompt strings to classify.
        val_targets: True labels aligned with ``val_prompts``.
    """
    print("\n--- Starting Evaluation of 4-bit Model ---")
    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
    base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, quantization_config=bnb_config, device_map="auto", torch_dtype=torch.bfloat16)

    # Print model size
    total_params = sum(p.numel() for p in base_model.parameters())
    print(f"Loaded base model with {total_params/1_000_000_000:.2f}B parameters.")

    model_peft = PeftModel.from_pretrained(base_model, FINAL_BACKBONE_PATH)

    # Print model size after PEFT
    total_params_peft = sum(p.numel() for p in model_peft.parameters() if p.requires_grad)
    print(f"Trainable PEFT model parameters: {total_params_peft/1_000_000:.2f}M")

    head = nn.Linear(AutoConfig.from_pretrained(BASE_MODEL_PATH).hidden_size, 8, bias=False)
    head.load_state_dict(torch.load(FINAL_HEAD_PATH))
    head.to(model_peft.device)
    model_peft.eval()
    head.eval()

    val_dataset = MathDataset(val_prompts, val_targets)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)
    all_preds = []

    print("Running inference on validation set...")
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(model_peft.device)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                outputs = model_peft(**encodings, output_hidden_states=True)
                # Same read-out as Net.forward(): last position of the final hidden state.
                logits = head(outputs.hidden_states[-1][:, -1, :])
                preds = torch.argmax(logits, dim=1).cpu().tolist()
            all_preds.extend(preds)

    print("\n--- Final 4-bit Evaluation Results ---")
    target_names = ["0: Alg", "1: Geo", "2: Calc", "3: Prob", "4: Num", "5: Combo", "6: Lin.Alg", "7: Abs.Alg"]
    print(f"\nFinal F1 Score (Micro): {f1_score(val_targets, all_preds, average='micro'):.4f}")
    print(f"Final F1 Score (Macro): {f1_score(val_targets, all_preds, average='macro'):.4f}")
    print("\nFull Classification Report:")
    # Pass the label ids explicitly: without `labels`, classification_report
    # raises ValueError when a class is missing from both the targets and the
    # predictions, because the inferred label count no longer matches
    # target_names.
    class_ids = list(range(len(target_names)))
    print(classification_report(val_targets, all_preds, labels=class_ids, target_names=target_names, zero_division=0))

# --- Launcher Function ---
def main():
    """Load the CSVs, build prompts, train on every visible GPU, then evaluate.

    Returns early, after printing a message, when either CSV is missing or
    when no CUDA device is available.
    """
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    try:
        df_train = pd.read_csv(TRAIN_DATA_PATH)
        df_val = pd.read_csv(VAL_DATA_PATH)
    except FileNotFoundError:
        print(f"Error: Make sure '{TRAIN_DATA_PATH}' and '{VAL_DATA_PATH}' exist.")
        return

    # Both frames get the same two column names, whatever the CSV header said.
    for frame in (df_train, df_val):
        frame.columns = ['problem', 'target']

    prompt_template = """<|im_start|>user
Your task is to classify each Math problem...
Math Problem: {problem}
<|im_end|>
<|im_start|>assistant"""

    def as_prompts(frame):
        return [prompt_template.format(problem=text.strip()) for text in frame['problem']]

    def as_labels(frame):
        return frame['target'].astype(int).tolist()

    train_prompts, train_targets = as_prompts(df_train), as_labels(df_train)
    val_prompts, val_targets = as_prompts(df_val), as_labels(df_val)

    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("No GPUs found.")
        return

    print(f"Using {world_size} GPUs for final training.")
    # One training process per GPU; evaluation then runs in this parent process.
    mp.spawn(train_process, args=(world_size, train_prompts, train_targets), nprocs=world_size)
    evaluate_model(val_prompts, val_targets)

if __name__ == '__main__':
    main()

Writing train_low_and_slow_4bit.py


In [14]:
!python train_low_and_slow_4bit.py

2025-06-21 23:23:02.473817: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750548182.496257    3395 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750548182.503083    3395 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for final training.
2025-06-21 23:23:10.834992: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750548190.856744    3463 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1

# Experiment 7: "Low and Slow" Maximum Performance (8-bit)
This is the 8-bit version of the "low and slow" strategy.

In [15]:
%%writefile train_low_and_slow_8bit.py

# STRATEGY: Maximize 8-bit model performance with a "low and slow" approach.
# 1. Quantization: 8-bit
# 2. Max LoRA Capacity: r=32, lora_alpha=64.
# 3. More Epochs: Trains for 5 epochs.
# 4. Lower Learning Rate: Uses a smaller learning rate of 5e-5.

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, classification_report

# DDP Imports
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group

from transformers import (
    AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig,
    get_cosine_schedule_with_warmup
)
from peft import PeftModel, get_peft_model, LoraConfig, TaskType

# Turn off the tokenizers library's own parallelism before any tokenizer is created.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
# Base checkpoint, input CSV files, and output locations for the LoRA adapter and classifier head.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
FINAL_BACKBONE_PATH = './low_slow_8bit_backbone/'
FINAL_HEAD_PATH = './low_slow_8bit_head.pt'

# Training hyperparameters.
NUM_EPOCHS = 5
BATCH_SIZE = 4  # batch size of each process's DataLoader (validation uses BATCH_SIZE*2)
LEARNING_RATE = 5e-5
GRAD_ACCUM_STEPS = 4  # micro-batches accumulated before each optimizer step
MAX_LEN = 512  # every prompt is padded/truncated to this many tokens
SEED = 42

# --- Tokenizer Setup ---
# Shared module-level tokenizer used by both training and evaluation.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
if tokenizer.pad_token is None:
    # Fall back to the EOS token so that padding is possible.
    tokenizer.pad_token = tokenizer.eos_token
# Left padding keeps the real tokens at the end of each row; the classifier reads position -1.
tokenizer.padding_side = 'left'

# --- Dataset Class ---
class MathDataset(Dataset):
    """Map-style dataset of prompt strings with optional labels.

    Each item is a dict holding the prompt under "prompt" and, when targets
    were supplied, the label at the same index under "target".
    """

    def __init__(self, prompts, targets=None):
        self.prompts = prompts
        self.targets = targets

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        prompt = self.prompts[idx]
        if self.targets is None:
            # Unlabelled data: only the prompt is returned.
            return {"prompt": prompt}
        return {"prompt": prompt, "target": self.targets[idx]}

# --- Model Definition ---
class Net(nn.Module):
    """8-bit quantized causal-LM backbone with LoRA adapters and a linear 8-way classification head."""

    def __init__(self, model_path, rank):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_path)

        # Backbone is loaded with bitsandbytes 8-bit quantization onto device `rank`.
        quant_cfg = BitsAndBytesConfig(load_in_8bit=True)
        base_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=quant_cfg,
            device_map=rank,
            torch_dtype=torch.bfloat16,
            use_cache=False,
        )

        # LoRA (r=32, alpha=64) is attached to every attention and MLP projection.
        lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
        lora_cfg = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=lora_targets,
            bias='none',
            inference_mode=False,
            r=32,
            lora_alpha=64,
            lora_dropout=0.05,
        )
        self.backbone = get_peft_model(base_model, lora_cfg)

        # Bias-free linear layer mapping a hidden state to 8 class logits.
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return class logits computed from the last position of the final hidden layer.

        `x` is unpacked as keyword arguments into the backbone call.
        """
        result = self.backbone(**x, output_hidden_states=True)
        final_layer = result.hidden_states[-1]
        return self.head(final_layer[:, -1, :])

# --- DDP Setup ---
def ddp_setup(rank, world_size):
    """Join the NCCL process group as `rank` of `world_size` and select CUDA device `rank`.

    The rendezvous address is fixed to localhost:12355.
    """
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'})
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

# --- Training Process ---
def train_process(rank, world_size, train_prompts, train_targets):
    """Per-GPU worker: fine-tune the 8-bit LoRA model under DDP and save it from rank 0.

    Args:
        rank: Index of this worker; also used as its CUDA device index.
        world_size: Total number of worker processes.
        train_prompts: Prompt strings for the whole training set.
        train_targets: Integer class labels aligned with ``train_prompts``.
    """
    ddp_setup(rank, world_size)
    # Workers started by mp.spawn are fresh interpreters, so the seeding done in
    # main() does not carry over; seed here so model initialisation is reproducible.
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    if rank == 0: print(f"--- Starting 'Low and Slow' 8-bit training on {world_size} GPUs ---")

    train_dataset = MathDataset(train_prompts, train_targets)
    # Each rank sees its own shard of the data; drop_last keeps batch shapes uniform.
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)

    model = Net(BASE_MODEL_PATH, rank).to(rank)
    if rank == 0:
        # Print model size
        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Trainable model parameters: {total_params/1_000_000:.2f}M")
    model = DDP(model, device_ids=[rank])

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    # One scheduler step per optimizer step: full accumulation windows per epoch times epochs.
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=(len(train_loader) // GRAD_ACCUM_STEPS) * NUM_EPOCHS)
    scaler = torch.amp.GradScaler('cuda')

    for epoch in range(NUM_EPOCHS):
        # Reshuffle the sampler's shard for this epoch.
        train_sampler.set_epoch(epoch)
        model.train()
        # The step counter restarts every epoch, so a trailing partial accumulation
        # window never reaches optimizer.step(). Drop those gradients here instead of
        # letting them inflate the first update of the next epoch.
        optimizer.zero_grad()
        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader

        for step, batch in enumerate(pbar):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Scale so the accumulated gradient is the mean over the window.
                loss = loss / GRAD_ACCUM_STEPS
            scaler.scale(loss).backward()
            if (step + 1) % GRAD_ACCUM_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

    if rank == 0:
        # Only rank 0 writes the adapter and head to disk.
        print("\nTraining complete. Saving final 8-bit model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
        print(f"Final 8-bit model saved to '{FINAL_BACKBONE_PATH}' and '{FINAL_HEAD_PATH}'")

    destroy_process_group()

# --- Evaluation Process ---
def evaluate_model(val_prompts, val_targets):
    """Reload the saved 8-bit LoRA backbone and head, then print F1 scores and a per-class report.

    Args:
        val_prompts: Prompt strings for the validation set.
        val_targets: Integer class labels aligned with ``val_prompts``.
    """
    print("\n--- Starting Evaluation of 8-bit Model ---")
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, quantization_config=bnb_config, device_map="auto", torch_dtype=torch.bfloat16)

    # Print model size
    total_params = sum(p.numel() for p in base_model.parameters())
    print(f"Loaded base model with {total_params/1_000_000_000:.2f}B parameters.")

    # Attach the LoRA adapter written by the training workers.
    model_peft = PeftModel.from_pretrained(base_model, FINAL_BACKBONE_PATH)

    # Print model size after PEFT
    total_params_peft = sum(p.numel() for p in model_peft.parameters() if p.requires_grad)
    print(f"Trainable PEFT model parameters: {total_params_peft/1_000_000:.2f}M")

    head = nn.Linear(AutoConfig.from_pretrained(BASE_MODEL_PATH).hidden_size, 8, bias=False)
    # Load onto CPU first so the checkpoint does not depend on the device it was saved
    # from, then move the head next to the backbone.
    head.load_state_dict(torch.load(FINAL_HEAD_PATH, map_location='cpu'))
    head.to(model_peft.device)
    model_peft.eval()
    head.eval()

    val_dataset = MathDataset(val_prompts, val_targets)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)
    all_preds = []

    print("Running inference on validation set...")
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(model_peft.device)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                outputs = model_peft(**encodings, output_hidden_states=True)
                # Same readout as Net.forward: last position of the final hidden layer.
                logits = head(outputs.hidden_states[-1][:, -1, :])
                preds = torch.argmax(logits, dim=1).cpu().tolist()
            all_preds.extend(preds)

    print("\n--- Final 8-bit Evaluation Results ---")
    target_names = ["0: Alg", "1: Geo", "2: Calc", "3: Prob", "4: Num", "5: Combo", "6: Lin.Alg", "7: Abs.Alg"]
    # Pin the label set explicitly: without `labels`, classification_report infers the
    # classes from the data and raises when fewer than len(target_names) are present.
    class_ids = list(range(len(target_names)))
    print(f"\nFinal F1 Score (Micro): {f1_score(val_targets, all_preds, average='micro'):.4f}")
    print(f"Final F1 Score (Macro): {f1_score(val_targets, all_preds, average='macro'):.4f}")
    print("\nFull Classification Report:")
    print(classification_report(val_targets, all_preds, labels=class_ids, target_names=target_names))

# --- Launcher Function ---
def main():
    """Load the CSV splits, build chat-style prompts, train on every visible GPU, then evaluate."""
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    try:
        df_train = pd.read_csv(TRAIN_DATA_PATH)
        df_val = pd.read_csv(VAL_DATA_PATH)
    except FileNotFoundError:
        print(f"Error: Make sure '{TRAIN_DATA_PATH}' and '{VAL_DATA_PATH}' exist.")
        return

    # Both files are treated as two-column tables: problem text, then class label.
    df_train.columns = ['problem', 'target']
    df_val.columns = ['problem', 'target']

    prompt_template = (
        "<|im_start|>user\n"
        "Your task is to classify each Math problem...\n"
        "Math Problem: {problem}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant"
    )

    def to_prompts(frame):
        # Wrap each stripped problem statement in the chat template.
        return [prompt_template.format(problem=text.strip()) for text in frame['problem']]

    def to_labels(frame):
        return frame['target'].astype(int).tolist()

    train_prompts = to_prompts(df_train)
    train_targets = to_labels(df_train)
    val_prompts = to_prompts(df_val)
    val_targets = to_labels(df_val)

    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("No GPUs found.")
        return

    print(f"Using {world_size} GPUs for final training.")
    # One training worker per GPU; evaluation then runs in this parent process.
    mp.spawn(train_process, args=(world_size, train_prompts, train_targets), nprocs=world_size)
    evaluate_model(val_prompts, val_targets)

# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

Writing train_low_and_slow_8bit.py


In [16]:
!python train_low_and_slow_8bit.py

2025-06-21 23:39:32.390555: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750549172.413292    3898 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750549172.420135    3898 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for final training.
2025-06-21 23:39:40.764022: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750549180.786216    3966 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1

# Experiment 8: Dynamic Adaptive Quantization (Novelty)

In [None]:
%%writefile train_dynamic_adaptive_full.py


import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from collections import defaultdict

# DDP, Transformers, PEFT imports
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, classification_report
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, AutoConfig,
    get_cosine_schedule_with_warmup
)
from peft import PeftModel, get_peft_model, LoraConfig, TaskType
import bitsandbytes as bnb
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download

# Turn off the tokenizers library's own parallelism before any tokenizer is created.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
# Base checkpoint, input CSV files, and output locations for the LoRA adapter and classifier head.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
FINAL_BACKBONE_PATH = './dynamic_adaptive_backbone/'
FINAL_HEAD_PATH = './dynamic_adaptive_head.pt'

# Training hyperparameters.
NUM_EPOCHS = 5
BATCH_SIZE = 4  # batch size of each process's DataLoader (validation uses BATCH_SIZE*2)
LEARNING_RATE = 2e-4
GRAD_ACCUM_STEPS = 4  # micro-batches accumulated before each optimizer step
MAX_LEN = 512  # every prompt is padded/truncated to this many tokens
SEED = 42

# --- Tokenizer Setup ---
# Shared module-level tokenizer; falls back to EOS as the pad token when none is defined.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
# Left padding keeps the real tokens at the end of each row; the classifier reads position -1.
tokenizer.padding_side = 'left'

# --- Dataset Class ---
class MathDataset(Dataset):
    """Indexable collection of prompts, optionally paired with labels.

    Items are dicts: always a "prompt" entry, plus a "target" entry when
    targets were given at construction time.
    """

    def __init__(self, prompts, targets=None):
        self.prompts = prompts
        self.targets = targets

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        sample = dict(prompt=self.prompts[idx])
        has_labels = self.targets is not None
        if has_labels:
            sample["target"] = self.targets[idx]
        return sample

# --- Model Definition with Dynamic Quantization ---
class Net(nn.Module):
    """Causal-LM backbone with mixed 8-bit/4-bit bitsandbytes linear layers, LoRA adapters, and a linear 8-way head.

    Construction steps:
      1. Group the architecture's nn.Linear layers by weight element count.
      2. Assign 8-bit to the two largest size groups and 4-bit to the rest.
      3. Rebuild the model without weights, swapping each linear layer for the
         matching bitsandbytes module.
      4. Load the checkpoint into that skeleton on device `rank`, then wrap with LoRA.
    """
    def __init__(self, model_path, rank):
        super(Net, self).__init__()
        self.config = AutoConfig.from_pretrained(model_path)
        
        if rank == 0: print("--- Determining dynamic quantization policy ---")
        # Weight-less copy of the architecture, used only to inspect its linear layers.
        with init_empty_weights():
            meta_model = AutoModelForCausalLM.from_config(self.config)
        
        # Map weight element count (in_features * out_features) -> names of layers with that size.
        layer_sizes = defaultdict(list)
        for name, module in meta_model.named_modules():
            if isinstance(module, nn.Linear):
                size = module.in_features * module.out_features
                layer_sizes[size].append(name)
        
        # Largest sizes first.
        unique_sizes = sorted(layer_sizes.keys(), reverse=True)
        if rank == 0: print(f"Found {len(unique_sizes)} unique linear layer sizes.")

        # Policy: layer name -> bit width. The two largest size groups get 8-bit, all others 4-bit.
        policy_map = {}
        for size in unique_sizes[:2]:
            for layer_name in layer_sizes[size]: policy_map[layer_name] = 8
        for size in unique_sizes[2:]:
            for layer_name in layer_sizes[size]: policy_map[layer_name] = 4
        
        if rank == 0:
            eight_bit_count = sum(1 for v in policy_map.values() if v == 8)
            four_bit_count = sum(1 for v in policy_map.values() if v == 4)
            print(f"Policy determined: {eight_bit_count} layers to 8-bit, {four_bit_count} layers to 4-bit.")
            print("Building custom model...")

        # Second weight-less build; this one is modified in place and later filled from the checkpoint.
        with init_empty_weights():
            model_to_quantize = AutoModelForCausalLM.from_config(self.config)
            
            # Snapshot the module list so children can be replaced while looping.
            for name, module in list(model_to_quantize.named_modules()):
                if isinstance(module, nn.Linear) and name in policy_map:
                    # Build the replacement with the same shape and bias setting as the original layer.
                    if policy_map[name] == 8:
                        new_module = bnb.nn.Linear8bitLt(module.in_features, module.out_features, bias=module.bias is not None, has_fp16_weights=False, threshold=6.0)
                    else:
                        new_module = bnb.nn.Linear4bit(module.in_features, module.out_features, bias=module.bias is not None, compute_dtype=torch.bfloat16, compress_statistics=True, quant_type='nf4')
                    
                    # Nested modules are swapped on their parent; top-level ones directly on the model.
                    # NOTE(review): a top-level nn.Linear (presumably the LM output head) is replaced
                    # here too - confirm that quantizing it is intended.
                    if '.' in name:
                        parent_name, child_name = name.rsplit('.', 1)
                        parent_module = model_to_quantize.get_submodule(parent_name)
                        setattr(parent_module, child_name, new_module)
                    else:
                        setattr(model_to_quantize, name, new_module)
        
        # Mark the model as quantized.
        # NOTE(review): presumably read by downstream library code - confirm which consumer needs it.
        model_to_quantize.is_quantized = True

        if rank == 0: print("Loading weights into adaptively quantized model...")
        # Resolve the hub repo id to a local directory containing the checkpoint files.
        local_model_path = snapshot_download(repo_id=model_path)
        
        # Fill the skeleton from the checkpoint; the device map assigns the whole model to `rank`.
        self.backbone = load_checkpoint_and_dispatch(
            model_to_quantize, local_model_path, device_map={'':rank},
            no_split_module_classes=["Qwen2DecoderLayer"], dtype=torch.bfloat16
        )
        if rank == 0: print("Model loading complete.")

        # LoRA (r=16, alpha=32) on every attention and MLP projection.
        peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], bias='none', inference_mode=False, r=16, lora_alpha=32, lora_dropout=0.05)
        self.backbone = get_peft_model(self.backbone, peft_config)
        # Bias-free linear layer mapping a hidden state to 8 class logits.
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return class logits from the last position of the final hidden layer; `x` is unpacked into the backbone call."""
        outputs = self.backbone(**x, output_hidden_states=True)
        return self.head(outputs.hidden_states[-1][:, -1, :])

def ddp_setup(rank, world_size):
    """Initialise this worker's NCCL process-group membership and bind it to CUDA device `rank`.

    Rendezvous happens on localhost, port 12355.
    """
    rendezvous = {'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'}
    for key, value in rendezvous.items():
        os.environ[key] = value
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def train_and_validate_process(rank, world_size, train_prompts, train_targets, val_prompts, val_targets):
    """Per-GPU worker: train the adaptively quantized model under DDP, validating on rank 0 after each epoch.

    Args:
        rank: Index of this worker; also used as its CUDA device index.
        world_size: Total number of worker processes.
        train_prompts: Prompt strings for the whole training set.
        train_targets: Integer class labels aligned with ``train_prompts``.
        val_prompts: Prompt strings for the validation set.
        val_targets: Integer class labels aligned with ``val_prompts``.

    Raises:
        Exception: Any error raised while building or wrapping the model is re-raised
            after the process group has been torn down.
    """
    ddp_setup(rank, world_size)
    # Workers started by mp.spawn are fresh interpreters, so the seeding done in
    # main() does not carry over; seed here so model initialisation is reproducible.
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    try:
        model = Net(BASE_MODEL_PATH, rank).to(rank)
        if rank == 0:
            # Print model size
            total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
            print(f"Trainable model parameters: {total_params/1_000_000:.2f}M")
        model = DDP(model, device_ids=[rank], find_unused_parameters=True)
    except Exception as e:
        print(f"[GPU {rank}] Failed during model initialization: {e}")
        destroy_process_group()
        # Propagate the failure: returning quietly would let the job exit with status 0
        # and could leave the other ranks blocked waiting for this one.
        raise

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    train_dataset = MathDataset(train_prompts, train_targets)
    # Each rank sees its own shard of the data; drop_last keeps batch shapes uniform.
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)
    # One scheduler step per optimizer step: full accumulation windows per epoch times epochs.
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=(len(train_loader) // GRAD_ACCUM_STEPS) * NUM_EPOCHS)
    scaler = torch.amp.GradScaler('cuda')

    for epoch in range(NUM_EPOCHS):
        model.train()
        train_sampler.set_epoch(epoch)
        # The step counter restarts every epoch, so a trailing partial accumulation
        # window never reaches optimizer.step(). Drop those gradients here instead of
        # letting them inflate the first update of the next epoch.
        optimizer.zero_grad()
        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader

        for step, batch in enumerate(pbar):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Scale so the accumulated gradient is the mean over the window.
                loss = loss / GRAD_ACCUM_STEPS
            scaler.scale(loss).backward()
            if (step + 1) % GRAD_ACCUM_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

        if rank == 0:
            # Validation runs on rank 0 only, through the unwrapped module.
            model.eval()
            val_dataset = MathDataset(val_prompts, val_targets)
            val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)
            all_preds = []
            with torch.no_grad():
                for batch in tqdm(val_loader, desc="Validating"):
                    encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
                    with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                        logits = model.module(encodings)
                        preds = torch.argmax(logits, dim=1).cpu().tolist()
                    all_preds.extend(preds)

            f1_micro = f1_score(val_targets, all_preds, average='micro')
            print(f'  Epoch {epoch+1} | Validation F1-micro: {f1_micro:.4f}')

    if rank == 0:
        # Only rank 0 writes the adapter and head to disk.
        print("\nTraining complete. Saving final model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
        print(f"Final model saved to '{FINAL_BACKBONE_PATH}' and '{FINAL_HEAD_PATH}'")

    destroy_process_group()

def main():
    """Load both CSV splits, build chat-style prompts, and launch one train/validate worker per GPU."""
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    column_names = ['problem', 'target']
    try:
        df_train = pd.read_csv(TRAIN_DATA_PATH)
        df_train.columns = column_names
        df_val = pd.read_csv(VAL_DATA_PATH)
        df_val.columns = column_names
    except FileNotFoundError:
        print(f"Error: Make sure '{TRAIN_DATA_PATH}' and '{VAL_DATA_PATH}' exist.")
        return

    prompt_template = "\n".join([
        "<|im_start|>user",
        "Your task is to classify each Math problem...",
        "Math Problem: {problem}",
        "<|im_end|>",
        "<|im_start|>assistant",
    ])

    # Wrap each stripped problem statement in the chat template; labels become plain ints.
    train_prompts = [prompt_template.format(problem=text.strip()) for text in df_train['problem']]
    train_targets = df_train['target'].astype(int).tolist()
    val_prompts = [prompt_template.format(problem=text.strip()) for text in df_val['problem']]
    val_targets = df_val['target'].astype(int).tolist()

    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("This script requires GPUs.")
        return

    print(f"Using {world_size} GPUs for adaptive training.")
    worker_args = (world_size, train_prompts, train_targets, val_prompts, val_targets)
    mp.spawn(train_and_validate_process, args=worker_args, nprocs=world_size)

# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

Writing train_dynamic_adaptive_full.py


In [18]:
!python train_dynamic_adaptive_full.py

2025-06-22 00:00:51.818655: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750550451.841051    4401 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750550451.847938    4401 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for adaptive training.
2025-06-22 00:01:00.382518: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750550460.404518    4469 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:0

# Experiment 9: Dynamic Adaptive Hybrid Model
This highly experimental approach loads both an 8-bit and a 4-bit copy of the model into memory, then "transplants" the 4-bit versions of selected layers into the 8-bit model to create a hybrid.

In [None]:
%%writefile train_dynamic_adaptive_hybrid.py

# A "Hybrid Model" approach to dynamic adaptive quantization.
# This script loads a full 8-bit and a full 4-bit model, then combines them.


import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from collections import defaultdict
import gc # For garbage collection

# DDP, Transformers, PEFT imports
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, classification_report
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig,
    get_cosine_schedule_with_warmup
)
from peft import PeftModel, get_peft_model, LoraConfig, TaskType
from accelerate import init_empty_weights

# Turn off the tokenizers library's own parallelism before any tokenizer is created.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
# Base checkpoint, input CSV files, and output locations for the LoRA adapter and classifier head.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
FINAL_BACKBONE_PATH = './dynamic_hybrid_backbone/'
FINAL_HEAD_PATH = './dynamic_hybrid_head.pt'

# Training hyperparameters.
NUM_EPOCHS = 5
BATCH_SIZE = 4  # batch size of each process's DataLoader (validation uses BATCH_SIZE*2)
LEARNING_RATE = 2e-4
GRAD_ACCUM_STEPS = 4  # micro-batches accumulated before each optimizer step
MAX_LEN = 512  # every prompt is padded/truncated to this many tokens
SEED = 42

# --- Tokenizer Setup ---
# Shared module-level tokenizer; falls back to EOS as the pad token when none is defined.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
# Left padding keeps the real tokens at the end of each row; the classifier reads position -1.
tokenizer.padding_side = 'left'

# --- Dataset Class ---
class MathDataset(Dataset):
    """Prompt dataset whose items are dicts with "prompt" and, if labels exist, "target"."""

    def __init__(self, prompts, targets=None):
        self.prompts = prompts
        self.targets = targets

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        record = {"prompt": self.prompts[idx]}
        if self.targets is None:
            return record
        record["target"] = self.targets[idx]
        return record

# --- Model Definition with Hybrid Strategy ---
class Net(nn.Module):
    """Hybrid-precision backbone (8-bit model with selected layers replaced by 4-bit ones), LoRA adapters, and a linear 8-way head.

    The quantization policy is computed on rank 0 and broadcast to the other
    ranks, so the process group must be initialised before constructing this.
    """

    def __init__(self, model_path, rank):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_path)

        # Stage 1: Determine the quantization policy on the main process
        policy_map = {}
        if rank == 0:
            print("--- Determining dynamic quantization policy ---")
            # Weight-less copy of the architecture, used only to inspect its linear layers.
            with init_empty_weights():
                meta_model = AutoModelForCausalLM.from_config(self.config)

            # Map weight element count (in_features * out_features) -> layer names.
            layer_sizes = defaultdict(list)
            for name, module in meta_model.named_modules():
                if isinstance(module, nn.Linear):
                    layer_sizes[module.in_features * module.out_features].append(name)

            unique_sizes = sorted(layer_sizes.keys(), reverse=True)
            print(f"Found {len(unique_sizes)} unique linear layer sizes.")

            # The two largest size groups stay 8-bit; every other group is marked 4-bit.
            for size in unique_sizes[:2]:
                for layer_name in layer_sizes[size]: policy_map[layer_name] = 8
            for size in unique_sizes[2:]:
                for layer_name in layer_sizes[size]: policy_map[layer_name] = 4

            eight_bit_count = sum(1 for v in policy_map.values() if v == 8)
            four_bit_count = sum(1 for v in policy_map.values() if v == 4)
            print(f"Policy determined: {eight_bit_count} layers to 8-bit, {four_bit_count} layers to 4-bit.")
            del meta_model
            gc.collect()

        # Broadcast the policy map from rank 0 to all other processes
        policy_obj = [policy_map]
        torch.distributed.broadcast_object_list(policy_obj, src=0)
        policy_map = policy_obj[0]

        # Stage 2: Load two full models
        if rank == 0: print("Loading full 8-bit and 4-bit models (this is memory intensive)...")

        bnb_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
        model_8bit = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config_8bit, device_map=rank, torch_dtype=torch.bfloat16)

        bnb_config_4bit = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
        model_4bit = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config_4bit, device_map=rank, torch_dtype=torch.bfloat16)

        # Stage 3: Perform layer "transplants"
        if rank == 0: print("Creating hybrid model by swapping layers...")
        self.backbone = model_8bit

        # Snapshot the module list first: children are replaced inside the loop, and
        # mutating the tree while walking the live named_modules() generator is fragile.
        for name, _ in list(self.backbone.named_modules()):
            if policy_map.get(name) != 4:
                continue
            source_layer = model_4bit.get_submodule(name)
            if '.' in name:
                parent_name, child_name = name.rsplit('.', 1)
                setattr(self.backbone.get_submodule(parent_name), child_name, source_layer)
            else:
                # Top-level module: there is no parent path to split off.
                setattr(self.backbone, name, source_layer)

        # Stage 4: Clean up and apply PEFT
        # Transplanted layers stay alive through the backbone; the rest of the 4-bit model is released.
        del model_4bit
        gc.collect()
        torch.cuda.empty_cache()
        if rank == 0: print("Hybrid model created. Applying LoRA...")

        # LoRA (r=16, alpha=32) on every attention and MLP projection.
        peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], bias='none', inference_mode=False, r=16, lora_alpha=32, lora_dropout=0.05)
        self.backbone = get_peft_model(self.backbone, peft_config)
        # Bias-free linear layer mapping a hidden state to 8 class logits.
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return class logits from the last position of the final hidden layer; `x` is unpacked into the backbone call."""
        outputs = self.backbone(**x, output_hidden_states=True)
        return self.head(outputs.hidden_states[-1][:, -1, :])

# --- DDP Setup ---
def ddp_setup(rank, world_size):
    """Register this process as `rank` in an NCCL group of `world_size` and make CUDA device `rank` current.

    The master address and port are hard-coded to localhost:12355.
    """
    master_addr, master_port = 'localhost', '12355'
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = master_port
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

# --- Combined Training and Validation Process ---
def train_and_validate_process(rank, world_size, train_prompts, train_targets, val_prompts, val_targets):
    """Per-GPU worker: train the hybrid-precision model under DDP, validating on rank 0 after each epoch.

    Args:
        rank: Index of this worker; also used as its CUDA device index.
        world_size: Total number of worker processes.
        train_prompts: Prompt strings for the whole training set.
        train_targets: Integer class labels aligned with ``train_prompts``.
        val_prompts: Prompt strings for the validation set.
        val_targets: Integer class labels aligned with ``val_prompts``.
    """
    ddp_setup(rank, world_size)
    # Workers started by mp.spawn are fresh interpreters, so the seeding done in
    # main() does not carry over; seed here so model initialisation is reproducible.
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    model = Net(BASE_MODEL_PATH, rank).to(rank)
    if rank == 0:
        # Print model size
        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Trainable model parameters: {total_params/1_000_000:.2f}M")
    model = DDP(model, device_ids=[rank], find_unused_parameters=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    train_dataset = MathDataset(train_prompts, train_targets)
    # Each rank sees its own shard of the data; drop_last keeps batch shapes uniform.
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)
    # One scheduler step per optimizer step: full accumulation windows per epoch times epochs.
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=(len(train_loader) // GRAD_ACCUM_STEPS) * NUM_EPOCHS)
    scaler = torch.amp.GradScaler('cuda')

    for epoch in range(NUM_EPOCHS):
        model.train()
        train_sampler.set_epoch(epoch)
        # The step counter restarts every epoch, so a trailing partial accumulation
        # window never reaches optimizer.step(). Drop those gradients here instead of
        # letting them inflate the first update of the next epoch.
        optimizer.zero_grad()
        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader

        for step, batch in enumerate(pbar):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Scale so the accumulated gradient is the mean over the window.
                loss = loss / GRAD_ACCUM_STEPS
            scaler.scale(loss).backward()
            if (step + 1) % GRAD_ACCUM_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

        if rank == 0:
            # Validation runs on rank 0 only, through the unwrapped module.
            model.eval()
            val_dataset = MathDataset(val_prompts, val_targets)
            val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)
            all_preds = []
            with torch.no_grad():
                for batch in tqdm(val_loader, desc="Validating"):
                    encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
                    with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                        logits = model.module(encodings)
                        preds = torch.argmax(logits, dim=1).cpu().tolist()
                    all_preds.extend(preds)

            f1_micro = f1_score(val_targets, all_preds, average='micro')
            print(f'  Epoch {epoch+1} | Validation F1-micro: {f1_micro:.4f}')

    if rank == 0:
        # Only rank 0 writes the adapter and head to disk.
        print("\nTraining complete. Saving final model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
        print(f"Final model saved to '{FINAL_BACKBONE_PATH}' and '{FINAL_HEAD_PATH}'")

    destroy_process_group()

# --- Launcher Function ---
def main():
    """Load the train/validation CSVs, build prompts, and spawn one worker per GPU.

    Prints a message and returns early when either CSV file is missing or
    when ``torch.cuda.device_count()`` reports no devices.
    """
    # Seed the launcher process's torch and numpy RNGs.
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Read both splits in order (train, then validation), renaming the
    # columns of each frame to 'problem' / 'target' right after it is read.
    frames = []
    try:
        for csv_path in (TRAIN_DATA_PATH, VAL_DATA_PATH):
            frame = pd.read_csv(csv_path)
            frame.columns = ['problem', 'target']
            frames.append(frame)
    except FileNotFoundError:
        print(f"Error: Make sure '{TRAIN_DATA_PATH}' and '{VAL_DATA_PATH}' exist.")
        return
    train_frame, val_frame = frames

    # Chat-style prompt; '{problem}' is filled in per row below.
    prompt_template = (
        "<|im_start|>user\n"
        "Your task is to classify each Math problem...\n"
        "Math Problem: {problem}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant"
    )

    def render(problem_text):
        # Strip surrounding whitespace before inserting into the template.
        return prompt_template.format(problem=problem_text.strip())

    train_prompts = list(map(render, train_frame['problem']))
    train_targets = train_frame['target'].astype(int).tolist()
    val_prompts = list(map(render, val_frame['problem']))
    val_targets = val_frame['target'].astype(int).tolist()

    # One worker process per visible CUDA device; bail out if there are none.
    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("This script requires GPUs.")
        return

    print(f"Using {world_size} GPUs for adaptive training.")
    # mp.spawn passes the process index as the first positional argument,
    # followed by the items of this tuple.
    worker_args = (world_size, train_prompts, train_targets, val_prompts, val_targets)
    mp.spawn(train_and_validate_process, args=worker_args, nprocs=world_size)
# Run the launcher only when this file is executed as a script, not when it
# is imported; main() starts worker processes through mp.spawn.
if __name__ == '__main__':
    main()

Writing train_dynamic_adaptive_hybrid.py


In [20]:
!python train_dynamic_adaptive_hybrid.py

2025-06-22 00:01:48.989356: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750550509.011646    4866 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750550509.018481    4866 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for adaptive training.
2025-06-22 00:01:57.311077: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750550517.332972    4934 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:0