In [1]:
pip install -q torch transformers peft bitsandbytes accelerate pandas scikit-learn numpy tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependen

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the competition training CSV inside a Kaggle notebook.
train_csv_path = '/kaggle/input/classification-of-math-problems-by-kasut-academy/train.csv'

try:
    df = pd.read_csv(train_csv_path)
    # Force a known two-column header regardless of what the file calls its columns.
    df.columns = ['problem', 'target']

    # 90/10 train/validation split, stratified on the label column so both
    # parts keep a similar category distribution; fixed seed for repeatability.
    train_df, val_df = train_test_split(
        df,
        test_size=0.1,
        random_state=42,
        stratify=df['target'],
    )

    # Write both splits (without the index column) into the current working directory.
    for split_df, split_path in (
        (train_df, 'final_train_set.csv'),
        (val_df, 'final_validation_set.csv'),
    ):
        split_df.to_csv(split_path, index=False)

    print("\n".join([
        "Successfully created 'final_train_set.csv' and 'final_validation_set.csv'",
        f"Final training set size: {len(train_df)} rows",
        f"Final validation set size: {len(val_df)} rows",
    ]))

except FileNotFoundError:
    print(f"Error: Could not find the file at '{train_csv_path}'. Please double-check the path.")

Successfully created 'final_train_set.csv' and 'final_validation_set.csv'
Final training set size: 9170 rows
Final validation set size: 1019 rows


# 25%, 50%, and 75% pruning of Qwen2-0.5B (4-bit quantized)

# Experiment 10

In [3]:
%%writefile prune_model_25pct.py

import torch
from torch.nn.utils import prune
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os

# --- Configuration ---
# Switch off parallelism inside the tokenizers library via its environment flag.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Identifier of the dense checkpoint that gets pruned.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# Fraction of weights removed from each pruned layer.
SPARSITY_AMOUNT = 0.25
# Output directory, tagged with the sparsity expressed as a whole percentage.
PRUNED_MODEL_SAVE_DIR = '/kaggle/working/pruned_{}pct_0.5B_model'.format(int(SPARSITY_AMOUNT * 100))

def run_pruning():
    """Prune every ``torch.nn.Linear`` layer of the base model and save the result.

    Loads ``BASE_MODEL_PATH`` in bfloat16, applies L1 unstructured pruning with
    ``amount=SPARSITY_AMOUNT`` to the ``weight`` of each Linear module, makes the
    pruning permanent, and writes the model and tokenizer to
    ``PRUNED_MODEL_SAVE_DIR``. Anything already at that path is deleted first.
    """
    import shutil  # local import: only needed for the clean-up step below

    print(f"--- STAGE 1: Pruning Full-Precision 0.5B Model ({SPARSITY_AMOUNT*100}%) ---")

    if os.path.exists(PRUNED_MODEL_SAVE_DIR):
        print(f"Removing existing directory: {PRUNED_MODEL_SAVE_DIR}")
        # Delete in-process rather than shelling out to `rm -rf` with an
        # interpolated path: no shell quoting issues, and failures raise
        # instead of being silently ignored.
        if os.path.isdir(PRUNED_MODEL_SAVE_DIR) and not os.path.islink(PRUNED_MODEL_SAVE_DIR):
            shutil.rmtree(PRUNED_MODEL_SAVE_DIR)
        else:
            os.remove(PRUNED_MODEL_SAVE_DIR)

    print(f"Loading full-precision model: {BASE_MODEL_PATH}")
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)

    print(f"\nApplying {SPARSITY_AMOUNT*100}% unstructured pruning...")
    # NOTE(review): this matches *every* nn.Linear, which would include the output
    # projection if the model exposes it as one; if that weight is tied to the input
    # embeddings they would be sparsified as well - confirm this is intended.
    for module in model.modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=SPARSITY_AMOUNT)
            # Fold the mask into the weight so a plain `weight` tensor is saved.
            prune.remove(module, 'weight')

    print("\nPruning complete.")
    print(f"\nSaving {SPARSITY_AMOUNT*100}% pruned model to {PRUNED_MODEL_SAVE_DIR}...")
    model.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    tokenizer.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    print("Pruned model saved successfully.")

    # Release the model before the process moves on.
    del model
    gc.collect()
    torch.cuda.empty_cache()

if __name__ == '__main__':
    run_pruning()

Writing prune_model_25pct.py


In [4]:
!python prune_model_25pct.py

--- STAGE 1: Pruning Full-Precision 0.5B Model (25.0%) ---
Loading full-precision model: Qwen/Qwen2-0.5B-Instruct
config.json: 100%|█████████████████████████████| 659/659 [00:00<00:00, 4.80MB/s]
2025-06-22 03:32:44.922234: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750563165.145810     113 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750563165.214078     113 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
mod

In [5]:
%%writefile train_pruned_quantized_25pct_4bit.py

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig, get_cosine_schedule_with_warmup
from peft import get_peft_model, LoraConfig, TaskType

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
SPARSITY_AMOUNT = 0.25
# Every artefact path is tagged with the sparsity expressed as a whole percentage.
PRUNED_MODEL_DIR = f'/kaggle/working/pruned_{int(SPARSITY_AMOUNT*100)}pct_0.5B_model/'
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
FINAL_BACKBONE_PATH = f'/kaggle/working/pruned_q4_{int(SPARSITY_AMOUNT*100)}pct_0.5B_backbone/'
FINAL_HEAD_PATH = f'/kaggle/working/pruned_q4_{int(SPARSITY_AMOUNT*100)}pct_0.5B_head.pt'

NUM_EPOCHS = 5
BATCH_SIZE = 4          # per-process micro-batch size
LEARNING_RATE = 2e-4
GRAD_ACCUM_STEPS = 4    # micro-batches accumulated per optimizer step
MAX_LEN = 512           # prompts are padded/truncated to this many tokens
SEED = 42

# The tokenizer is loaded at import time, i.e. before main() gets a chance to
# check the directory, so report a missing pruned model here with a clear
# message instead of an opaque loader error.
if not os.path.isdir(PRUNED_MODEL_DIR):
    raise SystemExit(f"Error: Pruned model directory not found at '{PRUNED_MODEL_DIR}'. Please run the pruning script first.")

tokenizer = AutoTokenizer.from_pretrained(PRUNED_MODEL_DIR)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Left padding keeps the last real token at sequence index -1, which is the
# position Net.forward feeds to the classification head.
tokenizer.padding_side = 'left'

class MathDataset(Dataset):
    """Map-style dataset of prompts with optional targets.

    Each item is a dict with key ``"prompt"`` and, when targets were supplied,
    key ``"target"``.
    """

    def __init__(self, prompts, targets=None):
        self.prompts = prompts
        self.targets = targets

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        # Unlabelled datasets yield the prompt only.
        if self.targets is None:
            return {"prompt": self.prompts[idx]}
        return {"prompt": self.prompts[idx], "target": self.targets[idx]}

class Net(nn.Module):
    """4-bit quantized causal-LM backbone with LoRA adapters and a linear classification head.

    Args:
        pruned_model_path: Directory of the pruned checkpoint to load.
        rank: Device index the backbone is placed on (passed as ``device_map``).
        num_classes: Number of output classes of the head. Defaults to 8, the
            value that was previously hard-coded.
    """

    def __init__(self, pruned_model_path, rank, num_classes=8):
        super().__init__()
        self.config = AutoConfig.from_pretrained(pruned_model_path)
        # NF4 4-bit weights with bfloat16 compute.
        bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
        self.backbone = AutoModelForCausalLM.from_pretrained(pruned_model_path, quantization_config=bnb_config, device_map=rank, torch_dtype=torch.bfloat16, use_cache=False)
        # LoRA adapters on the attention and MLP projection layers.
        peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], bias='none', inference_mode=False, r=16, lora_alpha=32, lora_dropout=0.05)
        self.backbone = get_peft_model(self.backbone, peft_config)
        self.head = nn.Linear(self.config.hidden_size, num_classes, bias=False)

    def forward(self, x):
        """Return class logits computed from the final hidden state at the last sequence position.

        ``x`` is unpacked as keyword arguments into the backbone (the callers in
        this file pass the tokenizer output).
        """
        outputs = self.backbone(**x, output_hidden_states=True)
        # hidden_states[-1] is the last entry of the returned hidden states;
        # index -1 on the sequence axis selects the final position.
        return self.head(outputs.hidden_states[-1][:, -1, :])

def ddp_setup(rank, world_size):
    """Initialise the NCCL process group for this worker.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Total number of processes in the group.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Select this process's GPU *before* creating the NCCL group so the group
    # is bound to the right device from the start (the order used in the
    # PyTorch DDP tutorial).
    torch.cuda.set_device(rank)
    init_process_group(backend="nccl", rank=rank, world_size=world_size)

def _run_validation(net, val_loader, val_targets, device):
    """Return the micro-averaged F1 of ``net`` over ``val_loader`` (no gradients)."""
    all_preds = []
    with torch.no_grad():
        for v_batch in tqdm(val_loader, desc="Validating"):
            encodings = tokenizer(v_batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(device)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = net(encodings)
            all_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
    return f1_score(val_targets, all_preds, average='micro')


def train_and_validate_process(rank, world_size, train_prompts, train_targets, val_prompts, val_targets):
    """Per-GPU worker: fine-tune ``Net`` with DDP, validate on rank 0, save the result.

    Args:
        rank: Index of this process and of the GPU it uses.
        world_size: Number of worker processes.
        train_prompts, train_targets: Training prompts and their labels.
        val_prompts, val_targets: Validation prompts and their labels.

    After the last epoch rank 0 writes the LoRA backbone to
    ``FINAL_BACKBONE_PATH`` and the head weights to ``FINAL_HEAD_PATH``.
    """
    ddp_setup(rank, world_size)
    # Workers are started by mp.spawn as fresh processes, so the seed set in
    # main() is not in effect here; seed explicitly to make adapter/head
    # initialisation and dropout repeatable.
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    model = Net(PRUNED_MODEL_DIR, rank).to(rank)

    # --- ADDED: Print model size ---
    if rank == 0:
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"\nLoaded pruned, quantized model with {total_params/1_000_000:.2f}M total parameters.")
        print(f"Trainable LoRA parameters: {trainable_params/1_000_000:.2f}M\n")
    # --------------------------------

    model = DDP(model, device_ids=[rank], find_unused_parameters=False)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    train_dataset = MathDataset(train_prompts, train_targets)
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)
    # One optimizer step per full group of GRAD_ACCUM_STEPS micro-batches.
    optimizer_steps_per_epoch = len(train_loader) // GRAD_ACCUM_STEPS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=optimizer_steps_per_epoch * NUM_EPOCHS)
    # NOTE(review): loss scaling is normally only required for float16 autocast;
    # kept here because the original used it with bfloat16 - confirm it can go.
    scaler = torch.amp.GradScaler('cuda')

    # Only rank 0 validates; build its loader once instead of once per epoch.
    val_loader = None
    if rank == 0:
        val_loader = DataLoader(MathDataset(val_prompts, val_targets), batch_size=BATCH_SIZE*2, shuffle=False)

    for epoch in range(NUM_EPOCHS):
        model.train()
        train_sampler.set_epoch(epoch)
        # Drop gradients left over from a trailing partial accumulation group of
        # the previous epoch so they cannot leak into this epoch's first step.
        optimizer.zero_grad()
        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader
        for step, batch in enumerate(pbar):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Scale so the accumulated gradient is the mean over the group.
                loss = loss / GRAD_ACCUM_STEPS
            scaler.scale(loss).backward()
            if (step + 1) % GRAD_ACCUM_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
        if rank == 0:
            model.eval()
            # Call the wrapped module directly: validation runs on rank 0 only,
            # so it must not go through the DDP wrapper.
            f1_micro = _run_validation(model.module, val_loader, val_targets, rank)
            print(f'  Epoch {epoch+1} | Validation F1-micro: {f1_micro:.4f}')
    if rank == 0:
        print("\nTraining complete. Saving final model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
    destroy_process_group()

def main():
    """Load both CSV splits, build the prompts and spawn one training worker per GPU."""
    if not os.path.exists(PRUNED_MODEL_DIR):
        print("Error: Pruned model directory not found. Please run the pruning script first.")
        return

    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Read both splits first, normalising their headers to the same two names.
    frames = {}
    for split, csv_path in (("train", TRAIN_DATA_PATH), ("val", VAL_DATA_PATH)):
        frame = pd.read_csv(csv_path)
        frame.columns = ['problem', 'target']
        frames[split] = frame

    prompt_template = (
        "<|im_start|>user\n"
        "Your task is to classify each Math problem...\n"
        "Math Problem: {problem}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant"
    )

    def build_prompts(frame):
        # One chat-formatted prompt per problem, with surrounding whitespace removed.
        return [prompt_template.format(problem=text.strip()) for text in frame['problem']]

    def build_targets(frame):
        return frame['target'].astype(int).tolist()

    train_prompts = build_prompts(frames["train"])
    train_targets = build_targets(frames["train"])
    val_prompts = build_prompts(frames["val"])
    val_targets = build_targets(frames["val"])

    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("This script requires GPUs.")
        return

    print(f"Using {world_size} GPUs for training on {SPARSITY_AMOUNT*100}% sparse 4-bit model.")
    # mp.spawn supplies the process index as the first argument of the worker.
    worker_args = (world_size, train_prompts, train_targets, val_prompts, val_targets)
    mp.spawn(train_and_validate_process, args=worker_args, nprocs=world_size)

if __name__ == '__main__':
    main()

Writing train_pruned_quantized_25pct_4bit.py


In [6]:
!python train_pruned_quantized_25pct_4bit.py

2025-06-22 03:33:14.033563: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750563194.055867     186 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750563194.062690     186 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for training on 25.0% sparse 4-bit model.
2025-06-22 03:33:22.501907: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750563202.523541     254 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been reg

# Experiment 11

In [7]:
%%writefile prune_model_50pct.py

import torch
from torch.nn.utils import prune
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os

# --- Configuration ---
# Switch off parallelism inside the tokenizers library via its environment flag.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Identifier of the dense checkpoint that gets pruned.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# Fraction of weights removed from each pruned layer (50%).
SPARSITY_AMOUNT = 0.50
# Output directory, tagged with the sparsity expressed as a whole percentage.
PRUNED_MODEL_SAVE_DIR = '/kaggle/working/pruned_{}pct_0.5B_model'.format(int(SPARSITY_AMOUNT * 100))

def run_pruning():
    """Prune every ``torch.nn.Linear`` layer of the base model and save the result.

    Loads ``BASE_MODEL_PATH`` in bfloat16, applies L1 unstructured pruning with
    ``amount=SPARSITY_AMOUNT`` to the ``weight`` of each Linear module, makes the
    pruning permanent, and writes the model and tokenizer to
    ``PRUNED_MODEL_SAVE_DIR``. Anything already at that path is deleted first.
    """
    import shutil  # local import: only needed for the clean-up step below

    # Derive the percentage shown in messages from the configuration instead of
    # hard-coding it, so the log stays truthful if SPARSITY_AMOUNT is changed.
    # The 'g' format prints 50.0 as "50".
    pct_label = f"{SPARSITY_AMOUNT * 100:g}%"

    print(f"--- STAGE 1: Pruning Full-Precision 0.5B Model ({pct_label}) ---")
    if os.path.exists(PRUNED_MODEL_SAVE_DIR):
        print(f"Removing existing directory: {PRUNED_MODEL_SAVE_DIR}")
        # Delete in-process rather than shelling out to `rm -rf` with an
        # interpolated path: no shell quoting issues, and failures raise
        # instead of being silently ignored.
        if os.path.isdir(PRUNED_MODEL_SAVE_DIR) and not os.path.islink(PRUNED_MODEL_SAVE_DIR):
            shutil.rmtree(PRUNED_MODEL_SAVE_DIR)
        else:
            os.remove(PRUNED_MODEL_SAVE_DIR)
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
    print(f"\nApplying {SPARSITY_AMOUNT*100}% unstructured pruning...")
    # NOTE(review): this matches *every* nn.Linear, which would include the output
    # projection if the model exposes it as one; if that weight is tied to the input
    # embeddings they would be sparsified as well - confirm this is intended.
    for module in model.modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=SPARSITY_AMOUNT)
            # Fold the mask into the weight so a plain `weight` tensor is saved.
            prune.remove(module, 'weight')
    print("\nPruning complete.")
    print(f"\nSaving {pct_label} pruned model to {PRUNED_MODEL_SAVE_DIR}...")
    model.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    tokenizer.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    print(f"Pruned {pct_label} model saved successfully.")
    # Release the model before the process moves on.
    del model
    gc.collect()
    torch.cuda.empty_cache()

if __name__ == '__main__':
    run_pruning()

Writing prune_model_50pct.py


In [8]:
!python prune_model_50pct.py

--- STAGE 1: Pruning Full-Precision 0.5B Model (50%) ---
2025-06-22 03:49:07.041628: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750564147.063657     661 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750564147.070335     661 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.

Applying 50.0% unstructured pruning...

Pruning complete.

Saving 50% pruned model to /kaggle/working/pruned_50pct_0.5B_model...
Pruned 50% model saved successfully.


In [9]:
%%writefile train_pruned_quantized_50pct_4bit.py

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig, get_cosine_schedule_with_warmup
from peft import get_peft_model, LoraConfig, TaskType

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
SPARSITY_AMOUNT = 0.50
# Every artefact path is tagged with the sparsity expressed as a whole percentage.
PRUNED_MODEL_DIR = f'/kaggle/working/pruned_{int(SPARSITY_AMOUNT*100)}pct_0.5B_model/'
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
FINAL_BACKBONE_PATH = f'/kaggle/working/pruned_q4_{int(SPARSITY_AMOUNT*100)}pct_0.5B_backbone/'
FINAL_HEAD_PATH = f'/kaggle/working/pruned_q4_{int(SPARSITY_AMOUNT*100)}pct_0.5B_head.pt'

NUM_EPOCHS = 5
BATCH_SIZE = 4          # per-process micro-batch size
LEARNING_RATE = 2e-4
GRAD_ACCUM_STEPS = 4    # micro-batches accumulated per optimizer step
MAX_LEN = 512           # prompts are padded/truncated to this many tokens
SEED = 42

# The tokenizer is loaded at import time, i.e. before main() gets a chance to
# check the directory, so report a missing pruned model here with a clear
# message instead of an opaque loader error.
if not os.path.isdir(PRUNED_MODEL_DIR):
    raise SystemExit(f"Error: Pruned model directory not found at '{PRUNED_MODEL_DIR}'. Please run the pruning script first.")

tokenizer = AutoTokenizer.from_pretrained(PRUNED_MODEL_DIR)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Left padding keeps the last real token at sequence index -1, which is the
# position Net.forward feeds to the classification head.
tokenizer.padding_side = 'left'

class MathDataset(Dataset):
    """Map-style dataset of prompts with optional targets.

    Each item is a dict with key ``"prompt"`` and, when targets were supplied,
    key ``"target"``.
    """

    def __init__(self, prompts, targets=None):
        self.prompts = prompts
        self.targets = targets

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        # Unlabelled datasets yield the prompt only.
        if self.targets is None:
            return {"prompt": self.prompts[idx]}
        return {"prompt": self.prompts[idx], "target": self.targets[idx]}

class Net(nn.Module):
    """4-bit quantized causal-LM backbone with LoRA adapters and a linear classification head.

    Args:
        pruned_model_path: Directory of the pruned checkpoint to load.
        rank: Device index the backbone is placed on (passed as ``device_map``).
        num_classes: Number of output classes of the head. Defaults to 8, the
            value that was previously hard-coded.
    """

    def __init__(self, pruned_model_path, rank, num_classes=8):
        super().__init__()
        self.config = AutoConfig.from_pretrained(pruned_model_path)
        # NF4 4-bit weights with bfloat16 compute.
        bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
        self.backbone = AutoModelForCausalLM.from_pretrained(pruned_model_path, quantization_config=bnb_config, device_map=rank, torch_dtype=torch.bfloat16, use_cache=False)
        # LoRA adapters on the attention and MLP projection layers.
        peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], bias='none', inference_mode=False, r=16, lora_alpha=32, lora_dropout=0.05)
        self.backbone = get_peft_model(self.backbone, peft_config)
        self.head = nn.Linear(self.config.hidden_size, num_classes, bias=False)

    def forward(self, x):
        """Return class logits computed from the final hidden state at the last sequence position.

        ``x`` is unpacked as keyword arguments into the backbone (the callers in
        this file pass the tokenizer output).
        """
        outputs = self.backbone(**x, output_hidden_states=True)
        # hidden_states[-1] is the last entry of the returned hidden states;
        # index -1 on the sequence axis selects the final position.
        return self.head(outputs.hidden_states[-1][:, -1, :])

def ddp_setup(rank, world_size):
    """Initialise the NCCL process group for this worker.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Total number of processes in the group.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Select this process's GPU *before* creating the NCCL group so the group
    # is bound to the right device from the start (the order used in the
    # PyTorch DDP tutorial).
    torch.cuda.set_device(rank)
    init_process_group(backend="nccl", rank=rank, world_size=world_size)

def _run_validation(net, val_loader, val_targets, device):
    """Return the micro-averaged F1 of ``net`` over ``val_loader`` (no gradients)."""
    all_preds = []
    with torch.no_grad():
        for v_batch in tqdm(val_loader, desc="Validating"):
            encodings = tokenizer(v_batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(device)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = net(encodings)
            all_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
    return f1_score(val_targets, all_preds, average='micro')


def train_and_validate_process(rank, world_size, train_prompts, train_targets, val_prompts, val_targets):
    """Per-GPU worker: fine-tune ``Net`` with DDP, validate on rank 0, save the result.

    Args:
        rank: Index of this process and of the GPU it uses.
        world_size: Number of worker processes.
        train_prompts, train_targets: Training prompts and their labels.
        val_prompts, val_targets: Validation prompts and their labels.

    After the last epoch rank 0 writes the LoRA backbone to
    ``FINAL_BACKBONE_PATH`` and the head weights to ``FINAL_HEAD_PATH``.
    """
    ddp_setup(rank, world_size)
    # Workers are started by mp.spawn as fresh processes, so the seed set in
    # main() is not in effect here; seed explicitly to make adapter/head
    # initialisation and dropout repeatable.
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    model = Net(PRUNED_MODEL_DIR, rank).to(rank)

    # --- ADDED: Print model size ---
    if rank == 0:
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"\nLoaded pruned, quantized model with {total_params/1_000_000:.2f}M total parameters.")
        print(f"Trainable LoRA parameters: {trainable_params/1_000_000:.2f}M\n")
    # --------------------------------

    model = DDP(model, device_ids=[rank], find_unused_parameters=False)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    train_dataset = MathDataset(train_prompts, train_targets)
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)
    # One optimizer step per full group of GRAD_ACCUM_STEPS micro-batches.
    optimizer_steps_per_epoch = len(train_loader) // GRAD_ACCUM_STEPS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=optimizer_steps_per_epoch * NUM_EPOCHS)
    # NOTE(review): loss scaling is normally only required for float16 autocast;
    # kept here because the original used it with bfloat16 - confirm it can go.
    scaler = torch.amp.GradScaler('cuda')

    # Only rank 0 validates; build its loader once instead of once per epoch.
    val_loader = None
    if rank == 0:
        val_loader = DataLoader(MathDataset(val_prompts, val_targets), batch_size=BATCH_SIZE*2, shuffle=False)

    for epoch in range(NUM_EPOCHS):
        model.train()
        train_sampler.set_epoch(epoch)
        # Drop gradients left over from a trailing partial accumulation group of
        # the previous epoch so they cannot leak into this epoch's first step.
        optimizer.zero_grad()
        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader
        for step, batch in enumerate(pbar):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Scale so the accumulated gradient is the mean over the group.
                loss = loss / GRAD_ACCUM_STEPS
            scaler.scale(loss).backward()
            if (step + 1) % GRAD_ACCUM_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
        if rank == 0:
            model.eval()
            # Call the wrapped module directly: validation runs on rank 0 only,
            # so it must not go through the DDP wrapper.
            f1_micro = _run_validation(model.module, val_loader, val_targets, rank)
            print(f'  Epoch {epoch+1} | Validation F1-micro: {f1_micro:.4f}')
    if rank == 0:
        print("\nTraining complete. Saving final model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
    destroy_process_group()

def main():
    """Load both CSV splits, build the prompts and spawn one training worker per GPU."""
    if not os.path.exists(PRUNED_MODEL_DIR):
        print("Error: Pruned model directory not found. Please run the pruning script first.")
        return

    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Read both splits first, normalising their headers to the same two names.
    frames = {}
    for split, csv_path in (("train", TRAIN_DATA_PATH), ("val", VAL_DATA_PATH)):
        frame = pd.read_csv(csv_path)
        frame.columns = ['problem', 'target']
        frames[split] = frame

    prompt_template = (
        "<|im_start|>user\n"
        "Your task is to classify each Math problem...\n"
        "Math Problem: {problem}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant"
    )

    def build_prompts(frame):
        # One chat-formatted prompt per problem, with surrounding whitespace removed.
        return [prompt_template.format(problem=text.strip()) for text in frame['problem']]

    def build_targets(frame):
        return frame['target'].astype(int).tolist()

    train_prompts = build_prompts(frames["train"])
    train_targets = build_targets(frames["train"])
    val_prompts = build_prompts(frames["val"])
    val_targets = build_targets(frames["val"])

    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("This script requires GPUs.")
        return

    print(f"Using {world_size} GPUs for training on {SPARSITY_AMOUNT*100}% sparse 4-bit model.")
    # mp.spawn supplies the process index as the first argument of the worker.
    worker_args = (world_size, train_prompts, train_targets, val_prompts, val_targets)
    mp.spawn(train_and_validate_process, args=worker_args, nprocs=world_size)

if __name__ == '__main__':
    main()

Writing train_pruned_quantized_50pct_4bit.py


In [10]:
!python train_pruned_quantized_50pct_4bit.py

2025-06-22 03:49:22.068192: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750564162.090496     733 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750564162.097333     733 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for training on 50.0% sparse 4-bit model.
2025-06-22 03:49:30.239769: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750564170.261215     801 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been reg

# Experiment 12

In [11]:
%%writefile prune_model_75pct.py

import torch
from torch.nn.utils import prune
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os

# --- Configuration ---
# Switch off parallelism inside the tokenizers library via its environment flag.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Identifier of the dense checkpoint that gets pruned.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# Fraction of weights removed from each pruned layer.
SPARSITY_AMOUNT = 0.75
# Output directory, tagged with the sparsity expressed as a whole percentage.
PRUNED_MODEL_SAVE_DIR = '/kaggle/working/pruned_{}pct_0.5B_model'.format(int(SPARSITY_AMOUNT * 100))

def run_pruning():
    """Prune every ``torch.nn.Linear`` layer of the base model and save the result.

    Loads ``BASE_MODEL_PATH`` in bfloat16, applies L1 unstructured pruning with
    ``amount=SPARSITY_AMOUNT`` to the ``weight`` of each Linear module, makes the
    pruning permanent, and writes the model and tokenizer to
    ``PRUNED_MODEL_SAVE_DIR``. Anything already at that path is deleted first.
    """
    import shutil  # local import: only needed for the clean-up step below

    print(f"--- STAGE 1: Pruning Full-Precision 0.5B Model ({SPARSITY_AMOUNT*100}%) ---")
    if os.path.exists(PRUNED_MODEL_SAVE_DIR):
        # Delete in-process rather than shelling out to `rm -rf` with an
        # interpolated path: no shell quoting issues, and failures raise
        # instead of being silently ignored.
        if os.path.isdir(PRUNED_MODEL_SAVE_DIR) and not os.path.islink(PRUNED_MODEL_SAVE_DIR):
            shutil.rmtree(PRUNED_MODEL_SAVE_DIR)
        else:
            os.remove(PRUNED_MODEL_SAVE_DIR)
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
    print(f"\nApplying {SPARSITY_AMOUNT*100}% unstructured pruning...")
    # NOTE(review): this matches *every* nn.Linear, which would include the output
    # projection if the model exposes it as one; if that weight is tied to the input
    # embeddings they would be sparsified as well - confirm this is intended.
    for module in model.modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=SPARSITY_AMOUNT)
            # Fold the mask into the weight so a plain `weight` tensor is saved.
            prune.remove(module, 'weight')
    print("\nPruning complete.")
    print(f"\nSaving {SPARSITY_AMOUNT*100}% pruned model to {PRUNED_MODEL_SAVE_DIR}...")
    model.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    tokenizer.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    # Release the model before the process moves on.
    del model
    gc.collect()
    torch.cuda.empty_cache()

if __name__ == '__main__':
    run_pruning()

Writing prune_model_75pct.py


In [12]:
!python prune_model_75pct.py

--- STAGE 1: Pruning Full-Precision 0.5B Model (75.0%) ---
2025-06-22 04:05:16.043239: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750565116.065619    1208 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750565116.072510    1208 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.

Applying 75.0% unstructured pruning...

Pruning complete.

Saving 75.0% pruned model to /kaggle/working/pruned_75pct_0.5B_model...


In [13]:
%%writefile train_pruned_quantized_75pct_4bit.py

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig, get_cosine_schedule_with_warmup
from peft import get_peft_model, LoraConfig, TaskType

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
SPARSITY_AMOUNT = 0.75
# Every artefact path is tagged with the sparsity expressed as a whole percentage.
PRUNED_MODEL_DIR = f'/kaggle/working/pruned_{int(SPARSITY_AMOUNT*100)}pct_0.5B_model/'
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
FINAL_BACKBONE_PATH = f'/kaggle/working/pruned_q4_{int(SPARSITY_AMOUNT*100)}pct_0.5B_backbone/'
FINAL_HEAD_PATH = f'/kaggle/working/pruned_q4_{int(SPARSITY_AMOUNT*100)}pct_0.5B_head.pt'

NUM_EPOCHS = 5
BATCH_SIZE = 4          # per-process micro-batch size
LEARNING_RATE = 2e-4
GRAD_ACCUM_STEPS = 4    # micro-batches accumulated per optimizer step
MAX_LEN = 512           # prompts are padded/truncated to this many tokens
SEED = 42

# The tokenizer is loaded at import time, i.e. before main() gets a chance to
# check the directory, so report a missing pruned model here with a clear
# message instead of an opaque loader error.
if not os.path.isdir(PRUNED_MODEL_DIR):
    raise SystemExit(f"Error: Pruned model directory not found at '{PRUNED_MODEL_DIR}'. Please run the pruning script first.")

tokenizer = AutoTokenizer.from_pretrained(PRUNED_MODEL_DIR)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Left padding keeps the last real token at sequence index -1, which is the
# position Net.forward feeds to the classification head.
tokenizer.padding_side = 'left'

class MathDataset(Dataset):
    """Map-style dataset of prompts with optional targets.

    Each item is a dict with key ``"prompt"`` and, when targets were supplied,
    key ``"target"``.
    """

    def __init__(self, prompts, targets=None):
        self.prompts = prompts
        self.targets = targets

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        # Unlabelled datasets yield the prompt only.
        if self.targets is None:
            return {"prompt": self.prompts[idx]}
        return {"prompt": self.prompts[idx], "target": self.targets[idx]}

class Net(nn.Module):
    """4-bit quantized causal-LM backbone with LoRA adapters and a linear classification head.

    Args:
        pruned_model_path: Directory of the pruned checkpoint to load.
        rank: Device index the backbone is placed on (passed as ``device_map``).
        num_classes: Number of output classes of the head. Defaults to 8, the
            value that was previously hard-coded.
    """

    def __init__(self, pruned_model_path, rank, num_classes=8):
        super().__init__()
        self.config = AutoConfig.from_pretrained(pruned_model_path)
        # NF4 4-bit weights with bfloat16 compute.
        bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
        self.backbone = AutoModelForCausalLM.from_pretrained(pruned_model_path, quantization_config=bnb_config, device_map=rank, torch_dtype=torch.bfloat16, use_cache=False)
        # LoRA adapters on the attention and MLP projection layers.
        peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], bias='none', inference_mode=False, r=16, lora_alpha=32, lora_dropout=0.05)
        self.backbone = get_peft_model(self.backbone, peft_config)
        self.head = nn.Linear(self.config.hidden_size, num_classes, bias=False)

    def forward(self, x):
        """Return class logits computed from the final hidden state at the last sequence position.

        ``x`` is unpacked as keyword arguments into the backbone (the callers in
        this file pass the tokenizer output).
        """
        outputs = self.backbone(**x, output_hidden_states=True)
        # hidden_states[-1] is the last entry of the returned hidden states;
        # index -1 on the sequence axis selects the final position.
        return self.head(outputs.hidden_states[-1][:, -1, :])

def ddp_setup(rank, world_size):
    """Initialise the NCCL process group for this worker.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Total number of processes in the group.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Select this process's GPU *before* creating the NCCL group so the group
    # is bound to the right device from the start (the order used in the
    # PyTorch DDP tutorial).
    torch.cuda.set_device(rank)
    init_process_group(backend="nccl", rank=rank, world_size=world_size)

def _run_validation(net, val_loader, val_targets, device):
    """Return the micro-averaged F1 of ``net`` over ``val_loader`` (no gradients)."""
    all_preds = []
    with torch.no_grad():
        for v_batch in tqdm(val_loader, desc="Validating"):
            encodings = tokenizer(v_batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(device)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = net(encodings)
            all_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
    return f1_score(val_targets, all_preds, average='micro')


def train_and_validate_process(rank, world_size, train_prompts, train_targets, val_prompts, val_targets):
    """Per-GPU worker: fine-tune ``Net`` with DDP, validate on rank 0, save the result.

    Args:
        rank: Index of this process and of the GPU it uses.
        world_size: Number of worker processes.
        train_prompts, train_targets: Training prompts and their labels.
        val_prompts, val_targets: Validation prompts and their labels.

    After the last epoch rank 0 writes the LoRA backbone to
    ``FINAL_BACKBONE_PATH`` and the head weights to ``FINAL_HEAD_PATH``.
    """
    ddp_setup(rank, world_size)
    # Workers are started by mp.spawn as fresh processes, so the seed set in
    # main() is not in effect here; seed explicitly to make adapter/head
    # initialisation and dropout repeatable.
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    model = Net(PRUNED_MODEL_DIR, rank).to(rank)

    # --- ADDED: Print model size ---
    if rank == 0:
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"\nLoaded pruned, quantized model with {total_params/1_000_000:.2f}M total parameters.")
        print(f"Trainable LoRA parameters: {trainable_params/1_000_000:.2f}M\n")
    # --------------------------------

    model = DDP(model, device_ids=[rank], find_unused_parameters=False)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    train_dataset = MathDataset(train_prompts, train_targets)
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)
    # One optimizer step per full group of GRAD_ACCUM_STEPS micro-batches.
    optimizer_steps_per_epoch = len(train_loader) // GRAD_ACCUM_STEPS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=optimizer_steps_per_epoch * NUM_EPOCHS)
    # NOTE(review): loss scaling is normally only required for float16 autocast;
    # kept here because the original used it with bfloat16 - confirm it can go.
    scaler = torch.amp.GradScaler('cuda')

    # Only rank 0 validates; build its loader once instead of once per epoch.
    val_loader = None
    if rank == 0:
        val_loader = DataLoader(MathDataset(val_prompts, val_targets), batch_size=BATCH_SIZE*2, shuffle=False)

    for epoch in range(NUM_EPOCHS):
        model.train()
        train_sampler.set_epoch(epoch)
        # Drop gradients left over from a trailing partial accumulation group of
        # the previous epoch so they cannot leak into this epoch's first step.
        optimizer.zero_grad()
        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader
        for step, batch in enumerate(pbar):
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Scale so the accumulated gradient is the mean over the group.
                loss = loss / GRAD_ACCUM_STEPS
            scaler.scale(loss).backward()
            if (step + 1) % GRAD_ACCUM_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
        if rank == 0:
            model.eval()
            # Call the wrapped module directly: validation runs on rank 0 only,
            # so it must not go through the DDP wrapper.
            f1_micro = _run_validation(model.module, val_loader, val_targets, rank)
            print(f'  Epoch {epoch+1} | Validation F1-micro: {f1_micro:.4f}')
    if rank == 0:
        print("\nTraining complete. Saving final model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
    destroy_process_group()

def main():
    """Build prompts from the train/validation CSVs and launch one training worker per visible GPU.

    Prints a message and returns without training when the pruned model
    directory does not exist or when no CUDA device is available.
    """
    # Nothing to fine-tune until the pruning stage has written its output.
    if not os.path.exists(PRUNED_MODEL_DIR):
        print(f"Error: Pruned model directory not found. Please run the pruning script first.")
        return

    torch.manual_seed(SEED)
    np.random.seed(SEED)

    df_train = pd.read_csv(TRAIN_DATA_PATH)
    df_train.columns = ['problem', 'target']
    df_val = pd.read_csv(VAL_DATA_PATH)
    df_val.columns = ['problem', 'target']

    prompt_template = """<|im_start|>user
Your task is to classify each Math problem...
Math Problem: {problem}
<|im_end|>
<|im_start|>assistant"""

    def build_prompts(problems):
        """Wrap every (whitespace-stripped) problem text in the chat template."""
        return [prompt_template.format(problem=text.strip()) for text in problems]

    train_prompts = build_prompts(df_train['problem'])
    train_targets = df_train['target'].astype(int).tolist()
    val_prompts = build_prompts(df_val['problem'])
    val_targets = df_val['target'].astype(int).tolist()

    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("This script requires GPUs.")
        return

    print(f"Using {world_size} GPUs for training on {SPARSITY_AMOUNT*100}% sparse 4-bit model.")
    # mp.spawn passes the process index as the first argument, followed by these.
    worker_args = (world_size, train_prompts, train_targets, val_prompts, val_targets)
    mp.spawn(train_and_validate_process, args=worker_args, nprocs=world_size)

# Script entry point: start training only when this file is executed directly.
if __name__ == '__main__':
    main()

Writing train_pruned_quantized_75pct_4bit.py


In [14]:
!python train_pruned_quantized_75pct_4bit.py

2025-06-22 04:05:31.708691: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750565131.731014    1280 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750565131.737826    1280 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for training on 75.0% sparse 4-bit model.
2025-06-22 04:05:39.852467: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750565139.873978    1348 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been reg

# Qwen2-0.5B pruned at 25%, 50%, and 75% sparsity (8-bit)

# Experiment 13

In [15]:
%%writefile prune_model_25pct.py

import torch
from torch.nn.utils import prune
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os

# --- Configuration ---
# Hugging Face model that is loaded and pruned.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# Fraction of each linear layer's weights to prune (passed as `amount` to l1_unstructured).
SPARSITY_AMOUNT = 0.25
# Output directory for the pruned model and tokenizer; the percentage is embedded in the name.
PRUNED_MODEL_SAVE_DIR = f'/kaggle/working/pruned_{int(SPARSITY_AMOUNT*100)}pct_0.5B_model'
# Environment flag for the Hugging Face tokenizers library: turn its parallelism off.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

def run_pruning():
    """Magnitude-prune every nn.Linear weight of the base model and save the result.

    Loads BASE_MODEL_PATH in bfloat16, applies unstructured L1 pruning with
    amount=SPARSITY_AMOUNT to the weight of each linear layer, makes the pruning
    permanent, and writes the model and tokenizer to PRUNED_MODEL_SAVE_DIR,
    replacing whatever was there before.

    Raises:
        OSError: if an existing PRUNED_MODEL_SAVE_DIR cannot be removed.
    """
    import shutil  # function-scope import: only the cleanup step below needs it

    print(f"--- STAGE 1: Pruning Full-Precision 0.5B Model ({SPARSITY_AMOUNT*100}%) ---")

    if os.path.exists(PRUNED_MODEL_SAVE_DIR):
        print(f"Removing existing directory: {PRUNED_MODEL_SAVE_DIR}")
        # Use the standard library instead of shelling out to `rm -rf`: the path is
        # never parsed by a shell, and a failed removal raises here instead of
        # being silently ignored.
        if os.path.isdir(PRUNED_MODEL_SAVE_DIR) and not os.path.islink(PRUNED_MODEL_SAVE_DIR):
            shutil.rmtree(PRUNED_MODEL_SAVE_DIR)
        else:
            os.remove(PRUNED_MODEL_SAVE_DIR)

    print(f"Loading full-precision model: {BASE_MODEL_PATH}")
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)

    print(f"\nApplying {SPARSITY_AMOUNT*100}% unstructured pruning...")
    # NOTE(review): this filter also matches the output projection (`lm_head`) if the
    # model exposes it as nn.Linear; if that weight is tied to the input embeddings,
    # the embeddings are pruned as well - confirm this is intended.
    for _name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=SPARSITY_AMOUNT)
            # Fold the mask into the weight so a plain `weight` tensor is saved.
            prune.remove(module, 'weight')

    print("\nPruning complete.")
    print(f"\nSaving {SPARSITY_AMOUNT*100}% pruned model to {PRUNED_MODEL_SAVE_DIR}...")
    model.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    tokenizer.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    print("Pruned model saved successfully.")

    # Drop the model and release cached GPU memory before the script exits.
    del model
    gc.collect()
    torch.cuda.empty_cache()

# Script entry point: run the pruning stage only when this file is executed directly.
if __name__ == '__main__':
    run_pruning()

Overwriting prune_model_25pct.py


In [16]:
# Execute the script
!python prune_model_25pct.py

--- STAGE 1: Pruning Full-Precision 0.5B Model (25.0%) ---
Removing existing directory: /kaggle/working/pruned_25pct_0.5B_model
Loading full-precision model: Qwen/Qwen2-0.5B-Instruct
2025-06-22 04:21:28.563960: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750566088.586182    1755 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750566088.592876    1755 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.

Applying 25.0% unstructured pruning...

Pruning complete.

Saving 25.0% pruned model to /kaggle/working/pruned_25pct_0.5B_model...
Pr

In [17]:
%%writefile train_pruned_quantized_8bit_25pct.py

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig, get_cosine_schedule_with_warmup
from peft import get_peft_model, LoraConfig, TaskType

# Environment flag for the Hugging Face tokenizers library: turn its parallelism off.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
# Sparsity level of the pruned checkpoint; here it only selects paths and appears in log output.
SPARSITY_AMOUNT = 0.25
# Directory holding the pruned model and tokenizer that this script loads.
PRUNED_MODEL_DIR = f'/kaggle/working/pruned_{int(SPARSITY_AMOUNT*100)}pct_0.5B_model/'
# CSV splits read by main(); their two columns are renamed to problem/target.
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
# Where rank 0 saves the fine-tuned backbone and the classification head's state_dict.
FINAL_BACKBONE_PATH = f'/kaggle/working/pruned_q8_{int(SPARSITY_AMOUNT*100)}pct_0.5B_backbone/'
FINAL_HEAD_PATH = f'/kaggle/working/pruned_q8_{int(SPARSITY_AMOUNT*100)}pct_0.5B_head.pt'

NUM_EPOCHS = 5          # passes over the training set
BATCH_SIZE = 4          # per-process training batch size; validation uses BATCH_SIZE*2
LEARNING_RATE = 2e-4    # AdamW learning rate
GRAD_ACCUM_STEPS = 4    # micro-batches accumulated before each optimizer step
MAX_LEN = 512           # every prompt is padded/truncated to this many tokens
SEED = 42               # seed applied to torch and numpy in main()

# Module-level tokenizer shared by the training and validation loops.
tokenizer = AutoTokenizer.from_pretrained(PRUNED_MODEL_DIR)
# Padding needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so the final position, which Net.forward reads, is a real token.
tokenizer.padding_side = 'left'

class MathDataset(Dataset):
    """Map-style dataset that pairs each prompt with an optional target."""

    def __init__(self, prompts, targets=None):
        """Store the prompts and, when given, the parallel sequence of targets."""
        self.prompts = prompts
        self.targets = targets

    def __len__(self):
        """Number of prompts."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Return {"prompt": ...}, plus a "target" entry when targets were supplied."""
        if self.targets is None:
            return {"prompt": self.prompts[idx]}
        return {"prompt": self.prompts[idx], "target": self.targets[idx]}

class Net(nn.Module):
    """8-bit quantized causal-LM backbone wrapped with LoRA adapters, plus an 8-way linear head."""

    def __init__(self, pruned_model_path, rank):
        """Load the pruned checkpoint in 8-bit onto device `rank`, attach LoRA, and create the head."""
        super().__init__()
        self.config = AutoConfig.from_pretrained(pruned_model_path)

        quantization = BitsAndBytesConfig(load_in_8bit=True)
        base_model = AutoModelForCausalLM.from_pretrained(
            pruned_model_path,
            quantization_config=quantization,
            device_map=rank,
            torch_dtype=torch.bfloat16,
            use_cache=False,
        )

        # LoRA adapters on the attention and MLP projection layers.
        lora_settings = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            bias='none',
            inference_mode=False,
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
        )
        self.backbone = get_peft_model(base_model, lora_settings)

        # Bias-free projection from the hidden size to the 8 output logits.
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return head logits computed from the last layer's hidden state at the final position.

        `x` is a mapping of keyword arguments forwarded to the backbone.
        """
        backbone_out = self.backbone(**x, output_hidden_states=True)
        last_layer = backbone_out.hidden_states[-1]
        final_position = last_layer[:, -1, :]
        return self.head(final_position)

def ddp_setup(rank, world_size):
    """Set the local rendezvous address/port, join the NCCL process group, and select GPU `rank`."""
    rendezvous = {'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'}
    for key, value in rendezvous.items():
        os.environ[key] = value
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def train_and_validate_process(rank, world_size, train_prompts, train_targets, val_prompts, val_targets):
    """Worker run once per GPU: fine-tune the model under DDP and validate on rank 0 after every epoch.

    Args:
        rank: Index of this worker; also used as its CUDA device index.
        world_size: Total number of workers in the process group.
        train_prompts: Training prompt strings.
        train_targets: Integer class label for each training prompt.
        val_prompts: Validation prompt strings (evaluated on rank 0 only).
        val_targets: Integer class label for each validation prompt.

    Side effects:
        Rank 0 prints parameter counts and the per-epoch micro-F1, then saves the
        backbone to FINAL_BACKBONE_PATH and the head weights to FINAL_HEAD_PATH.
    """
    ddp_setup(rank, world_size)
    # Workers are started by mp.spawn as fresh processes, so the seeding done in
    # main() does not carry over. Seed here so the randomly initialised trainable
    # weights are reproducible from run to run.
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    model = Net(PRUNED_MODEL_DIR, rank).to(rank)

    if rank == 0:
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"\nLoaded pruned, quantized model with {total_params/1_000_000:.2f}M total parameters.")
        print(f"Trainable LoRA parameters: {trainable_params/1_000_000:.2f}M\n")

    model = DDP(model, device_ids=[rank], find_unused_parameters=False)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    train_dataset = MathDataset(train_prompts, train_targets)
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)

    batches_per_epoch = len(train_loader)
    # Ceil division: a trailing partial accumulation group also produces an
    # optimizer step (see the flush condition in the loop below).
    updates_per_epoch = (batches_per_epoch + GRAD_ACCUM_STEPS - 1) // GRAD_ACCUM_STEPS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=updates_per_epoch * NUM_EPOCHS)
    scaler = torch.amp.GradScaler('cuda')

    # The validation loader does not change between epochs; build it once, on the
    # only rank that validates.
    val_loader = None
    if rank == 0:
        val_dataset = MathDataset(val_prompts, val_targets)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)

    for epoch in range(NUM_EPOCHS):
        model.train()
        train_sampler.set_epoch(epoch)  # reshuffle differently each epoch
        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader
        for step, batch in enumerate(pbar):
            # Size of the accumulation group this micro-batch belongs to; smaller
            # than GRAD_ACCUM_STEPS only for the last, partial group of an epoch.
            group_start = step - step % GRAD_ACCUM_STEPS
            group_size = min(GRAD_ACCUM_STEPS, batches_per_epoch - group_start)
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Average over the group so the accumulated gradient is a mean.
                loss = loss / group_size
            scaler.scale(loss).backward()
            # Step at the end of every full group AND on the epoch's final batch, so
            # leftover gradients are applied instead of leaking into the next epoch
            # (or being dropped after the last one).
            if (step + 1) % GRAD_ACCUM_STEPS == 0 or (step + 1) == batches_per_epoch:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
        if rank == 0:
            model.eval()
            all_preds = []
            with torch.no_grad():
                for v_batch in tqdm(val_loader, desc="Validating"):
                    encodings = tokenizer(v_batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
                    with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                        # Call the wrapped module directly: only rank 0 runs this
                        # forward pass, so it must not go through the DDP wrapper.
                        logits = model.module(encodings)
                        preds = torch.argmax(logits, dim=1).cpu().tolist()
                    all_preds.extend(preds)
            f1_micro = f1_score(val_targets, all_preds, average='micro')
            print(f'  Epoch {epoch+1} | Validation F1-micro: {f1_micro:.4f}')
    if rank == 0:
        print("\nTraining complete. Saving final model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
    destroy_process_group()

def main():
    """Build prompts from the train/validation CSVs and launch one training worker per visible GPU.

    Prints a message and returns without training when the pruned model
    directory does not exist or when no CUDA device is available.
    """
    # Nothing to fine-tune until the pruning stage has written its output.
    if not os.path.exists(PRUNED_MODEL_DIR):
        print(f"Error: Pruned model directory not found. Please run the pruning script first.")
        return

    torch.manual_seed(SEED)
    np.random.seed(SEED)

    df_train = pd.read_csv(TRAIN_DATA_PATH)
    df_train.columns = ['problem', 'target']
    df_val = pd.read_csv(VAL_DATA_PATH)
    df_val.columns = ['problem', 'target']

    prompt_template = """<|im_start|>user
Your task is to classify each Math problem...
Math Problem: {problem}
<|im_end|>
<|im_start|>assistant"""

    def build_prompts(problems):
        """Wrap every (whitespace-stripped) problem text in the chat template."""
        return [prompt_template.format(problem=text.strip()) for text in problems]

    train_prompts = build_prompts(df_train['problem'])
    train_targets = df_train['target'].astype(int).tolist()
    val_prompts = build_prompts(df_val['problem'])
    val_targets = df_val['target'].astype(int).tolist()

    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("This script requires GPUs.")
        return

    print(f"Using {world_size} GPUs for training on {SPARSITY_AMOUNT*100}% sparse 8-bit model.")
    # mp.spawn passes the process index as the first argument, followed by these.
    worker_args = (world_size, train_prompts, train_targets, val_prompts, val_targets)
    mp.spawn(train_and_validate_process, args=worker_args, nprocs=world_size)

# Script entry point: start training only when this file is executed directly.
if __name__ == '__main__':
    main()

Writing train_pruned_quantized_8bit_25pct.py


In [18]:
# Execute the script
!python train_pruned_quantized_8bit_25pct.py

2025-06-22 04:21:44.441766: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750566104.463837    1829 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750566104.470582    1829 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for training on 25.0% sparse 8-bit model.
2025-06-22 04:21:52.671080: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750566112.692579    1897 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been reg

# Experiment 14

In [19]:
%%writefile prune_model_50pct.py

import torch
from torch.nn.utils import prune
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os

# --- Configuration ---
# Hugging Face model that is loaded and pruned.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# Fraction of each linear layer's weights to prune (passed as `amount` to l1_unstructured).
SPARSITY_AMOUNT = 0.50
# Output directory for the pruned model and tokenizer; the percentage is embedded in the name.
PRUNED_MODEL_SAVE_DIR = f'/kaggle/working/pruned_{int(SPARSITY_AMOUNT*100)}pct_0.5B_model'
# Environment flag for the Hugging Face tokenizers library: turn its parallelism off.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

def run_pruning():
    """Magnitude-prune every nn.Linear weight of the base model and save the result.

    Loads BASE_MODEL_PATH in bfloat16, applies unstructured L1 pruning with
    amount=SPARSITY_AMOUNT to the weight of each linear layer, makes the pruning
    permanent, and writes the model and tokenizer to PRUNED_MODEL_SAVE_DIR,
    replacing whatever was there before.

    Raises:
        OSError: if an existing PRUNED_MODEL_SAVE_DIR cannot be removed.
    """
    import shutil  # function-scope import: only the cleanup step below needs it

    print(f"--- STAGE 1: Pruning Full-Precision 0.5B Model ({SPARSITY_AMOUNT*100}%) ---")
    if os.path.exists(PRUNED_MODEL_SAVE_DIR):
        print(f"Removing existing directory: {PRUNED_MODEL_SAVE_DIR}")
        # Use the standard library instead of shelling out to `rm -rf`: the path is
        # never parsed by a shell, and a failed removal raises here instead of
        # being silently ignored.
        if os.path.isdir(PRUNED_MODEL_SAVE_DIR) and not os.path.islink(PRUNED_MODEL_SAVE_DIR):
            shutil.rmtree(PRUNED_MODEL_SAVE_DIR)
        else:
            os.remove(PRUNED_MODEL_SAVE_DIR)
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
    print(f"\nApplying {SPARSITY_AMOUNT*100}% unstructured pruning...")
    # NOTE(review): this filter also matches the output projection (`lm_head`) if the
    # model exposes it as nn.Linear; if that weight is tied to the input embeddings,
    # the embeddings are pruned as well - confirm this is intended.
    for _name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=SPARSITY_AMOUNT)
            # Fold the mask into the weight so a plain `weight` tensor is saved.
            prune.remove(module, 'weight')
    print("\nPruning complete.")
    print(f"\nSaving {SPARSITY_AMOUNT*100}% pruned model to {PRUNED_MODEL_SAVE_DIR}...")
    model.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    tokenizer.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    # Drop the model and release cached GPU memory before the script exits.
    del model
    gc.collect()
    torch.cuda.empty_cache()

# Script entry point: run the pruning stage only when this file is executed directly.
if __name__ == '__main__':
    run_pruning()

Overwriting prune_model_50pct.py


In [20]:
!python prune_model_50pct.py

--- STAGE 1: Pruning Full-Precision 0.5B Model (50.0%) ---
Removing existing directory: /kaggle/working/pruned_50pct_0.5B_model
2025-06-22 04:45:39.633853: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750567539.656583    2304 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750567539.663334    2304 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.

Applying 50.0% unstructured pruning...

Pruning complete.

Saving 50.0% pruned model to /kaggle/working/pruned_50pct_0.5B_model...


In [21]:
%%writefile train_pruned_quantized_8bit_50pct.py

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig, get_cosine_schedule_with_warmup
from peft import get_peft_model, LoraConfig, TaskType

# Environment flag for the Hugging Face tokenizers library: turn its parallelism off.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
# Sparsity level of the pruned checkpoint; here it only selects paths and appears in log output.
SPARSITY_AMOUNT = 0.50
# Directory holding the pruned model and tokenizer that this script loads.
PRUNED_MODEL_DIR = f'/kaggle/working/pruned_{int(SPARSITY_AMOUNT*100)}pct_0.5B_model/'
# CSV splits read by main(); their two columns are renamed to problem/target.
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
# Where rank 0 saves the fine-tuned backbone and the classification head's state_dict.
FINAL_BACKBONE_PATH = f'/kaggle/working/pruned_q8_{int(SPARSITY_AMOUNT*100)}pct_0.5B_backbone/'
FINAL_HEAD_PATH = f'/kaggle/working/pruned_q8_{int(SPARSITY_AMOUNT*100)}pct_0.5B_head.pt'

NUM_EPOCHS = 5          # passes over the training set
BATCH_SIZE = 4          # per-process training batch size; validation uses BATCH_SIZE*2
LEARNING_RATE = 2e-4    # AdamW learning rate
GRAD_ACCUM_STEPS = 4    # micro-batches accumulated before each optimizer step
MAX_LEN = 512           # every prompt is padded/truncated to this many tokens
SEED = 42               # seed applied to torch and numpy in main()

# Module-level tokenizer shared by the training and validation loops.
tokenizer = AutoTokenizer.from_pretrained(PRUNED_MODEL_DIR)
# Padding needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so the final position, which Net.forward reads, is a real token.
tokenizer.padding_side = 'left'

class MathDataset(Dataset):
    """Map-style dataset that pairs each prompt with an optional target."""

    def __init__(self, prompts, targets=None):
        """Store the prompts and, when given, the parallel sequence of targets."""
        self.prompts = prompts
        self.targets = targets

    def __len__(self):
        """Number of prompts."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Return {"prompt": ...}, plus a "target" entry when targets were supplied."""
        if self.targets is None:
            return {"prompt": self.prompts[idx]}
        return {"prompt": self.prompts[idx], "target": self.targets[idx]}

class Net(nn.Module):
    """8-bit quantized causal-LM backbone wrapped with LoRA adapters, plus an 8-way linear head."""

    def __init__(self, pruned_model_path, rank):
        """Load the pruned checkpoint in 8-bit onto device `rank`, attach LoRA, and create the head."""
        super().__init__()
        self.config = AutoConfig.from_pretrained(pruned_model_path)

        quantization = BitsAndBytesConfig(load_in_8bit=True)
        base_model = AutoModelForCausalLM.from_pretrained(
            pruned_model_path,
            quantization_config=quantization,
            device_map=rank,
            torch_dtype=torch.bfloat16,
            use_cache=False,
        )

        # LoRA adapters on the attention and MLP projection layers.
        lora_settings = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            bias='none',
            inference_mode=False,
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
        )
        self.backbone = get_peft_model(base_model, lora_settings)

        # Bias-free projection from the hidden size to the 8 output logits.
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return head logits computed from the last layer's hidden state at the final position.

        `x` is a mapping of keyword arguments forwarded to the backbone.
        """
        backbone_out = self.backbone(**x, output_hidden_states=True)
        last_layer = backbone_out.hidden_states[-1]
        final_position = last_layer[:, -1, :]
        return self.head(final_position)

def ddp_setup(rank, world_size):
    """Set the local rendezvous address/port, join the NCCL process group, and select GPU `rank`."""
    rendezvous = {'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'}
    for key, value in rendezvous.items():
        os.environ[key] = value
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def train_and_validate_process(rank, world_size, train_prompts, train_targets, val_prompts, val_targets):
    """Worker run once per GPU: fine-tune the model under DDP and validate on rank 0 after every epoch.

    Args:
        rank: Index of this worker; also used as its CUDA device index.
        world_size: Total number of workers in the process group.
        train_prompts: Training prompt strings.
        train_targets: Integer class label for each training prompt.
        val_prompts: Validation prompt strings (evaluated on rank 0 only).
        val_targets: Integer class label for each validation prompt.

    Side effects:
        Rank 0 prints parameter counts and the per-epoch micro-F1, then saves the
        backbone to FINAL_BACKBONE_PATH and the head weights to FINAL_HEAD_PATH.
    """
    ddp_setup(rank, world_size)
    # Workers are started by mp.spawn as fresh processes, so the seeding done in
    # main() does not carry over. Seed here so the randomly initialised trainable
    # weights are reproducible from run to run.
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    model = Net(PRUNED_MODEL_DIR, rank).to(rank)

    if rank == 0:
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"\nLoaded pruned, quantized model with {total_params/1_000_000:.2f}M total parameters.")
        print(f"Trainable LoRA parameters: {trainable_params/1_000_000:.2f}M\n")

    model = DDP(model, device_ids=[rank], find_unused_parameters=False)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    train_dataset = MathDataset(train_prompts, train_targets)
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)

    batches_per_epoch = len(train_loader)
    # Ceil division: a trailing partial accumulation group also produces an
    # optimizer step (see the flush condition in the loop below).
    updates_per_epoch = (batches_per_epoch + GRAD_ACCUM_STEPS - 1) // GRAD_ACCUM_STEPS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=updates_per_epoch * NUM_EPOCHS)
    scaler = torch.amp.GradScaler('cuda')

    # The validation loader does not change between epochs; build it once, on the
    # only rank that validates.
    val_loader = None
    if rank == 0:
        val_dataset = MathDataset(val_prompts, val_targets)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)

    for epoch in range(NUM_EPOCHS):
        model.train()
        train_sampler.set_epoch(epoch)  # reshuffle differently each epoch
        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader
        for step, batch in enumerate(pbar):
            # Size of the accumulation group this micro-batch belongs to; smaller
            # than GRAD_ACCUM_STEPS only for the last, partial group of an epoch.
            group_start = step - step % GRAD_ACCUM_STEPS
            group_size = min(GRAD_ACCUM_STEPS, batches_per_epoch - group_start)
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Average over the group so the accumulated gradient is a mean.
                loss = loss / group_size
            scaler.scale(loss).backward()
            # Step at the end of every full group AND on the epoch's final batch, so
            # leftover gradients are applied instead of leaking into the next epoch
            # (or being dropped after the last one).
            if (step + 1) % GRAD_ACCUM_STEPS == 0 or (step + 1) == batches_per_epoch:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
        if rank == 0:
            model.eval()
            all_preds = []
            with torch.no_grad():
                for v_batch in tqdm(val_loader, desc="Validating"):
                    encodings = tokenizer(v_batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
                    with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                        # Call the wrapped module directly: only rank 0 runs this
                        # forward pass, so it must not go through the DDP wrapper.
                        logits = model.module(encodings)
                        preds = torch.argmax(logits, dim=1).cpu().tolist()
                    all_preds.extend(preds)
            f1_micro = f1_score(val_targets, all_preds, average='micro')
            print(f'  Epoch {epoch+1} | Validation F1-micro: {f1_micro:.4f}')
    if rank == 0:
        print("\nTraining complete. Saving final model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
    destroy_process_group()

def main():
    """Build prompts from the train/validation CSVs and launch one training worker per visible GPU.

    Prints a message and returns without training when the pruned model
    directory does not exist or when no CUDA device is available.
    """
    # Nothing to fine-tune until the pruning stage has written its output.
    if not os.path.exists(PRUNED_MODEL_DIR):
        print(f"Error: Pruned model directory not found. Please run the pruning script first.")
        return

    torch.manual_seed(SEED)
    np.random.seed(SEED)

    df_train = pd.read_csv(TRAIN_DATA_PATH)
    df_train.columns = ['problem', 'target']
    df_val = pd.read_csv(VAL_DATA_PATH)
    df_val.columns = ['problem', 'target']

    prompt_template = """<|im_start|>user
Your task is to classify each Math problem...
Math Problem: {problem}
<|im_end|>
<|im_start|>assistant"""

    def build_prompts(problems):
        """Wrap every (whitespace-stripped) problem text in the chat template."""
        return [prompt_template.format(problem=text.strip()) for text in problems]

    train_prompts = build_prompts(df_train['problem'])
    train_targets = df_train['target'].astype(int).tolist()
    val_prompts = build_prompts(df_val['problem'])
    val_targets = df_val['target'].astype(int).tolist()

    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("This script requires GPUs.")
        return

    print(f"Using {world_size} GPUs for training on {SPARSITY_AMOUNT*100}% sparse 8-bit model.")
    # mp.spawn passes the process index as the first argument, followed by these.
    worker_args = (world_size, train_prompts, train_targets, val_prompts, val_targets)
    mp.spawn(train_and_validate_process, args=worker_args, nprocs=world_size)

# Script entry point: start training only when this file is executed directly.
if __name__ == '__main__':
    main()

Writing train_pruned_quantized_8bit_50pct.py


In [22]:
!python train_pruned_quantized_8bit_50pct.py

2025-06-22 04:45:56.012061: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750567556.034089    2378 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750567556.040825    2378 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for training on 50.0% sparse 8-bit model.
2025-06-22 04:46:04.148889: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750567564.170340    2446 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been reg

# Experiment 15

In [23]:
%%writefile prune_model_75pct.py

import torch
from torch.nn.utils import prune
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os

# --- Configuration ---
# Hugging Face model that is loaded and pruned.
BASE_MODEL_PATH = 'Qwen/Qwen2-0.5B-Instruct'
# Fraction of each linear layer's weights to prune (passed as `amount` to l1_unstructured).
SPARSITY_AMOUNT = 0.75
# Output directory for the pruned model and tokenizer; the percentage is embedded in the name.
PRUNED_MODEL_SAVE_DIR = f'/kaggle/working/pruned_{int(SPARSITY_AMOUNT*100)}pct_0.5B_model'
# Environment flag for the Hugging Face tokenizers library: turn its parallelism off.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

def run_pruning():
    """Magnitude-prune every nn.Linear weight of the base model and save the result.

    Loads BASE_MODEL_PATH in bfloat16, applies unstructured L1 pruning with
    amount=SPARSITY_AMOUNT to the weight of each linear layer, makes the pruning
    permanent, and writes the model and tokenizer to PRUNED_MODEL_SAVE_DIR,
    replacing whatever was there before.

    Raises:
        OSError: if an existing PRUNED_MODEL_SAVE_DIR cannot be removed.
    """
    import shutil  # function-scope import: only the cleanup step below needs it

    print(f"--- STAGE 1: Pruning Full-Precision 0.5B Model ({SPARSITY_AMOUNT*100}%) ---")
    if os.path.exists(PRUNED_MODEL_SAVE_DIR):
        print(f"Removing existing directory: {PRUNED_MODEL_SAVE_DIR}")
        # Use the standard library instead of shelling out to `rm -rf`: the path is
        # never parsed by a shell, and a failed removal raises here instead of
        # being silently ignored.
        if os.path.isdir(PRUNED_MODEL_SAVE_DIR) and not os.path.islink(PRUNED_MODEL_SAVE_DIR):
            shutil.rmtree(PRUNED_MODEL_SAVE_DIR)
        else:
            os.remove(PRUNED_MODEL_SAVE_DIR)
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
    print(f"\nApplying {SPARSITY_AMOUNT*100}% unstructured pruning...")
    # NOTE(review): this filter also matches the output projection (`lm_head`) if the
    # model exposes it as nn.Linear; if that weight is tied to the input embeddings,
    # the embeddings are pruned as well - confirm this is intended.
    for _name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=SPARSITY_AMOUNT)
            # Fold the mask into the weight so a plain `weight` tensor is saved.
            prune.remove(module, 'weight')
    print("\nPruning complete.")
    print(f"\nSaving {SPARSITY_AMOUNT*100}% pruned model to {PRUNED_MODEL_SAVE_DIR}...")
    model.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    tokenizer.save_pretrained(PRUNED_MODEL_SAVE_DIR)
    # Drop the model and release cached GPU memory before the script exits.
    del model
    gc.collect()
    torch.cuda.empty_cache()

# Script entry point: run the pruning stage only when this file is executed directly.
if __name__ == '__main__':
    run_pruning()

Overwriting prune_model_75pct.py


In [24]:
!python prune_model_75pct.py

--- STAGE 1: Pruning Full-Precision 0.5B Model (75.0%) ---
Removing existing directory: /kaggle/working/pruned_75pct_0.5B_model
2025-06-22 05:09:46.290604: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750568986.312689    2853 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750568986.319480    2853 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.

Applying 75.0% unstructured pruning...

Pruning complete.

Saving 75.0% pruned model to /kaggle/working/pruned_75pct_0.5B_model...


In [25]:
%%writefile train_pruned_quantized_8bit_75pct.py

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig, get_cosine_schedule_with_warmup
from peft import get_peft_model, LoraConfig, TaskType

# Environment flag for the Hugging Face tokenizers library: turn its parallelism off.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Configuration ---
# Sparsity level of the pruned checkpoint; here it only selects paths and appears in log output.
SPARSITY_AMOUNT = 0.75
# Directory holding the pruned model and tokenizer that this script loads.
PRUNED_MODEL_DIR = f'/kaggle/working/pruned_{int(SPARSITY_AMOUNT*100)}pct_0.5B_model/'
# CSV splits read by main(); their two columns are renamed to problem/target.
TRAIN_DATA_PATH = './final_train_set.csv'
VAL_DATA_PATH = './final_validation_set.csv'
# Where rank 0 saves the fine-tuned backbone and the classification head's state_dict.
FINAL_BACKBONE_PATH = f'/kaggle/working/pruned_q8_{int(SPARSITY_AMOUNT*100)}pct_0.5B_backbone/'
FINAL_HEAD_PATH = f'/kaggle/working/pruned_q8_{int(SPARSITY_AMOUNT*100)}pct_0.5B_head.pt'

NUM_EPOCHS = 5          # passes over the training set
BATCH_SIZE = 4          # per-process training batch size; validation uses BATCH_SIZE*2
LEARNING_RATE = 2e-4    # AdamW learning rate
GRAD_ACCUM_STEPS = 4    # micro-batches accumulated before each optimizer step
MAX_LEN = 512           # every prompt is padded/truncated to this many tokens
SEED = 42               # seed applied to torch and numpy in main()

# Module-level tokenizer shared by the training and validation loops.
tokenizer = AutoTokenizer.from_pretrained(PRUNED_MODEL_DIR)
# Padding needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so the final position, which Net.forward reads, is a real token.
tokenizer.padding_side = 'left'

class MathDataset(Dataset):
    """Map-style dataset that pairs each prompt with an optional target."""

    def __init__(self, prompts, targets=None):
        """Store the prompts and, when given, the parallel sequence of targets."""
        self.prompts = prompts
        self.targets = targets

    def __len__(self):
        """Number of prompts."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Return {"prompt": ...}, plus a "target" entry when targets were supplied."""
        if self.targets is None:
            return {"prompt": self.prompts[idx]}
        return {"prompt": self.prompts[idx], "target": self.targets[idx]}

class Net(nn.Module):
    """8-bit quantized causal-LM backbone wrapped with LoRA adapters, plus an 8-way linear head."""

    def __init__(self, pruned_model_path, rank):
        """Load the pruned checkpoint in 8-bit onto device `rank`, attach LoRA, and create the head."""
        super().__init__()
        self.config = AutoConfig.from_pretrained(pruned_model_path)

        quantization = BitsAndBytesConfig(load_in_8bit=True)
        base_model = AutoModelForCausalLM.from_pretrained(
            pruned_model_path,
            quantization_config=quantization,
            device_map=rank,
            torch_dtype=torch.bfloat16,
            use_cache=False,
        )

        # LoRA adapters on the attention and MLP projection layers.
        lora_settings = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            bias='none',
            inference_mode=False,
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
        )
        self.backbone = get_peft_model(base_model, lora_settings)

        # Bias-free projection from the hidden size to the 8 output logits.
        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return head logits computed from the last layer's hidden state at the final position.

        `x` is a mapping of keyword arguments forwarded to the backbone.
        """
        backbone_out = self.backbone(**x, output_hidden_states=True)
        last_layer = backbone_out.hidden_states[-1]
        final_position = last_layer[:, -1, :]
        return self.head(final_position)

def ddp_setup(rank, world_size):
    """Set the local rendezvous address/port, join the NCCL process group, and select GPU `rank`."""
    rendezvous = {'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'}
    for key, value in rendezvous.items():
        os.environ[key] = value
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def train_and_validate_process(rank, world_size, train_prompts, train_targets, val_prompts, val_targets):
    """Worker run once per GPU: fine-tune the model under DDP and validate on rank 0 after every epoch.

    Args:
        rank: Index of this worker; also used as its CUDA device index.
        world_size: Total number of workers in the process group.
        train_prompts: Training prompt strings.
        train_targets: Integer class label for each training prompt.
        val_prompts: Validation prompt strings (evaluated on rank 0 only).
        val_targets: Integer class label for each validation prompt.

    Side effects:
        Rank 0 prints parameter counts and the per-epoch micro-F1, then saves the
        backbone to FINAL_BACKBONE_PATH and the head weights to FINAL_HEAD_PATH.
    """
    ddp_setup(rank, world_size)
    # Workers are started by mp.spawn as fresh processes, so the seeding done in
    # main() does not carry over. Seed here so the randomly initialised trainable
    # weights are reproducible from run to run.
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    model = Net(PRUNED_MODEL_DIR, rank).to(rank)

    if rank == 0:
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"\nLoaded pruned, quantized model with {total_params/1_000_000:.2f}M total parameters.")
        print(f"Trainable LoRA parameters: {trainable_params/1_000_000:.2f}M\n")

    model = DDP(model, device_ids=[rank], find_unused_parameters=False)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    train_dataset = MathDataset(train_prompts, train_targets)
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, pin_memory=True, drop_last=True)

    batches_per_epoch = len(train_loader)
    # Ceil division: a trailing partial accumulation group also produces an
    # optimizer step (see the flush condition in the loop below).
    updates_per_epoch = (batches_per_epoch + GRAD_ACCUM_STEPS - 1) // GRAD_ACCUM_STEPS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=updates_per_epoch * NUM_EPOCHS)
    scaler = torch.amp.GradScaler('cuda')

    # The validation loader does not change between epochs; build it once, on the
    # only rank that validates.
    val_loader = None
    if rank == 0:
        val_dataset = MathDataset(val_prompts, val_targets)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)

    for epoch in range(NUM_EPOCHS):
        model.train()
        train_sampler.set_epoch(epoch)  # reshuffle differently each epoch
        if rank == 0:
            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}")
            pbar = tqdm(train_loader, desc="Training")
        else:
            pbar = train_loader
        for step, batch in enumerate(pbar):
            # Size of the accumulation group this micro-batch belongs to; smaller
            # than GRAD_ACCUM_STEPS only for the last, partial group of an epoch.
            group_start = step - step % GRAD_ACCUM_STEPS
            group_size = min(GRAD_ACCUM_STEPS, batches_per_epoch - group_start)
            encodings = tokenizer(batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
            batch_targets = batch['target'].long().to(rank)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits = model(encodings)
                loss = nn.functional.cross_entropy(logits, batch_targets)
                # Average over the group so the accumulated gradient is a mean.
                loss = loss / group_size
            scaler.scale(loss).backward()
            # Step at the end of every full group AND on the epoch's final batch, so
            # leftover gradients are applied instead of leaking into the next epoch
            # (or being dropped after the last one).
            if (step + 1) % GRAD_ACCUM_STEPS == 0 or (step + 1) == batches_per_epoch:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
        if rank == 0:
            model.eval()
            all_preds = []
            with torch.no_grad():
                for v_batch in tqdm(val_loader, desc="Validating"):
                    encodings = tokenizer(v_batch['prompt'], return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN).to(rank)
                    with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                        # Call the wrapped module directly: only rank 0 runs this
                        # forward pass, so it must not go through the DDP wrapper.
                        logits = model.module(encodings)
                        preds = torch.argmax(logits, dim=1).cpu().tolist()
                    all_preds.extend(preds)
            f1_micro = f1_score(val_targets, all_preds, average='micro')
            print(f'  Epoch {epoch+1} | Validation F1-micro: {f1_micro:.4f}')
    if rank == 0:
        print("\nTraining complete. Saving final model...")
        model.module.backbone.save_pretrained(FINAL_BACKBONE_PATH)
        torch.save(model.module.head.state_dict(), FINAL_HEAD_PATH)
    destroy_process_group()

def main():
    """Load both data splits, build prompts, and spawn one training process per GPU.

    Returns early (after printing an error) when ``PRUNED_MODEL_DIR`` does not
    exist. When no CUDA device is visible, prints a notice instead of training.
    """
    if not os.path.exists(PRUNED_MODEL_DIR):
        print("Error: Pruned model directory not found. Please run the pruning script first.")
        return

    # Seed torch and numpy in the launching process.
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    def read_split(csv_path):
        """Read one CSV and force its header to ('problem', 'target')."""
        frame = pd.read_csv(csv_path)
        frame.columns = ['problem', 'target']
        return frame

    # Both files are read (and renamed) before any prompt is built.
    df_train = read_split(TRAIN_DATA_PATH)
    df_val = read_split(VAL_DATA_PATH)

    # Chat-style prompt; the problem text is substituted for {problem}.
    prompt_template = (
        "<|im_start|>user\n"
        "Your task is to classify each Math problem...\n"
        "Math Problem: {problem}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant"
    )

    def to_examples(frame):
        """Turn a split into (formatted prompts, integer targets)."""
        prompts = [prompt_template.format(problem=text.strip()) for text in frame['problem']]
        labels = frame['target'].astype(int).tolist()
        return prompts, labels

    train_prompts, train_targets = to_examples(df_train)
    val_prompts, val_targets = to_examples(df_val)

    world_size = torch.cuda.device_count()
    if world_size <= 0:
        print("This script requires GPUs.")
        return

    print(f"Using {world_size} GPUs for training on {SPARSITY_AMOUNT*100}% sparse 8-bit model.")
    # mp.spawn passes the process index as the first positional argument,
    # followed by the tuple below.
    mp.spawn(
        train_and_validate_process,
        args=(world_size, train_prompts, train_targets, val_prompts, val_targets),
        nprocs=world_size,
    )

# Run main() only when this file is executed as a script, not when it is imported.
# NOTE(review): main() launches workers through mp.spawn; if those workers re-import
# this module (as the "spawn" start method does), this guard is what keeps them
# from calling main() again — confirm against the multiprocessing docs.
if __name__ == '__main__':
    main()

Writing train_pruned_quantized_8bit_75pct.py


In [26]:
# Run the training script written by the previous cell as a separate Python process.
!python train_pruned_quantized_8bit_75pct.py

2025-06-22 05:10:03.807922: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750569003.829827    2927 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750569003.836558    2927 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using 4 GPUs for training on 75.0% sparse 8-bit model.
2025-06-22 05:10:11.935848: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750569011.957670    2995 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been reg