In [1]:
!pip install transformers accelerate peft datasets bitsandbytes torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
import bitsandbytes as bnb
import torch.optim as optim
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader

MODEL_NAME = "deepseek-ai/deepseek-math-7b-base"
SAVE_PATH = "finetuned_deepseek_math"
MAX_LENGTH = 128
BATCH_SIZE = 1
NUM_EPOCHS = 3

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",            # Set the quantization type (nf4 is a common choice)
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load tokenizer and model in 4-bit mode
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
model.generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Prepare model for k-bit training and wrap with LoRA via PEFT
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=20,
    lora_alpha=40,
    target_modules=["q_proj", "v_proj"],  # Adjust these target modules as needed for your model architecture
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Load CSV dataset using the datasets library.
# Your CSV is assumed to have columns: "problem" and "solution"
raw_dataset = load_dataset("csv", data_files="/kaggle/input/dataset-math/dataset2.csv")["train"]

# Preprocessing: Create a text field combining the problem and solution.
def preprocess(example):
    example["text"] = f"Problem Statement: {example['problem']}\nSolution: {example['solution']}"
    return example

processed_dataset = raw_dataset.map(preprocess)

# Tokenize the prompts
def tokenize_fn(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)
    
tokenized_dataset = processed_dataset.map(tokenize_fn, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Create DataLoader
train_dataloader = DataLoader(tokenized_dataset, batch_size=BATCH_SIZE, shuffle=True)


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.3


tokenizer_config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [5]:
# Raw finetuning loop
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(model.device)
        attention_mask = batch["attention_mask"].to(model.device)
        # Use input_ids as labels for causal LM training.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch + 1}/{num_epochs} Loss: {epoch_loss / len(train_dataloader):.4f}")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch 1/10 Loss: 17.4189
Epoch 2/10 Loss: 4.9337
Epoch 3/10 Loss: 2.0824
Epoch 4/10 Loss: 1.0375
Epoch 5/10 Loss: 0.7017
Epoch 6/10 Loss: 0.5019
Epoch 7/10 Loss: 0.3605
Epoch 8/10 Loss: 0.2809
Epoch 9/10 Loss: 0.2426
Epoch 10/10 Loss: 0.2252


In [6]:
# Save the fine-tuned model (PEFT adapter weights will be saved)
save_path = "finetuned_deepseek_math"
model.save_pretrained(save_path)
print(f"Fine-tuned model saved to {save_path}")

Fine-tuned model saved to finetuned_deepseek_math


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

SAVE_PATH = "finetuned_deepseek_math"
model_name = "deepseek-ai/deepseek-math-7b-base"

# Load tokenizer and base model with quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Prepare model for k-bit training and build LoRA configuration (should match training settings)
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=20,
    lora_alpha=40,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Load the fine-tuned adapter weights
model.load_pretrained(SAVE_PATH)
print(f"Loaded fine-tuned model from {SAVE_PATH}")

# Testing on new emoji problems using the loaded, fine-tuned model
model.eval()
test_prompts = [
    "🚗 + 🚗 + 🚗 + 🚗 = 20 → 🚗 =",
    "🎈 + 🎈 + 🎈 = 15 → 🎈 =",
    "🐶 + 🐶 = 12 → 🐶 ="
]

print("\nTest Results:")
for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=20,
            generation_config=model.generation_config
        )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nInput: {prompt}\nOutput: {result}")

In [7]:
# Testing on new emoji problems using the loaded, fine-tuned model
model.eval()
test_prompts = [
    "🚗 + 🚗 + 🚗 + 🚗 = 20 → 🚗 =",
    "🎈 + 🎈 + 🎈 = 15 → 🎈 =",
    "🐶 + 🐶 = 12 → 🐶 ="
]

print("\nTest Results:")
for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=20,
            generation_config=model.generation_config
        )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nInput: {prompt}\nOutput: {result}")


Test Results:

Input: 🚗 + 🚗 + 🚗 + 🚗 = 20 → 🚗 =
Output: 🚗 + 🚗 + 🚗 + 🚗 = 20 → 🚗 = 4


Input: 🎈 + 🎈 + 🎈 = 15 → 🎈 =
Output: 🎈 + 🎈 + 🎈 = 15 → 🎈 = 5


Input: 🐶 + 🐶 = 12 → 🐶 =
Output: 🐶 + 🐶 = 12 → 🐶 = 6

