In [None]:
!pip install transformers datasets peft bitsandbytes accelerate huggingface_hub

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.1

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Hub checkpoint used as the base model for fine-tuning
model_name = "deepseek-ai/deepseek-math-7b-base"

# bitsandbytes quantization settings: 4-bit NF4 weights with double
# quantization enabled, using float16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Causal LM loaded with the quantization config above; device placement is
# delegated to device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# The base model was loaded in 4-bit, so run PEFT's k-bit preparation step
# before attaching adapters (the step PEFT documents for training quantized
# models).
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,  # Rank of the LoRA update matrices
    lora_alpha=16,
    lora_dropout=0.1,
    # Adapters go on every attention projection (q/k/v/o) and on every MLP
    # projection (gate/up/down), not only on query and value.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    # Declare the task so PEFT wraps the model as a causal LM and records the
    # task type in the saved adapter config.
    task_type="CAUSAL_LM",
)

# Wrap the model with the LoRA adapters and report how many parameters train
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from datasets import load_dataset

# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Read the CSV file as the "train" split, decoding it as ISO-8859-1
dataset = load_dataset(
    "csv",
    data_files={"train": "math_meme_corrections.csv"},
    encoding="ISO-8859-1",
)

In [None]:
def gen_prompt(example):
    """Render one dataset row as a single training prompt string.

    Args:
        example: Mapping with an 'input' entry and an 'output' entry.

    Returns:
        The text "Math Meme Correction:", followed by an "Input: ..." line and
        an "Output: ..." line built from the two entries.
    """
    meme_text = example['input']
    correction = example['output']
    return "Math Meme Correction:\nInput: {}\nOutput: {}".format(meme_text, correction)


def tokenize_function(example):
    """Tokenize the prompt built from one dataset row.

    The row is rendered with gen_prompt and passed to the tokenizer with
    truncation enabled and padding to a fixed length of 256.

    Args:
        example: A dataset row accepted by gen_prompt.

    Returns:
        Whatever the tokenizer call returns for the rendered prompt.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    return tokenizer(gen_prompt(example), **encode_options)

# Run the tokenizer over every row of the dataset
tokenized_data = dataset.map(tokenize_function)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
print(tokenized_data)

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask'],
        num_rows: 50
    })
})


In [None]:
from transformers import TrainingArguments, Trainer,DataCollatorForLanguageModeling
# Collator configured for causal language modelling (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Training hyperparameters; outputs are written under "Meme-Deepseek-math"
training_args = TrainingArguments(
    output_dir="Meme-Deepseek-math",
    num_train_epochs=15,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
    logging_steps=10,
    save_total_limit=1,
)

# Trainer over the tokenized "train" split
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
)

# Turn off the cache flag on the model config before training starts
model.config.use_cache = False

In [None]:
!pip install wandb



In [None]:
# Launch fine-tuning. As the last expression of the cell, the returned
# TrainOutput is displayed as the cell's result.
trainer.train()

Step,Training Loss
10,10.4437
20,4.3558
30,2.1403
40,0.8574
50,0.6257
60,0.5183
70,0.3603
80,0.3127
90,0.254
100,0.2147


TrainOutput(global_step=180, training_loss=1.1834365493721433, metrics={'train_runtime': 519.9987, 'train_samples_per_second': 1.442, 'train_steps_per_second': 0.346, 'total_flos': 6939208857944064.0, 'train_loss': 1.1834365493721433, 'epoch': 13.88})

In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory (relative to the working directory) that receives the artifacts
save_path = "deepseek-math-lora"
os.makedirs(save_path, exist_ok=True)

# Write the model first and the tokenizer second into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"✅ Model and tokenizer saved locally at {save_path}")


✅ Model and tokenizer saved locally at /kaggle/working/deepseek-math-lora


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Sanity-check generation with the fine-tuned model. This cell reuses the
# in-memory `model` and `tokenizer` created by the cells above.

# Trainer leaves the model in training mode; switch to eval mode so the LoRA
# dropout (lora_dropout=0.1) is not applied while generating.
model.eval()

# Build the prompt with the same template used for training (see gen_prompt),
# stopping right after "Output:" so the model writes the correction.
question = "(10/5)+3 =8?"
prompt = f"Math Meme Correction:\nInput: {question}\nOutput:"

# Keep the attention mask alongside input_ids and move both to the model's
# device so generation does not depend on implicit device transfers.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
prompt_length = inputs["input_ids"].shape[1]

# Greedy decoding. The previous temperature/top_p arguments had no effect
# because sampling was never enabled, so they are dropped and the decoding
# mode is stated explicitly.
output = model.generate(
    **inputs,
    max_new_tokens=50,  # Limits generation to prevent extra text
    num_return_sequences=1,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode only the newly generated tokens, then drop anything from the point
# where the model starts inventing a follow-up "Input:" example.
completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
next_example_at = completion.lower().find("input:")
if next_example_at != -1:
    completion = completion[:next_example_at]

response = f"{prompt} {completion.strip()}".strip()
print(response)


input: (10/5)+3 =8?
output: Incorrect! Solve brackets first: (10/5) = 2, then add 3 to get 5. The correct answer is 5.
