In [1]:
!pip install transformers
!pip install huggingface_hub
!pip install accelerate>=0.26.0
!pip install peft
!pip install scikit-learn
!pip install bitsandbytes

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [2]:
import pandas as pd
import os

dataset = pd.read_csv('/kaggle/input/bangla-numinamath-cot-translated/BanglaNuminaDataset.csv')
dataset['problem'].dropna(inplace=True)
dataset['solution'].dropna(inplace=True)
dataset = dataset.reset_index(drop=True)

TEST_SIZE = 0.2
SEED = 41
# model_name = "AI-MO/NuminaMath-7B-TIR"
model_name = "deepseek-ai/deepseek-math-7b-instruct"

final_model_dir = './Output/'
checkpoint_save_dir = "./Output/"

DEVICE = 'cuda'
EPOCH = 20
TRAIN_BATCH_SIZE = 3
EVAL_BATCH_SIZE = 2
LEARNING_RATE = 2e-4
PATIENCE = 8

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

class MathDaraset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer
        # self.max_len = self.tokenizer.model_max_length 
        self.max_len = 3000

    def __len__(self):
        return len(self.dataset)
    
    # def __getitem__(self, index):
    #     instruction = self.dataset['problem'][index]
    #     response = self.dataset['solution'][index]
        
    #     input_text = f"Instruction: {instruction}\nPlease solve the problem step by step and put the result in \\boxed{{}}"
        
    #     input_encodings = self.tokenizer(input_text, return_tensors="pt")
    #     labels = self.tokenizer(response, return_tensors="pt")['input_ids']

    #     labels[labels == self.tokenizer.pad_token_id] = -100
        
    #     return {
    #         'input_ids': input_encodings['input_ids'].squeeze(),
    #         # 'attention_mask': input_encodings['attention_mask'].squeeze(),
    #         'labels': labels.squeeze()
    #     }

    def __getitem__(self, index):
        instruction = self.dataset['problem'][index]
        response = self.dataset['solution'][index]
        
        input_encodings = self.tokenizer(instruction, padding='max_length', max_length=self.max_len, truncation=True, return_tensors="pt")
        target_encodings = self.tokenizer(response, padding='max_length', max_length=self.max_len, truncation=True, return_tensors="pt")

        input_ids = input_encodings["input_ids"].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = target_encodings['input_ids'].squeeze()

        # Replace padding token id's of the labels by -100
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            'attention_mask': attention_mask,
            "labels": labels
        }
        
        
train_df, test_df = train_test_split(dataset, test_size = TEST_SIZE, random_state = SEED)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token

train_dataset = MathDaraset(train_df, tokenizer=tokenizer)
test_dataset = MathDaraset(test_df, tokenizer=tokenizer)

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from transformers import BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map=torch.cuda.current_device(), quantization_config=quant_config)
from transformers import GenerationConfig
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules = ["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

from transformers import TrainingArguments, Trainer, Seq2SeqTrainer, EarlyStoppingCallback

from dataclasses import dataclass
from typing import Dict, List, Union
import torch
from transformers import PreTrainedTokenizerBase
import transformers
from typing import Sequence
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest",
    return_tensors="pt",
    label_pad_token_id=-100
)

training_args = TrainingArguments(
    output_dir=checkpoint_save_dir,
    num_train_epochs=EPOCH,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=1,
    learning_rate=LEARNING_RATE,
    bf16=True,
    save_total_limit=3,
    dataloader_pin_memory=False,
    # logging_steps=100,
    remove_unused_columns=False,
    push_to_hub=False,
    load_best_model_at_end=True,
    dataloader_num_workers = 4,
    save_strategy = "steps",
    eval_strategy="steps",
    save_steps = 1000,
    logging_steps = 200,
    eval_steps = 500,
    # device = DEVICE,
    # generation_config=model.generation_config,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
)

print(f"Training dataset of length {train_dataset.__len__()}")
print(f"Evaluation dataset of length {test_dataset.__len__()}")

tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

trainable params: 3,932,160 || all params: 6,914,297,856 || trainable%: 0.0569
Training dataset of length 2389
Evaluation dataset of length 598


In [None]:
# # Initialize Weights & Biases (W&B) in disabled mode. s
 
import wandb
wandb.init(mode="disabled")

In [4]:
import warnings
warnings.filterwarnings("ignore")

trainer.train()

model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.61 GiB. GPU 0 has a total capacity of 15.89 GiB of which 1.35 GiB is free. Process 3779 has 14.54 GiB memory in use. Of the allocated memory 12.26 GiB is allocated by PyTorch, and 1.99 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)