In [None]:
%pip install datasets transformers sacrebleu peft loralib rouge_score evaluate -q

In [1]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import get_scheduler
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn import functional as F
from peft import LoraConfig, get_peft_model, TaskType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


In [3]:
raw_datasets = load_dataset("cfilt/iitb-english-hindi") # Returns a DatasetDict

In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [5]:
raw_datasets['train'][1]

{'translation': {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'}}

## Data Preprocessing

In [6]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [7]:
tokenizer(text = "Hello, this is a sentence!")

{'input_ids': [12110, 2, 90, 23, 19, 8800, 61, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
tokenizer(text = ["Hello, this is a sentence!", "This is another sentence."])

{'input_ids': [[12110, 2, 90, 23, 19, 8800, 61, 0], [239, 23, 414, 8800, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [9]:
# Tokenize using the Decoder specific toeknization rules.
print(tokenizer(text_target = ["Hello, this is a sentence!", "एक्सेर्साइसर पहुंचनीयता अन्वेषक"]))

{'input_ids': [[2204, 10967, 818, 2, 90, 23, 19, 44, 16, 4072, 1936, 5386, 61, 0], [26618, 16155, 346, 33383, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}


#### We can see that the encoder and the decoder use the same tokenization logic.

In [10]:
max_input_length = 128
max_target_length = 128

source_lang = 'en'
target_lang = 'hi'

def preprocess_function(examples):
  inputs = [ex[source_lang] for ex in examples["translation"]]
  targets = [ex[target_lang] for ex in examples["translation"]]
  model_inputs = tokenizer(inputs, max_length = max_input_length, truncation = True)

  # Setup the tokenization for targets
  labels = tokenizer(text_target=targets, max_length = max_target_length, truncation = True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [11]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[3872, 85, 2501, 132, 15441, 36398, 0], [32643, 28541, 36253, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]], 'labels': [[63, 2025, 18, 16155, 346, 20311, 24, 2279, 679, 0], [26618, 16155, 346, 33383, 0]]}

In [12]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched = True) # Applies the function 'preprocess_function' to each split (train, test, val)
tokenized_datasets = tokenized_datasets.remove_columns(raw_datasets["train"].column_names) # Removes the columns that are not needed anymore

In [13]:
def print_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [14]:
# Selection of the model architecture, using the weights from the pretrained model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, torch_dtype = torch.bfloat16)

print(print_model_parameters(model))

lora_config = LoraConfig(
    r = 16,
    lora_alpha = 16,
    target_modules = ["q_proj", "v_proj"],
    lora_dropout = 0.05,
    bias = "none",
    task_type = TaskType.SEQ_2_SEQ_LM,
)

peft_model = get_peft_model(model, lora_config).to(device)
print(print_model_parameters(peft_model))

trainable model parameters: 75856896
all model parameters: 76381184
percentage of trainable model parameters: 99.31%
trainable model parameters: 589824
all model parameters: 76971008
percentage of trainable model parameters: 0.77%


In [15]:
# The Data Collator will take the data in batches rather than the whole to pass it to the model
data_collator = DataCollatorForSeq2Seq(
    tokenizer = tokenizer,
    model = peft_model,
    return_tensors = "pt"
)

In [16]:
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer = tokenizer,
    model = peft_model,
    return_tensors = "pt",
    pad_to_multiple_of = 8
)

In [17]:
batch_size = 8
learning_rate = 5e-5
weight_decay = 0.01

In [18]:
train_data = DataLoader(
    tokenized_datasets['train'],
    batch_size = batch_size,
    shuffle = True,
    collate_fn = data_collator,
)

In [19]:
validation_data = DataLoader(
    tokenized_datasets['validation'],
    batch_size = batch_size,
    shuffle = False,
    collate_fn = data_collator
)

In [20]:
generation_data = DataLoader(
    tokenized_datasets['validation'],
    batch_size = batch_size,
    shuffle = False,
    collate_fn = generation_data_collator
)

In [21]:
num_train_epochs = 5
num_train_steps = num_train_epochs * len(train_data)

In [22]:
optimizer = AdamW(params = peft_model.parameters(), lr = learning_rate, weight_decay = weight_decay)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

### Model Training

In [23]:
def evaluate(model, val_dataloader):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    num_batches = 0

    with torch.no_grad():  # Disable gradient calculations
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

    avg_loss = total_loss / num_batches
    return avg_loss

In [None]:
for epoch in range(num_train_epochs):
    peft_model.train()
    total_loss = 0

    print(epoch)
    i = 0
    for batch in train_data:
        batch = {k: v.to(device) for k, v in batch.items() if k != "token_type_ids" }
        outputs = peft_model(**batch)
        loss = outputs.loss

        print(f"Batch {i}: Loss: {loss.item()}")
        i += 1

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_data)
    avg_val_loss = evaluate(peft_model, validation_data)

    print(f"Epoch {epoch+1}/{num_train_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

0
Batch 0: Loss: 3.625
Batch 1: Loss: 2.953125
Batch 2: Loss: 2.484375
Batch 3: Loss: 2.765625
Batch 4: Loss: 2.921875
Batch 5: Loss: 3.21875
Batch 6: Loss: 3.40625
Batch 7: Loss: 4.71875
Batch 8: Loss: 4.875
Batch 9: Loss: 3.296875
Batch 10: Loss: 1.8984375
Batch 11: Loss: 3.8125
Batch 12: Loss: 2.375
Batch 13: Loss: 5.09375
Batch 14: Loss: 3.234375
Batch 15: Loss: 2.3125
Batch 16: Loss: 3.015625
Batch 17: Loss: 4.84375
Batch 18: Loss: 3.140625
Batch 19: Loss: 2.375
Batch 20: Loss: 4.25
Batch 21: Loss: 2.328125
Batch 22: Loss: 4.28125
Batch 23: Loss: 4.59375
Batch 24: Loss: 3.875
Batch 25: Loss: 3.171875
Batch 26: Loss: 3.171875
Batch 27: Loss: 2.984375
Batch 28: Loss: 1.96875
Batch 29: Loss: 2.484375
Batch 30: Loss: 2.6875
Batch 31: Loss: 2.046875
Batch 32: Loss: 3.296875
Batch 33: Loss: 3.09375
Batch 34: Loss: 5.21875
Batch 35: Loss: 2.453125
Batch 36: Loss: 3.75
Batch 37: Loss: 4.34375
Batch 38: Loss: 2.921875
Batch 39: Loss: 3.4375
Batch 40: Loss: 3.5
Batch 41: Loss: 2.875
Batch 4

### Saving the model

In [None]:
peft_model.save_pretrained("./hel_Finetune_en_hi_2")
tokenizer.save_pretrained("./hel_Finetune_en_hi_2")

## Model Testing

In [None]:
input_text = "today is a sunny day"

tokenized = tokenizer(
    [input_text],
    return_tensors = 'pt'
).to(device)
out = peft_model.generate(**tokenized, max_length = 128)
print(out)

In [None]:
print(tokenizer.decode(out[0]))