This notebook fine-tunes the NLLB 3.3B model for translation between Hindi and other Indian languages (Hindi <-> Indian language).

It uses QLoRA to fine-tune the model and needs 15 GB of VRAM for training.

Author: Surupendu Gangopadhyay


In [None]:
# Mount Google Drive at /content/drive (Google Colab only). The checkpoint and
# test-data paths configured later in this notebook point below this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install peft --quiet
!pip install accelerate --quiet
!pip install datasets --quiet
!pip install evaluate --quiet
!pip install sacrebleu --quiet
!pip install bitsandbytes --quiet
!pip install unbabel-comet --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fol

Pre-requisites before starting the training

In [None]:
# NLLB language codes of the source and target language; they differ for every language.
# See the NLLB paper (https://arxiv.org/pdf/2207.04672) for the full list of codes,
# which are given on pages 13, 14 and 15.

src_code = "hin_Deva"
tgt_code = "guj_Gujr"

# Folder with the raw HIMANGY text splits that are converted into Hugging Face format
# (change the folder name).
# The folder should contain the train, test and dev set.
src_path = "/media/sda2/Share/Surupendu/machine_translation/dataset/HIMANGY/hindi_splits/health/"

# Folder where the converted ".tsv" files are saved (change the folder name).
# The fine-tuning cell loads its dataset from this same folder.
save_path = "/media/sda2/Share/Surupendu/machine_translation/hugging_face/dataset/splits/"

# Output directory where the training checkpoints will be stored (change the folder name).
# NOTE(review): src_path/save_path are local "/media/..." paths while output_dir is on the
# Colab Drive mount -- confirm all three exist in the runtime that executes this notebook.
output_dir = "/content/drive/MyDrive/trained_models"

Scripts for storing the train, test and dev sets in Hugging Face format

In [None]:
import pandas as pd
import os

def create_file(src_path, save_path, file_name):
    """
      Convert one tab-separated parallel ".txt" file to the Huggingface ".tsv" format.

      Every input line holds a source sentence and a target sentence separated by a tab.
      The output file contains two headers: ids and translation.
      Each row in the translation field contains {"src_lang_code": <sent>, "tgt_lang_code": <sent>},
      keyed by the notebook-level src_code and tgt_code.

      Args:
        src_path: folder that contains the input file.
        save_path: folder where the ".tsv" file is written (created when missing).
        file_name: name of the input file; only its extension is replaced by ".tsv".
    """
    import csv  # local import: only needed to switch off quote handling below

    # Read every field as plain text. QUOTE_NONE keeps sentences containing '"' intact,
    # and keep_default_na=False stops strings such as "NA" (or empty fields) from
    # turning into NaN, which would break the rstrip() calls.
    df = pd.read_csv(
        os.path.join(src_path, file_name),
        names=[src_code, tgt_code],
        sep="\t",
        dtype=str,
        quoting=csv.QUOTE_NONE,
        keep_default_na=False,
        encoding="utf-8",
    )
    rows = [
        {src_code: src_line.rstrip(), tgt_code: tgt_line.rstrip()}
        for src_line, tgt_line in zip(df[src_code], df[tgt_code])
    ]
    ids = list(range(1, len(rows) + 1))
    out_df = pd.DataFrame({"ids": ids, "translation": rows})

    # Swap only the extension; str.replace("txt", "tsv") would also rewrite a "txt"
    # that happens to appear inside the stem of the file name.
    out_name = os.path.splitext(file_name)[0] + ".tsv"
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    out_df.to_csv(os.path.join(save_path, out_name), sep="\t", index=False)

In [None]:
# Convert every ".txt" split found in src_path (train, test and dev).
# os.listdir returns entries in arbitrary order and may list more or fewer than three,
# so iterate over a sorted listing instead of indexing files[0], files[1], files[2].
for file_name in sorted(os.listdir(src_path)):
    if file_name.endswith(".txt"):
        create_file(src_path, save_path, file_name)

Script for finetuning the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig
from datasets import load_dataset
import tqdm as tq
import numpy as np
import evaluate
import torch
import ast
import os

"""
  Import the libraries and metric for evaluating the validation set
"""

# sacreBLEU metric object; compute_metrics uses it to score the validation set.
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
def preprocess_function(src_code, tgt_code, max_length=100):
    """
      Build the batched preprocessing callback that is passed to `Dataset.map`.

      Args:
        src_code: key of the source sentence inside each "translation" entry.
        tgt_code: key of the target sentence inside each "translation" entry.
        max_length: length at which the tokenizer truncates (default 100).

      Returns:
        A function that takes a batch with a "translation" column and returns
        the output of the notebook-level tokenizer for that batch.
    """
    def preprocess(lines):
        # Every entry is the string form of a dict; parse each one only once.
        pairs = [ast.literal_eval(line) for line in lines["translation"]]
        src_lines = [pair[src_code] for pair in pairs]
        tgt_lines = [pair[tgt_code] for pair in pairs]
        # No padding or tensor conversion here: padding at this stage would leave pad
        # token ids inside the labels, so they would count towards the loss. The
        # seq2seq data collator pads each training batch and masks label padding.
        return tokenizer(src_lines, text_target=tgt_lines, max_length=max_length, truncation=True)
    return preprocess

def postprocess_text(preds, labels):
    """
      Postprocess the decoded predictions and labels.

      Args:
        preds: list of predicted sentences.
        labels: list of reference sentences.

      Returns:
        (preds, labels) where every prediction is stripped of surrounding whitespace
        and every label is stripped and wrapped in its own single-element list.
    """
    stripped_preds = [pred.strip() for pred in preds]
    wrapped_labels = [[label.strip()] for label in labels]
    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """
      Compute the BLEU score and the mean generated length over the validation set.

      Args:
        eval_preds: pair (preds, labels) of token-id arrays; preds may be a tuple
                    whose first element holds the token ids.

      Returns:
        Dict with "bleu" and "gen_len", both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and is not a valid token id, so replace it with the
    # pad token before decoding (for the predictions as well as for the labels).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and wrap every reference in its own list.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}


# bitsandbytes settings: load the weights in 4 bits (NF4, double quantization)
# and run the computation in bfloat16
config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer of NLLB 3.3B, set up with the source and target language codes
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-3.3B",
    use_fast=True,
    model_max_length=512,
    src_lang=src_code,
    tgt_lang=tgt_code,
)

# Seq2Seq model, loaded with the quantization settings defined above
model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-3.3B",
    quantization_config=config,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Read the train and validation splits from the folder with the converted ".tsv" files
path = save_path
train_dataset = load_dataset(path, split="train")
val_dataset = load_dataset(path, split="validation")

# Tokenize both splits batch-wise with the same preprocessing callback
preprocess = preprocess_function(src_code, tgt_code)
train_tokenized_dataset = train_dataset.map(preprocess, batched=True)
val_tokenized_dataset = val_dataset.map(preprocess, batched=True)

# Collator that assembles the seq2seq batches
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# LoRA configuration: rank 8, adapters on the q_proj, k_proj, v_proj and o_proj modules
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    task_type="SEQ_2_SEQ_LM",
)

# NOTE(review): QLoRA setups commonly call peft.prepare_model_for_kbit_training(model)
# before get_peft_model -- confirm whether leaving it out is intentional.
model = get_peft_model(model, peft_config)

# Training arguments: 5 epochs, evaluation after every epoch, BLEU computed on generated text
# NOTE(review): fp16=True here while the quantization config computes in bfloat16 -- confirm.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_total_limit=5,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

# Trainer that ties together the model, the data and the metric
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning
trainer.train()



Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4899 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,8.5374,5.305053,35.3394,37.408
2,5.9569,5.008707,35.3458,37.5
3,5.7898,4.951418,35.3839,37.696
4,5.6415,4.93323,35.8188,37.754
5,5.6082,4.92897,36.0378,37.736


TrainOutput(global_step=3065, training_loss=6.1863394757468395, metrics={'train_runtime': 4896.2757, 'train_samples_per_second': 5.003, 'train_steps_per_second': 0.626, 'total_flos': 4.1551817416704e+16, 'train_loss': 6.1863394757468395, 'epoch': 5.0})

Script for testing the finetuned model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from comet import download_model, load_from_checkpoint
from sacrebleu.metrics import BLEU, CHRF
from torch import cuda
import pandas as pd
import tqdm as tq
import ast
import os

# Path of the checkpoint folder that holds the LoRA weights.
# Pick the checkpoint folder with the highest step number.
lora_wgt_path = "/content/drive/MyDrive/trained_models/checkpoint-3065"

# Folder where the test data is present
data_path = "/content/drive/MyDrive/data_splits/"

# File name of the test set (a ".tsv" file with a "translation" column)
file_name = "hi_gu_hlt_test.tsv"

# Path and file name where the predictions will be stored
save_pred_path = "/content/drive/MyDrive/data_splits/pred.txt"

# Whether to load the LoRA weights (True or False).
# With False the model is evaluated without the weights from lora_wgt_path.
use_adapter = False

def get_comet_score(srcs, hyps, refs):
    """
      Calculate the COMET score with the "Unbabel/wmt22-comet-da" model.

      Args:
        srcs: list of source sentences.
        hyps: list of translated sentences (hypotheses).
        refs: list of reference sentences.

      Returns:
        The second element of the prediction output (printed as the COMET score).
    """
    model_path = download_model("Unbabel/wmt22-comet-da")
    comet_model = load_from_checkpoint(model_path)
    data = [{"src": src, "mt": hyp, "ref": ref} for src, hyp, ref in zip(srcs, hyps, refs)]
    # Use one GPU when CUDA is available, otherwise fall back to the CPU.
    gpus = 1 if cuda.is_available() else 0
    comet_score = comet_model.predict(data, batch_size=8, gpus=gpus)
    return comet_score[1]


# Tokenizer and model used for testing (no quantization config is passed here)
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-3.3B",
    use_fast=True,
    src_lang=src_code,
    model_max_length=512,
)
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")

# Attach the LoRA weights only when requested
if use_adapter:
    model.load_adapter(lora_wgt_path)

# Move the model to the GPU
model.to("cuda")

def load_file(path, file_name):
    """
      Load the test data and return two lists containing the source lines and reference lines.

      Args:
        path: folder that contains the ".tsv" file.
        file_name: name of the tab-separated file; it must have a "translation" column
                   whose entries are string forms of dicts keyed by src_code and tgt_code.

      Returns:
        (src_lines, refs): source sentences and reference sentences, right-stripped.
    """
    df = pd.read_csv(os.path.join(path, file_name), sep="\t")
    # Parse every row only once, then pick both languages from the parsed dict.
    pairs = [ast.literal_eval(line) for line in df["translation"].values]
    src_lines = [pair[src_code].rstrip() for pair in pairs]
    refs = [pair[tgt_code].rstrip() for pair in pairs]
    return src_lines, refs

# Load the test set from data_path. (`path` is only defined by the fine-tuning cell
# and points at the training data folder, not at the test data folder.)
src_lines, refs = load_file(data_path, file_name)

# Token id that forces generation to start in the target language; it is the same
# for every batch, so look it up once.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_code)

# Generate the translation in batches of 10 sentences
pred_lines = []
for i in tq.tqdm(range(0, len(src_lines), 10)):
    lines = [line.rstrip() for line in src_lines[i:i+10]]
    inputs = tokenizer(lines, return_tensors="pt", max_length=512, padding=True, truncation=True)
    inputs = inputs.to("cuda")
    translated_tokens = model.generate(**inputs, max_length=512, forced_bos_token_id=forced_bos_token_id)
    translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    pred_lines.extend(translated_text)

# Save the prediction; the context manager closes the file so every line is flushed,
# and UTF-8 is set explicitly because the text is not ASCII.
with open(save_pred_path, "w", encoding="utf-8") as fp:
    for line in pred_lines:
        fp.write(line + "\n")

bleu = BLEU()
chrf = CHRF()

bleu_score = bleu.corpus_score(pred_lines, [refs])
chrf_score = chrf.corpus_score(pred_lines, [refs])
comet_score = get_comet_score(src_lines, pred_lines, refs)

# Print the scores
print("BLEU Score: {:}".format(bleu_score.score))
print("CHRF Score: {:}".format(chrf_score.score))
print("COMET Score: {:}".format(comet_score))


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 50/50 [03:55<00:00,  4.72s/it]


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  self.pid = os.fork()
Predicting DataLoader 0: 100%|██████████| 63/63 [00:07<00:00,  8.74it/s]


BLEU Score: 32.23375186095481
CHRF Score: 60.26529727476214
COMET Score: 0.8286275290250779
