## Imports..


In [None]:
! pip install -q transformers[torch] datasets

In [3]:
from datasets import load_dataset
from transformers import pipeline, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer

## Prepare Data

In [27]:
hin_en_data = load_dataset("hind_encorp", split="train")
hin_en_data


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset({
    features: ['id', 'source', 'alignment_type', 'alignment_quality', 'translation'],
    num_rows: 273885
})

In [5]:
hin_en_data[:10]["translation"]

[{'en': 'Sharaabi', 'hi': 'शराबी'},
 {'en': 'politicians do not have permission to do what needs to be done.',
  'hi': 'राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .'},
 {'en': "I'd like to tell you about one such child,",
  'hi': 'मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,'},
 {'en': 'This percentage is even greater than the percentage in India.',
  'hi': 'यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।'},
 {'en': '- John Collins', 'hi': '- जॉन कॉलिन्स'},
 {'en': "what we really mean is that they're bad at not paying attention.",
  'hi': 'हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते'},
 {'en': '%{APPNAME} would like to send notifications, but you need to be signed in to Chrome.',
  'hi': '%{APPNAME} सूचनाएं भेजना चाहता है, लेकिन आपको Chrome में साइन इन होना होगा.'},
 {'en': 'Important Messages', 'hi': 'महत्वपूर्ण संदेश'},
 {'en': "User authentication required for VPN connection '%s'...",
  'hi': "उपयोक्ता सत्यापन VPN संबंधन '%s' के लिए जरूरी है..."},


In [35]:
hin_en_data = hin_en_data.train_test_split(test_size =0.2)
hin_en_data_train = hin_en_data["train"]
hin_en_data_test_val = hin_en_data["test"].train_test_split(0.5)
hin_en_data_test = hin_en_data_test_val["train"]
hin_en_data_dev = hin_en_data_test_val["test"]

hin_en_data_train, hin_en_data_test,hin_en_data_dev

from datasets import DatasetDict
split_dataset = DatasetDict({"train": hin_en_data["train"], "validation": hin_en_data_test_val["train"], "dev": hin_en_data_test_val["test"]})
split_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'alignment_type', 'alignment_quality', 'translation'],
        num_rows: 219108
    })
    validation: Dataset({
        features: ['id', 'source', 'alignment_type', 'alignment_quality', 'translation'],
        num_rows: 27388
    })
    dev: Dataset({
        features: ['id', 'source', 'alignment_type', 'alignment_quality', 'translation'],
        num_rows: 27389
    })
})

In [36]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'alignment_type', 'alignment_quality', 'translation'],
        num_rows: 219108
    })
    validation: Dataset({
        features: ['id', 'source', 'alignment_type', 'alignment_quality', 'translation'],
        num_rows: 27388
    })
    dev: Dataset({
        features: ['id', 'source', 'alignment_type', 'alignment_quality', 'translation'],
        num_rows: 27389
    })
})

In [8]:
!pip install evaluate rouge_score




## Playground with model

In [10]:
!pip install sentencepiece



In [22]:
model_chk = "Helsinki-NLP/opus-mt-en-hi"
translator = pipeline("translation", model=model_chk)
translator("This is a default behaviour")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[{'translation_text': 'यह तयशुदा व्यवहार है'}]

## Preprocessing the input

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_chk, return_tensors="pt")

In [13]:
# an example tokenization -
en_sent = hin_en_data_train[9]["translation"]["en"]
hin_tran = hin_en_data_train[9]["translation"]["hi"]
tkn_in = tokenizer(en_sent, text_target=hin_tran)
tkn_in

{'input_ids': [653, 20566, 72, 144, 4384, 2141, 23, 26, 932, 4575, 23479, 8, 144, 32720, 54, 72, 4, 8260, 37740, 16, 10, 3098, 8260, 11479, 44, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [356, 27275, 10265, 15, 38, 7002, 5, 25, 217, 932, 2777, 4240, 14749, 4430, 1194, 1231, 10523, 130, 14749, 879, 4807, 32925, 12, 28, 44, 3, 0]}

In [14]:
# to see the target tokenizer
with tokenizer.as_target_tokenizer():
  print(tokenizer(["मैने कहा, “हाँ, हमें दूसरी क्लासों को दिखाना है"]))

{'input_ids': [[4940, 140, 2, 31, 5755, 2, 185, 1049, 6057, 86, 18, 3201, 5, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}




In [15]:
wrong_targets = tokenizer(hin_tran)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(tkn_in["labels"]))

['▁', 'ह', 'म', 'ा', 'र', 'ी', '▁', 'वि', 'ग', 'त', '▁', 'र', 'ा', 'ज', 'न', 'ी', 'ति', '▁', 'क', 'ी', '▁', 'ए', 'क', '▁', 'वि', 'र', 'ा', 'सत', '▁', 'है', '▁', 'कि', '▁', 'ह', 'म', 'ा', 'र', 'े', '▁10', '▁', 'प', '्', 'र', '<unk>', '▁', 'स', 'ं', 'ग', '्', 'र', 'ह', '▁', 'ब', '्', 'र', 'ि', 'ट', 'िश', '▁', 'द', '्', 'व', 'ी', 'प', '▁', 'स', 'म', '<unk>', '▁', '<unk>', 'ा', '▁', 'प', 'ू', 'र', '्', '<unk>', 'र', '्', 'त', 'ी', '▁', 'ब', '्', 'र', 'ि', 'ट', 'िश', '▁', 'स', 'ा', 'म', 'ा', 'ज', '्', 'य', '▁', 'से', '▁', 'है', 'ं', '▁', '.', '</s>']
['▁हमारी', '▁विगत', '▁राजनीति', '▁की', '▁एक', '▁विरासत', '▁है', '▁कि', '▁हमारे', '▁10', '▁प्रतिशत', '▁संग्रह', '▁ब्रिटिश', '▁द्वीप', '▁समूह', '▁तथा', '▁पूर्ववर्त', 'ी', '▁ब्रिटिश', '▁सा', 'मा', 'ज्य', '▁से', '▁हैं', '▁', '.', '</s>']


In [16]:
src_lang, tgt_lang = "en", "hi"
max_input_len, max_target_len = 128, 128

def preprocess_data(examples):
  inputs = [ex[src_lang] for ex in examples["translation"]]
  targets = [ex[tgt_lang] for ex in examples["translation"]]
  model_inputs = tokenizer(inputs, max_length=max_input_len, truncation=True)
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets, max_length=max_target_len, truncation=True)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [17]:
preprocess_data(hin_en_data_train[:2])

{'input_ids': [[3569, 74, 1137, 1406, 1857, 292, 7, 2953, 19, 2577, 14565, 21, 4, 9774, 8, 371, 14702, 3865, 44, 3, 0], [5112, 7, 3346, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]], 'labels': [[141, 48096, 69, 10661, 4644, 1185, 8111, 9333, 1185, 18872, 69, 24218, 678, 4678, 12, 140, 25, 214, 499, 51284, 48096, 15, 5550, 11, 38, 2180, 22827, 22685, 245, 44, 3, 0], [3437, 33, 2553, 0]]}

In [38]:
# So, our tokenized dataset will be :
tokenized_dataset = split_dataset.map(preprocess_data, batched=True, remove_columns=split_dataset["train"].column_names)


In [23]:
# apna model
model = AutoModelForSeq2SeqLM.from_pretrained(model_chk)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [25]:
from transformers import DataCollatorForSeq2Seq, AdamWeightDecay
batch_size = 16
lr = 2e-5
weight_decay = 0.01
num_train_epochs = 1

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model,return_tensors="pt")

generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model,return_tensors="pt", pad_to_multiple_of=128)

In [40]:
batch = data_collator([tokenized_dataset["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

## Full Finetuning...

In [43]:
# finetuning
training_args = Seq2SeqTrainingArguments(
    output_dir="./my_fine_tuned_helsinki_hindi_translation_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    optim="adamw_torch",
    fp16=True,
)


# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["validation"],
#     # compute_metrics=compute_metrics,
# )
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.3688,3.182079


TrainOutput(global_step=13695, training_loss=3.535065390289853, metrics={'train_runtime': 1852.6468, 'train_samples_per_second': 118.268, 'train_steps_per_second': 7.392, 'total_flos': 3949398921314304.0, 'train_loss': 3.535065390289853, 'epoch': 1.0})

In [45]:
model_path = f'./LLMs/'

trainer.save_model(model_path)
trainer.save_state()


In [53]:
finetuned_model_path = "./my_fine_tuned_helsinki_hindi_translation_model"
tokenizer.save_pretrained(finetuned_model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_chk, return_tensors="pt")
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(finetuned_model_path)

## Testing our FineTuned model!

In [54]:
stmt = "I like this exercise very much."
inputs = tokenizer(stmt, return_tensors="pt", truncation=True, padding=True)
prediction = fine_tuned_model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

In [59]:
import pandas as pd
pdf = pd.DataFrame(
    zip(stmt, tokenizer.batch_decode(prediction, skip_special_tokens=True)),
    columns=["id", "translation"],
)
display(pdf)

Unnamed: 0,id,translation
0,I,मैं इस अभ्यास को बहुत ज्यादा पसंद करता हूँ.


In [61]:
tokenizer_ft = AutoTokenizer.from_pretrained(finetuned_model_path)
translator_ft = pipeline("translation", model=fine_tuned_model, tokenizer=tokenizer_ft)
translator_ft("I like this")



[{'translation_text': 'मुझे ये पसंद है'}]