In [None]:
!pip install -q -U accelerate==0.27.1
!pip install -q -U transformers==4.38.0

In [None]:
!pip install -q -U datasets==2.16.1

In [3]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay

In [4]:
checkpoint = "Mr-Vicky-01/English-Tamil-Translator"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def language_translator(text):
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    # model = AutoModelForSeq2SeqLM.from_pretrained("finetune-EN-to-Ta/")
    tokenized = tokenizer([text], return_tensors='pt')
    out = model.generate(**tokenized, max_length=128)
    return tokenizer.decode(out[0],skip_special_tokens=True)

In [None]:
text_to_translate = "i have to play football now!"
output = language_translator(text_to_translate)
print(output)

In [5]:
raw_datasets = load_dataset("Hemanth-thunder/en_ta")

In [6]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'en', 'ta'],
        num_rows: 285630
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'en', 'ta'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['Unnamed: 0', 'en', 'ta'],
        num_rows: 2000
    })
})

In [7]:
raw_datasets['train'][0]

{'Unnamed: 0': 0,
 'en': "MMA vice president Qazi Hussain Ahmad declared last month: 'We are not extremists.\n",
 'ta': 'MMA கட்சியின் துணைத்தலைவர் க்வாஸி ஹுசேன் அகமத் சென்ற மாதம் பின்வருமாறு அறிவித்தார்: ``நாங்கள் தீவிரவாதிகள் அல்ல.\n'}

In [8]:
max_input_length = 128
max_target_length = 128

source_lang = "en"
target_lang = "ta"


def preprocess_function(examples):
    inputs = [ex for ex in examples[source_lang]]
    targets = [ex for ex in examples[target_lang]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    # with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
preprocess_function(raw_datasets["train"][:1])

{'input_ids': [[128022, 100, 8087, 45944, 20120, 1315, 2956, 176, 9652, 40, 37513, 30298, 241, 11469, 119530, 9, 244, 41895, 4234, 4632, 25335, 33544, 5, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[128022, 100, 8087, 3000, 23198, 45900, 19363, 40323, 2895, 7128, 13378, 3000, 12037, 9110, 21477, 4685, 23618, 6133, 4378, 12882, 82091, 3068, 6285, 89996, 10093, 28215, 32018, 73075, 9704, 9429, 61507, 103297, 9, 22, 56797, 24332, 7960, 25706, 94292, 9110, 5174, 6699, 112439, 5, 2]]}

In [10]:
tokenized_test = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/285630 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [11]:
tokenized_test

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'en', 'ta', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 285630
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'en', 'ta', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['Unnamed: 0', 'en', 'ta', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [26]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model_args = Seq2SeqTrainingArguments(
    output_dir="finetuned-EN-to-Ta",  
    overwrite_output_dir=True,  
    do_train=True,  
    logging_dir="logs",  
    logging_steps=100,  # Increase logging frequency for more insights
    save_steps=5000000,  # Save model checkpoints more frequently
    save_total_limit=1,  # Keep more checkpoints
    num_train_epochs=1,  # Extend training to allow for more optimization
    per_device_train_batch_size=8,  # Increase batch size for more efficient training
    gradient_accumulation_steps=4,  # Increase gradient accumulation for stable training
    learning_rate=4e-5,  # Slightly lower learning rate to fine-tune more gently
    warmup_steps=2500,  # Increase warmup steps for better adaptation
    weight_decay=0.01,  
    adam_beta1=0.9,  
    adam_beta2=0.98,  
    adam_epsilon=1e-8,  
    lr_scheduler_type="linear",  
    predict_with_generate=True,  
    fp16=True,  
    seed=42,  
    report_to="wandb",  
    run_name="finetuned-EN-to-Ta",  
)

In [27]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [28]:
trainer = Seq2SeqTrainer(
    model,
    model_args,
    train_dataset=tokenized_test["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

Step,Training Loss
100,1.4088
200,1.1802
300,1.1014
400,1.0331
500,0.9521
600,0.8918
700,0.8184
800,0.7486


In [16]:
trainer.model.save_pretrained('finetune-English-to-Tamil')

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5}


In [19]:
model1 = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/finetune-English-to-Tamil")

In [20]:
def language_translator(text):
    # model = AutoModelForSeq2SeqLM.from_pretrained("finetune-EN-to-Ta/")
    tokenized = tokenizer([text], return_tensors='pt')
    out = model1.generate(**tokenized, max_length=128)
    return tokenizer.decode(out[0],skip_special_tokens=True)

In [25]:
# text_to_translate = raw_datasets["test"]["en"][5]
output = language_translator("ran out of memory")
print(output)

நினைவில்லாமல் ஓடினார்கள்!


In [None]:
raw_datasets["test"][5]