In [1]:

import random
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)


import os
import torch
from transformers import (
    MarianMTModel, 
    MarianTokenizer,
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import Dataset, load_metric,load_dataset
import evaluate





In [2]:
if torch.cuda.is_available():
    print("GPU is available and ready!")
else: 
    print("Please enable GPU in Runtime settings.")

GPU is available and ready!


In [None]:
import os


dataset_path = "Tamil-English-Dataset-master"  


dataset_files = os.listdir(dataset_path)

print("Files in Dataset Directory:")
print(dataset_files)


Files in Dataset Directory:
['Dataset', 'README.md']


In [None]:

dataset_folder_path = os.path.join(dataset_path, "Dataset")


dataset_folder_files = os.listdir(dataset_folder_path)

print("Files in the 'Dataset' Folder:")
print(dataset_folder_files)

Files in the 'Dataset' Folder:
['aligned_english.txt', 'aligned_tamil.txt', 'data.en1', 'data.en2', 'data.en3', 'data.en4', 'data.en5', 'data.en6', 'data.ta1', 'data.ta2', 'data.ta3', 'data.ta4', 'data.ta5', 'data.ta6', 'merged_english.txt', 'merged_tamil.txt', 'test_english.txt', 'test_tamil.txt', 'train_english.txt', 'train_tamil.txt', 'val_english.txt', 'val_tamil.txt']


In [None]:

def read_file_sample(file_path, num_lines=5):
    print(f"Reading {file_path}:")
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= num_lines:
                break
            print(line.strip())

tamil_file = os.path.join(dataset_folder_path, "data.ta1")  
english_file = os.path.join(dataset_folder_path, "data.en1")  


In [None]:

read_file_sample(tamil_file)
read_file_sample(english_file)

Reading Tamil-English-Dataset-master\Dataset\data.ta1:
ராஜாவாகிய ஆகாஸ் அரசாளும்போது தம்முடைய பாதகத்தினால் எறிந்துபோட்ட சகல பணிமுட்டுகளையும் முஸ்திப்பாக்கிப் பரிசுத்தம்பண்ணினோம்; இதோ , அவைகள் கர்த்தரின் ஆலயத்திற்கு முன்பாக இருக்கிறது என்றார்கள் .
சர்வதேச நாணய நிதியம் இலங்கைக்கு கடன் வழங்கினால் இதே போன்ற நிபந்தனைகள் திணிக்கப்படும் .
தற்போது அதற்கு எதிராக வாதாடுகிறார் சர்வதேச சட்டத்தை செயல்படுத்துவதற்குப் பதிலாக புதிய சட்டம் உருவாக்கப்பட்டு நிறுவப்பட வேண்டும் என்று எழுதுகிறார் .
அமெரிக்காவின் மூன்றாம் பெரிய கார் தயாரிப்பு நிறுவனமான கிறைஸ்லர் வியாழனன்று நியூ யோர்க்கில் திவாலடைந்ததற்காக மனு செய்தது; அத்தியாயம் 11 ன் படி மறு சீரமைத்து வெளிவரும் வரை அது தன்னுடைய உற்பத்தி நிலையங்களை மூடும் என்றும் அறிவித்துள்ளது .
மேலும் இனைவிட்டு தலிபானால் வெளியேற்றப்பட்ட 1995 இல் இருந்து ஈரானில் கூடுதலாக வாழ்ந்துவந்துள்ளார் .
Reading Tamil-English-Dataset-master\Dataset\data.en1:
moreover all the vessels , which king ahaz in his reign did cast away in his transgression , have we prepared and sanctified , 

In [None]:

def merge_files(input_files, output_file):
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file in input_files:
            with open(file, 'r', encoding='utf-8') as infile:
                for line in infile:
                    outfile.write(line.strip() + '\n')

merged_tamil_file = os.path.join(dataset_folder_path, "merged_tamil.txt")
merged_english_file = os.path.join(dataset_folder_path, "merged_english.txt")

In [None]:


tamil_files = [os.path.join(dataset_folder_path, f) for f in dataset_folder_files if f.startswith("data.ta")]
english_files = [os.path.join(dataset_folder_path, f) for f in dataset_folder_files if f.startswith("data.en")]

merge_files(tamil_files, merged_tamil_file)
merge_files(english_files, merged_english_file)

print(f"Tamil sentences merged into: {merged_tamil_file}")
print(f"English sentences merged into: {merged_english_file}")

Tamil sentences merged into: Tamil-English-Dataset-master\Dataset\merged_tamil.txt
English sentences merged into: Tamil-English-Dataset-master\Dataset\merged_english.txt


In [None]:

tamil_line_count = sum(1 for line in open(merged_tamil_file, 'r', encoding='utf-8'))
english_line_count = sum(1 for line in open(merged_english_file, 'r', encoding='utf-8'))

print(f"Number of lines in Tamil file: {tamil_line_count}")
print(f"Number of lines in English file: {english_line_count}")

Number of lines in Tamil file: 289451
Number of lines in English file: 289451


In [None]:

aligned_tamil_file = "Tamil-English-Dataset-master/Dataset/aligned_tamil.txt"
aligned_english_file = "Tamil-English-Dataset-master/Dataset/aligned_english.txt"


def align_files(tamil_path, english_path, output_tamil, output_english):
    with open(tamil_path, 'r', encoding='utf-8') as tamil_file, \
         open(english_path, 'r', encoding='utf-8') as english_file:
        tamil_lines = tamil_file.readlines()
        english_lines = english_file.readlines()


    min_lines = min(len(tamil_lines), len(english_lines))
    aligned_tamil_lines = tamil_lines[:min_lines]
    aligned_english_lines = english_lines[:min_lines]


    with open(output_tamil, 'w', encoding='utf-8') as tamil_out, \
         open(output_english, 'w', encoding='utf-8') as english_out:
        tamil_out.writelines(aligned_tamil_lines)
        english_out.writelines(aligned_english_lines)

    print(f"Aligned Tamil file saved to: {output_tamil}")
    print(f"Aligned English file saved to: {output_english}")
    print(f"Number of aligned sentences: {min_lines}")

In [None]:

align_files(merged_tamil_file, merged_english_file, aligned_tamil_file, aligned_english_file)

Aligned Tamil file saved to: Tamil-English-Dataset-master/Dataset/aligned_tamil.txt
Aligned English file saved to: Tamil-English-Dataset-master/Dataset/aligned_english.txt
Number of aligned sentences: 289451


In [None]:

aligned_tamil_file = "Tamil-English-Dataset-master/Dataset/aligned_tamil.txt"
aligned_english_file = "Tamil-English-Dataset-master/Dataset/aligned_english.txt"


train_tamil_file = "Tamil-English-Dataset-master/Dataset/train_tamil.txt"
train_english_file = "Tamil-English-Dataset-master/Dataset/train_english.txt"
val_tamil_file = "Tamil-English-Dataset-master/Dataset/val_tamil.txt"
val_english_file = "Tamil-English-Dataset-master/Dataset/val_english.txt"
test_tamil_file = "Tamil-English-Dataset-master/Dataset/test_tamil.txt"
test_english_file = "Tamil-English-Dataset-master/Dataset/test_english.txt"

def normalize_text(lines):
    normalized = []
    for line in lines:
        line = line.strip()  
        line = line.replace("“", '"').replace("”", '"')  
        line = line.replace("‘", "'").replace("’", "'")  
        normalized.append(line)
    return normalized

with open(aligned_tamil_file, 'r', encoding='utf-8') as tamil_file, \
     open(aligned_english_file, 'r', encoding='utf-8') as english_file:
    tamil_lines = normalize_text(tamil_file.readlines())
    english_lines = normalize_text(english_file.readlines())


data = list(zip(tamil_lines, english_lines))
random.shuffle(data)

train_size = int(0.8 * len(data))
val_size = int(0.1 * len(data))

train_data = data[:train_size]
val_data = data[train_size:train_size + val_size]
test_data = data[train_size + val_size:]

def write_data(file_path, data, index):
    with open(file_path, 'w', encoding='utf-8') as file:
        for pair in data:
            file.write(pair[index] + '\n')

write_data(train_tamil_file, train_data, 0)
write_data(train_english_file, train_data, 1)
write_data(val_tamil_file, val_data, 0)
write_data(val_english_file, val_data, 1)
write_data(test_tamil_file, test_data, 0)
write_data(test_english_file, test_data, 1)

print("Data split completed!")
print(f"Training Tamil: {train_tamil_file}")
print(f"Training English: {train_english_file}")
print(f"Validation Tamil: {val_tamil_file}")
print(f"Validation English: {val_english_file}")
print(f"Test Tamil: {test_tamil_file}")
print(f"Test English: {test_english_file}")

Data split completed!
Training Tamil: Tamil-English-Dataset-master/Dataset/train_tamil.txt
Training English: Tamil-English-Dataset-master/Dataset/train_english.txt
Validation Tamil: Tamil-English-Dataset-master/Dataset/val_tamil.txt
Validation English: Tamil-English-Dataset-master/Dataset/val_english.txt
Test Tamil: Tamil-English-Dataset-master/Dataset/test_tamil.txt
Test English: Tamil-English-Dataset-master/Dataset/test_english.txt


In [17]:
!huggingface-cli login <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>

#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

The syntax of the command is incorrect.


In [None]:
MODEL_NAME = "Helsinki-NLP/opus-mt-ta-en" 
OUTPUT_DIR = "./tamil_english_translation_model"
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


https://stackoverflow.com/questions/70043467/how-to-run-huggingface-helsinki-nlp-models

https://huggingface.co/Helsinki-NLP/opus-mt-es-en

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")





In [15]:
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(58101, 512, padding_idx=58100)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(58101, 512, padding_idx=58100)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [None]:
BATCH_SIZE = 4  
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3


train_tamil_file = "Tamil-English-Dataset-master/Dataset/train_tamil.txt"
train_english_file = "Tamil-English-Dataset-master/Dataset/train_english.txt"
val_tamil_file = "Tamil-English-Dataset-master/Dataset/val_tamil.txt"
val_english_file = "Tamil-English-Dataset-master/Dataset/val_english.txt"

In [None]:

def load_translation_dataset(train_tamil, train_english, val_tamil, val_english):
    def read_lines(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]
    
    train_tamil_lines = read_lines(train_tamil)
    train_english_lines = read_lines(train_english)
    val_tamil_lines = read_lines(val_tamil)
    val_english_lines = read_lines(val_english)
    

    train_dataset = Dataset.from_dict({
        "translation": [
            {"ta": tamil, "en": english} 
            for tamil, english in zip(train_tamil_lines, train_english_lines)
        ]
    })
    
    val_dataset = Dataset.from_dict({
        "translation": [
            {"ta": tamil, "en": english} 
            for tamil, english in zip(val_tamil_lines, val_english_lines)
        ]
    })

    
    
    return train_dataset, val_dataset

In [None]:

train_dataset, val_dataset = load_translation_dataset(
    train_tamil_file, 
    train_english_file, 
    val_tamil_file, 
    val_english_file
)

In [None]:

def preprocess_function(examples):
    inputs = [ex["ta"] for ex in examples["translation"]]
    targets = [ex["en"] for ex in examples["translation"]]
    
    model_inputs = tokenizer(
        inputs, 
        max_length=128, 
        truncation=True, 
        padding="max_length"
    )
    

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, 
            max_length=128, 
            truncation=True, 
            padding="max_length"
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:

tokenized_train = train_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=train_dataset.column_names
)
tokenized_val = val_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=val_dataset.column_names
)

Map:   0%|          | 0/231560 [00:00<?, ? examples/s]



Map:   0%|          | 0/28945 [00:00<?, ? examples/s]

In [9]:
metric = evaluate.load("sacrebleu")

In [31]:
metric

EvaluationModule(name: "sacrebleu", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    smooth_method (`str`): The smoothing method to use, defaults to `'e

In [None]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # BLEU score 
    result = metric.compute(
        predictions=decoded_preds, 
        references=[[label] for label in decoded_labels]
    )
    return {"bleu": result["score"]}

In [None]:

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    predict_with_generate=True,
    fp16=torch.cuda.is_available()
)




In [None]:

data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model, 
    padding=True
)

In [None]:

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:

trainer.train()
 


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdanushidk507[0m ([33mdanushidk[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

  0%|          | 0/173670 [00:00<?, ?it/s]

{'loss': 6.8607, 'grad_norm': 69.43547821044922, 'learning_rate': 2.0000000000000002e-07, 'epoch': 0.0}
{'loss': 6.4719, 'grad_norm': 77.29963684082031, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.0}
{'loss': 5.8922, 'grad_norm': 64.58267211914062, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 5.1848, 'grad_norm': 82.10954284667969, 'learning_rate': 1.4000000000000001e-06, 'epoch': 0.0}
{'loss': 3.7672, 'grad_norm': 60.900882720947266, 'learning_rate': 1.8000000000000001e-06, 'epoch': 0.0}
{'loss': 2.6754, 'grad_norm': 21.2751407623291, 'learning_rate': 2.2e-06, 'epoch': 0.0}
{'loss': 2.6156, 'grad_norm': 7.207292079925537, 'learning_rate': 2.6e-06, 'epoch': 0.0}
{'loss': 2.0553, 'grad_norm': 4.838811874389648, 'learning_rate': 3e-06, 'epoch': 0.0}
{'loss': 2.0039, 'grad_norm': 12.880924224853516, 'learning_rate': 3.4000000000000005e-06, 'epoch': 0.0}
{'loss': 1.8685, 'grad_norm': 4.319309234619141, 'learning_rate': 3.8000000000000005e-06, 'epoch': 0.0}
{'loss'

  0%|          | 0/7237 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}


{'eval_loss': 0.888680100440979, 'eval_bleu': 4.581187335944809, 'eval_runtime': 54971.4931, 'eval_samples_per_second': 0.527, 'eval_steps_per_second': 0.132, 'epoch': 1.0}
{'loss': 0.8597, 'grad_norm': 2.9155008792877197, 'learning_rate': 1.3373563550268522e-05, 'epoch': 1.0}
{'loss': 0.8494, 'grad_norm': 2.9210779666900635, 'learning_rate': 1.3372408615811055e-05, 'epoch': 1.0}
{'loss': 1.048, 'grad_norm': 3.1685376167297363, 'learning_rate': 1.3371253681353584e-05, 'epoch': 1.0}
{'loss': 0.9247, 'grad_norm': 3.434953212738037, 'learning_rate': 1.3370098746896115e-05, 'epoch': 1.0}
{'loss': 0.9478, 'grad_norm': 2.806818723678589, 'learning_rate': 1.3368943812438645e-05, 'epoch': 1.0}
{'loss': 0.9594, 'grad_norm': 2.968289375305176, 'learning_rate': 1.3367788877981176e-05, 'epoch': 1.0}
{'loss': 0.7778, 'grad_norm': 2.2733330726623535, 'learning_rate': 1.3366633943523705e-05, 'epoch': 1.0}
{'loss': 0.8797, 'grad_norm': 2.773956537246704, 'learning_rate': 1.3365479009066236e-05, 'epoch

  0%|          | 0/7237 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}


{'eval_loss': 0.761432409286499, 'eval_bleu': 8.80836890337671, 'eval_runtime': 29373.9189, 'eval_samples_per_second': 0.985, 'eval_steps_per_second': 0.246, 'epoch': 2.0}
{'loss': 0.6979, 'grad_norm': 2.205644369125366, 'learning_rate': 6.69007333833805e-06, 'epoch': 2.0}
{'loss': 0.7122, 'grad_norm': 3.1897048950195312, 'learning_rate': 6.68891840388058e-06, 'epoch': 2.0}
{'loss': 0.8986, 'grad_norm': 2.4829680919647217, 'learning_rate': 6.687763469423111e-06, 'epoch': 2.0}
{'loss': 1.0338, 'grad_norm': 2.642089366912842, 'learning_rate': 6.686608534965642e-06, 'epoch': 2.0}
{'loss': 0.8896, 'grad_norm': 2.6920394897460938, 'learning_rate': 6.6854536005081715e-06, 'epoch': 2.0}
{'loss': 0.7803, 'grad_norm': 2.4153690338134766, 'learning_rate': 6.684298666050702e-06, 'epoch': 2.0}
{'loss': 0.854, 'grad_norm': 2.0676066875457764, 'learning_rate': 6.683143731593232e-06, 'epoch': 2.0}
{'loss': 0.8506, 'grad_norm': 3.3116519451141357, 'learning_rate': 6.681988797135763e-06, 'epoch': 2.0}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}


{'loss': 0.7389, 'grad_norm': 2.0653605461120605, 'learning_rate': 8.315528093780679e-09, 'epoch': 3.0}


  0%|          | 0/7237 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}


{'eval_loss': 0.7282079458236694, 'eval_bleu': 9.95327477720364, 'eval_runtime': 11297.2064, 'eval_samples_per_second': 2.562, 'eval_steps_per_second': 0.641, 'epoch': 3.0}
{'train_runtime': 143562.5495, 'train_samples_per_second': 4.839, 'train_steps_per_second': 1.21, 'train_loss': 0.9102490701185042, 'epoch': 3.0}


TrainOutput(global_step=173670, training_loss=0.9102490701185042, metrics={'train_runtime': 143562.5495, 'train_samples_per_second': 4.839, 'train_steps_per_second': 1.21, 'total_flos': 2.354851614818304e+16, 'train_loss': 0.9102490701185042, 'epoch': 3.0})

In [None]:

trainer.save_model(OUTPUT_DIR)

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}
