In [2]:

from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch
from sklearn.model_selection import train_test_split
import evaluate
import logging
import os

In [6]:
# Registry of the source corpus files, keyed by a short label.
# Paths are relative to the notebook's working directory; the file names
# (including their embedded spaces) are reproduced exactly as they appear on disk.
dataset_paths = dict(
    bcn_dev="corpus.bcn.dev 1k.csv",
    bcn_test="corpus.bcn.test 2k.csv",
    bcn_train="corpus.bcn.train 166k.csv",
    en_ta="en-ta 118k.csv",
    general="general_en_ta 87k.csv",
    ie_tech="ie_tech 18k.csv",
    ij_2021="ij_2021_v1 816k .csv",
    parallel_gloss="parallel 8k gloss.xlsx",
)


In [3]:

def load_data(file_path):
    """Read a tabular corpus file into a pandas DataFrame.

    The reader is picked from the file extension, which is compared
    case-insensitively so that ``corpus.CSV`` is handled like ``corpus.csv``.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of a ``.csv`` or ``.xlsx`` file.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, using the default options of
        ``pd.read_csv`` / ``pd.read_excel``.

    Raises
    ------
    ValueError
        If the path ends in neither ``.csv`` nor ``.xlsx``.
    """
    # os.fspath accepts both plain strings and pathlib.Path objects.
    path_text = os.fspath(file_path)
    lowered = path_text.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(path_text)
    if lowered.endswith('.xlsx'):
        return pd.read_excel(path_text)
    # Name the offending path so a failure inside a multi-file load is traceable.
    raise ValueError(f"Unsupported file format: {path_text!r}")

In [4]:

def combine_datasets(dataset_paths):
    """Load every file in ``dataset_paths`` and stack the results row-wise.

    Parameters
    ----------
    dataset_paths : dict
        Maps a label to the path of the file to load; iterated in dict order.

    Returns
    -------
    pandas.DataFrame
        All loaded frames concatenated with a fresh 0..n-1 index. Columns are
        the union of the input columns (``pd.concat`` default), so a column
        missing from one file is NaN for that file's rows.
    """
    def _announce_and_load(label, source):
        # Progress line is printed before the (possibly slow) read starts.
        print(f"Loading {label} from {source}")
        return load_data(source)

    frames = [
        _announce_and_load(label, source)
        for label, source in dataset_paths.items()
    ]
    return pd.concat(frames, ignore_index=True)



In [14]:
def preprocess_data(data, random_state=42):
    """Extract the sentence pairs, drop incomplete rows and shuffle them.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'en'`` and ``'ta'``; any other columns are
        discarded. The input frame is not modified.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample`` so the shuffle (and everything
        derived from the saved file) is reproducible. Pass ``None`` for a
        different order on every call.

    Returns
    -------
    pandas.DataFrame
        Columns ``'english'`` and ``'tamil'``, no missing values, rows in
        shuffled order with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``'en'`` or ``'ta'`` is missing from ``data``.
    """
    return (
        data[['en', 'ta']]
        .rename(columns={'en': 'english', 'ta': 'tamil'})
        .dropna()
        # frac=1 returns every row exactly once, i.e. a permutation.
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )


In [None]:

# Load every corpus listed in dataset_paths and stack them into one frame.
# NOTE(review): the recorded preview of `data` shows extra columns ('0', '1',
# 'Ranaviru Sewa Authority', 'Unnamed: 2', ...) whose rows are NaN in
# 'en'/'ta'. That suggests some source files use different headers (or none)
# — confirm, because preprocess_data keeps only 'en'/'ta' and drops every row
# where either is missing.
data = combine_datasets(dataset_paths)


Loading bcn_dev from corpus.bcn.dev 1k.csv
Loading bcn_test from corpus.bcn.test 2k.csv
Loading bcn_train from corpus.bcn.train 166k.csv
Loading en_ta from en-ta 118k.csv
Loading general from general_en_ta 87k.csv
Loading ie_tech from ie_tech 18k.csv
Loading ij_2021 from ij_2021_v1 816k .csv
Loading parallel_gloss from parallel 8k gloss.xlsx


In [13]:
# Display the combined frame as this cell's output.
data

Unnamed: 0.1,Unnamed: 0,en,ta,0,1,Ranaviru Sewa Authority,ரணவிரு சேவை அதிகார சபை,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0.0,"Now the sons of Saul were Jonathan, and Ishui,...","சவுலுக்கு இருந்த குமாரர்: யோனத்தான், இஸ்வி, மல...",,,,,,,
1,1.0,Ahmadinejad obliquely referred to 'war crimina...,"அஹமதினேஜாத், இஸ்ரேலை ஆதரிக்கும் ""போர் குற்றவா...",,,,,,,
2,2.0,Amnesty International has demanded an investig...,சர்வதேச பொது மன்னிப்பு சபை ஒரு புலன் விசாரணை ...,,,,,,,
3,3.0,"""Effectively, this whole package called the Pa...",''பாலஸ்தீன அரசு என்று கூறப்படும் ஒட்டுமொத்த ந...,,,,,,,
4,4.0,"However, they [the politicians] might come to ...","ஆயினும், [இந்த அரசியல்வாதிகள்] தேர்தல் நெருங்க...",,,,,,,
...,...,...,...,...,...,...,...,...,...,...
1220470,,,,,,Sri lankan government hospitals,இலங்கையில் உள்ள அரச மருத்துவமனைகள்,,,
1220471,,,,,,OUR VISSION,எங்கள் தூரநோக்கு,,,
1220472,,,,,,A healthier nation that contributes to its eco...,"ஒரு தேசத்தின் பொருளாதாரம், சமூகம், மனஞ்சார்ந்த...",,,
1220473,,,,,,OUR MISSION,குறிக்கோள்,,,


In [15]:
# Reduce the combined frame to shuffled, complete 'english'/'tamil' pairs.
# NOTE: `data` is rebound here, so this cell cannot be run twice in a row:
# after the first run the 'en'/'ta' columns that preprocess_data selects no
# longer exist. Re-run the load cell first.
data = preprocess_data(data)

In [16]:
# Display the preprocessed frame as this cell's output.
data

Unnamed: 0,english,tamil
0,e - Gov Initiative in Uttar Pradesh\n,e - உத்தரபிரதேசத்தில் அரசு முயற்சி\n
1,The court is under an obligation to undo a wro...,நீதிமன்றத்தின் செயலால் ஒரு தரப்பினருக்கு செய்ய...
2,You should have got the expressway 8-9 years a...,8 - 9 ஆண்டுகளுக்கு முன்பே உங்களுக்கு விரைவுச் ...
3,There are no shortcuts.\n,அதற்கு குறுக்குவழிகள் இல்லை.\n
4,Mr Chatterjee complains that there is no indic...,திரு சாட்டர்ஜி புதிய நடைமுறை மறுபரிசீலனை செய்ய...
...,...,...
1124007,Sixteen families had been made homeless and a ...,"பதினாறு குடும்பங்கள் வீடுகளை இழந்துவிட்டனர், ம..."
1124008,[Provided that if such compensation or any par...,[அத்தகைய இழப்பீடு அல்லது அதன் எந்தப் பகுதியும்...
1124009,"After arrest, the accused persons were brought...","கைது செய்யப்பட்ட பின்னர், குற்றம் சாட்டப்பட்ட ..."
1124010,"Further, School Study Certificates should be c...","மேலும், பள்ளி ஆய்வு சான்றிதழ்கள் தொகுதி கல்வி ..."


In [17]:
# Persist the preprocessed pairs; the row index is not written, the header is.
data.to_csv('Eng_tam_data.csv', index=False, header=True)

In [18]:
# Configure the root logger at INFO level (basicConfig does nothing if the
# root logger already has handlers) and create this notebook's own logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [19]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [20]:
# Reload the saved sentence pairs from disk.
# dtype=str with keep_default_na=False stops pandas from turning genuine
# sentence text such as "NA", "N/A" or "null" into NaN, which the cleaning
# step would then blank out and drop. Truly empty cells come back as "" and
# are still removed by the length filter further down.
df = pd.read_csv('Eng_tam_data.csv', dtype=str, keep_default_na=False)
print(f"Dataset loaded with {len(df)} rows")

Dataset loaded with 1124012 rows


In [21]:
def clean_text(text):
    """Normalise one cell value to a single-spaced string.

    Missing values (anything ``pd.isna`` reports as missing) become ``""``.
    Everything else is converted with ``str`` and has leading/trailing
    whitespace removed and every internal whitespace run (spaces, tabs,
    newlines) collapsed to one space.
    """
    return "" if pd.isna(text) else " ".join(str(text).split())

In [22]:
# Normalise whitespace (and blank out missing values) in both text columns.
for text_column in ('english', 'tamil'):
    df[text_column] = df[text_column].apply(clean_text)

In [23]:
# Keep only the pairs where both sides still contain text after cleaning.
english_present = df['english'].str.len() > 0
tamil_present = df['tamil'].str.len() > 0
df = df[english_present & tamil_present]
print(f"Dataset after cleaning: {len(df)} rows")

Dataset after cleaning: 1124007 rows


In [24]:
# 80/10/10 split: hold out 20% of the rows, then cut that hold-out in half
# for validation and test. The fixed random_state makes both cuts repeatable
# for a given `df`.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [25]:
# Report how many rows ended up in each split.
print(f"Train set: {len(train_df)} rows")
print(f"Validation set: {len(val_df)} rows")
print(f"Test set: {len(test_df)} rows")

Train set: 899205 rows
Validation set: 112401 rows
Test set: 112401 rows


In [26]:
# Wrap the three pandas splits as Hugging Face datasets.
# `Dataset` is the `datasets` class here: its import comes after, and
# therefore shadows, the torch.utils.data import of the same name.
# preserve_index=False: the splits carry the non-contiguous row labels left by
# filtering and train_test_split, and from_pandas would otherwise store that
# index as an extra '__index_level_0__' column beside 'english' and 'tamil'.
dataset_dict = DatasetDict({
    'train': Dataset.from_pandas(train_df, preserve_index=False),
    'validation': Dataset.from_pandas(val_df, preserve_index=False),
    'test': Dataset.from_pandas(test_df, preserve_index=False)
})

In [33]:

# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of embedding it in the notebook. `token` is None when the variable
# is unset, which means anonymous access.
# SECURITY: a token that was ever saved in a notebook must be treated as
# leaked and revoked in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN")

In [None]:
# Checkpoint to load from the Hugging Face Hub.
model_name = "suriya7/English-to-Tamil"
# `token=` is the current keyword for Hub authentication; it replaces the
# deprecated `use_auth_token=` argument of from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=token)
# Last expression of the cell: moves the weights to `device` and displays the
# module tree as the cell output.
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
       