In [None]:
!pip install scikit-learn

In [6]:
import pandas as pd


data = pd.read_csv('/content/drive/MyDrive/EnTa_data.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)
data.head()

Unnamed: 0,en,ta
0,rajini is the one who threatened rajini in aru...,அருணாச்சலம் படத்துல ரஜினியையே மிரட்டுனவரு இவரு...
1,asatya sri sisters sang the song va vathi,வா வாத்தி பாடலை பாடி அசத்திய ஸ்ரீ சகோதரிகள்
2,ban on sale of meat within a km radius around ...,பெங்களூரு விமானநிலையத்தை சுற்றி கி மீ சுற்றளவி...
3,my name is abba paru than me plus actor sundar...,அப்பா பேரு தான் எனக்கு பிளஸ் நடிகர் சுந்தர் நே...
4,nanjil sampat remembered,நாஞ்சில் சம்பத்துக்கு நினைவு திரும்பியது


In [7]:
data.shape

(177075, 2)

In [11]:
from sklearn.model_selection import train_test_split

In [15]:
Train_data = data[0:150000]
Test_data = data[150001:177075]

In [16]:
Train_data.shape,  Test_data.shape

((150000, 2), (27074, 2))

Prepare Train Data

In [17]:
row_indices = range(0, 150000)  # Index range from 0 to 99
example_list = [{'en': row['en'], 'ta': row['ta']} for index, row in Train_data.loc[row_indices].iterrows()]
example_list


from datasets import Dataset

# data is a list of dictionaries
formatted_data = {'en': [example['en'] for example in example_list],
                  'ta': [example['ta'] for example in example_list]}

# Create a dataset from the formatted data
dataset = Dataset.from_dict(formatted_data)

Prepare Test Data

In [18]:
row_indices = range(150001, 177075)  # Index range from 0 to 99
example_list1 = [{'en': row['en'], 'ta': row['ta']} for index, row in Test_data.loc[row_indices].iterrows()]
example_list1


# data is a list of dictionaries
formatted_data1 = {'en': [example['en'] for example in example_list1],
                  'ta': [example['ta'] for example in example_list1]}

# Create a dataset from the formatted data
dataset1 = Dataset.from_dict(formatted_data1)

Load Models and Build

In [19]:
import os
import sys
import transformers
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

In [None]:
model_checkpoint = "Helsinki-NLP/opus-mt-mul-en"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [21]:
max_input_length = 128
max_target_length = 128

source_lang = "ta"
target_lang = "en"


def preprocess_function(example):
    inputs = example[source_lang]
    targets = example[target_lang]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


Tokenize the Data

In [22]:
# Apply preprocess_function using map
Train_tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]



In [23]:
# Apply preprocess_function using map
Test_tokenized_datasets = dataset1.map(preprocess_function, batched=True)

Map:   0%|          | 0/27074 [00:00<?, ? examples/s]



Load The Model

In [24]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model

Downloading tf_model.h5:   0%|          | 0.00/311M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-mul-en.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

<transformers.models.marian.modeling_tf_marian.TFMarianMTModel at 0x7b91e15397e0>

In [25]:
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=12)

In [26]:
train_dataset = model.prepare_tf_dataset(
    Train_tokenized_datasets,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

In [27]:
test_dataset = model.prepare_tf_dataset(
    Test_tokenized_datasets,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

In [28]:
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

In [30]:
model.fit(train_dataset, validation_data=test_dataset, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7b91b8516440>

In [32]:
import os
os.chdir('/content/drive/MyDrive/Langauge_Model_Tamil')

In [33]:
model.save_pretrained("ta_En_translation_tf_model/")

## Model Testing

In [34]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Langauge_Model_Tamil/ta_En_translation_tf_model")


All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at /content/drive/MyDrive/Langauge_Model_Tamil/ta_En_translation_tf_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [36]:
input_text  = "நாஞ்சில் சம்பத்துக்கு நினைவு திரும்பியது"

tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)  #(**) is used for unpacking the elements of a dictionary-like object into function arguments
print(out)


#Decode the Target tokens and we can get outputs
tokenizer.decode(out[0], skip_special_tokens=True)

tf.Tensor([[64171  5429     8   181 23992    25    11   123 47283     0 64171 64171]], shape=(1, 12), dtype=int32)


'memory of sampa in nanja'

In [37]:
input_text  = 'வா வாத்தி பாடலை பாடி அசத்திய ஸ்ரீ சகோதரிகள்'

tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
#Decode the Target tokens and we can get outputs
tokenizer.decode(out[0], skip_special_tokens=True)

'sri sisters sing vaathi song'