In [1]:
!pip install datasets transformers[sentencepiece] sacrebleu -q



In [3]:
# !pip install --upgrade transformers

In [2]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Identifier of the pretrained checkpoint; the tokenizer and the model in the
# cells below are both loaded from it.
model_checkpoint = "Helsinki-NLP/opus-mt-en-ur"

# Helsinki-NLP/opus-mt-en-ur model

Source: https://huggingface.co/Helsinki-NLP/opus-mt-en-ur

# The Dataset

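Reference (English–Hindi corpus; not loaded by the cells shown in this notebook): https://huggingface.co/datasets/cfilt/iitb-english-hindi

Source (English–Urdu corpus loaded below): https://huggingface.co/datasets/HaiderSultanArc/MT-Urdu-English/viewer/default/train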

In [4]:
# Load the English-Urdu parallel corpus by its Hugging Face dataset id.
raw_datasets = load_dataset(path="HaiderSultanArc/MT-Urdu-English")

In [5]:
# Display the loaded dataset: its splits, column names and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['en', 'ur'],
        num_rows: 5646138
    })
    test: Dataset({
        features: ['en', 'ur'],
        num_rows: 1411535
    })
})

In [6]:
# Peek at the first training record to see the 'en' / 'ur' pair layout.
raw_datasets['train'][0]

{'en': 'So, Are You In The Market?', 'ur': '"تم یہاں بازار میں ؟"'}

# Small Dataset

In [119]:
# Build a small random subset of each split for quick testing/debugging.
# `raw_datasets` is the object created by the earlier `load_dataset` cell, so
# it is reused here instead of importing and loading the corpus a second time.
subset_seed = 42        # fixed seed so the same rows are drawn on every run
n_train_samples = 1000  # adjust the number as needed
n_test_samples = 500    # adjust the number as needed

# `select` takes an iterable of row indices, so `range(n)` replaces the
# hand-built `[i for i in range(n)]` list.
small_en_ur_train_dataset = raw_datasets['train'].shuffle(seed=subset_seed).select(range(n_train_samples))
small_en_ur_test_dataset = raw_datasets['test'].shuffle(seed=subset_seed).select(range(n_test_samples))

# Group the subsets under the same split names the full dataset uses.
small_en_ur_dataset = {
    "train": small_en_ur_train_dataset,
    "test": small_en_ur_test_dataset,
}

# Display the small datasets
small_en_ur_dataset

{'train': Dataset({
     features: ['en', 'ur'],
     num_rows: 1000
 }),
 'test': Dataset({
     features: ['en', 'ur'],
     num_rows: 500
 })}

In [120]:
# Inspect the first record of the shuffled training subset.
small_en_ur_dataset['train'][0]

{'en': 'And why are the kneeling on your right and on your left.',
 'ur': '’’اور آپ ﷺ نے اپنی بائیں ہتھیلی کو بائیں ران اور گھٹنے پر رکھا۔‘‘'}

# Preprocessing the data

In [121]:
# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Sanity-check the tokenizer. Each result is printed explicitly because a
# notebook cell only auto-displays its final expression; bare calls placed
# mid-cell would have their output silently discarded.
print(tokenizer("Hi, this is a sentence!"))  # single sentence
print(tokenizer("Hi, this is a sentence!", "This is another sentence."))  # sentence pair

# Encode a batch as target-side text. The `text_target` argument replaces the
# deprecated `as_target_tokenizer()` context manager.
print(tokenizer(text_target=["Hi, this is a sentence!", "This is another sentence."]))

In [122]:
# Token-count caps applied when truncating source and target sequences.
max_input_length = max_target_length = 128

# Dataset column keys for the two sides of each translation pair.
source_lang, target_lang = "en", "ur"

def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Mapping that holds the source sentences under the
            `source_lang` key and the matching target sentences under the
            `target_lang` key.

    Returns:
        The tokenizer output for the source sentences, with the token ids of
        the target sentences stored under the "labels" key.
    """
    inputs = examples[source_lang]
    targets = examples[target_lang]

    # Encode the source side, truncated to at most max_input_length tokens.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Encode the target side. `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager; it is a separate call so the
    # targets keep their own max_target_length limit.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    # Attach the target token ids as the training labels.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [123]:
# Smoke-test the preprocessing on the first two training examples.
preprocess_function(small_en_ur_dataset["train"][:2])

{'input_ids': [[57, 1082, 56, 3, 19626, 95, 76, 418, 7, 95, 76, 918, 5, 0], [1598, 108, 52, 82, 24319, 10376, 66, 179, 5, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[19977, 346, 88, 20669, 3292, 202, 3573, 752, 1016, 18, 132, 3032, 25058, 16, 3032, 23430, 4, 16061, 30, 12674, 17537, 0], [139, 20, 18, 257, 771, 2426, 1129, 11, 710, 1444, 8, 0]]}

In [128]:
# Tokenize the training subset; batched=True hands preprocess_function
# batches of examples rather than one example at a time.
tokenized_train_datasets = small_en_ur_dataset["train"].map(
    function=preprocess_function,
    batched=True,
)

In [129]:
# Tokenize the test subset with the same preprocessing as the training data.
tokenized_test_datasets = small_en_ur_dataset["test"].map(
    function=preprocess_function,
    batched=True,
)

In [130]:
# Load the pretrained TensorFlow seq2seq model from the same checkpoint the
# tokenizer was loaded from.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-ur.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [131]:
# Training hyperparameters.
num_train_epochs = 1
batch_size = 16
learning_rate, weight_decay = 2e-5, 0.01

In [132]:
# Collator used to assemble batches for training; returns TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="tf",
)

In [133]:
# Second collator that pads sequence lengths up to a multiple of 128.
# NOTE(review): no cell visible in this notebook uses it yet.
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="tf",
    pad_to_multiple_of=128,
)

In [134]:
# Wrap the tokenized training subset as a shuffled, batched dataset that
# `model.fit` can consume.
train_dataset = model.prepare_tf_dataset(
    dataset=tokenized_train_datasets,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=True,
)


In [135]:
# Validation data: keep the original order and every example. With
# shuffle=True, prepare_tf_dataset also defaults drop_remainder to True, which
# would discard the final partial batch from evaluation.
test_dataset = model.prepare_tf_dataset(
    tokenized_test_datasets,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)


In [136]:
# Adam optimizer with weight decay, configured from the hyperparameters above.
optimizer = AdamWeightDecay(
    learning_rate=learning_rate,
    weight_decay_rate=weight_decay,
)

# No loss is passed to compile().
model.compile(optimizer=optimizer)

In [None]:
# Fine-tune the model. The epoch count comes from the `num_train_epochs`
# hyperparameter instead of a second hard-coded literal that could drift
# out of sync with it.
model.fit(x=train_dataset, validation_data=test_dataset, epochs=num_train_epochs)

 3/62 [>.............................] - ETA: 37:51 - loss: 5.0565

In [77]:
# Save the fine-tuned model together with its tokenizer so the output
# directory holds everything needed to load both again.
output_dir = "tf_model_ur/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)