In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[

In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import pandas as pd
from datasets import Dataset
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments,EvalPrediction,pipeline
import torch

In [3]:
data=pd.read_csv("en-ur.csv")

# DATA PREPROCESSING

In [4]:
# droping null values
data=data.dropna()

In [6]:
import re

def preprocess_sentence(s):
    # Convert the sentence to lowercase to ensure uniform text processing.
    s = s.lower()

    # Insert spaces around punctuation marks to facilitate tokenization.
    # This ensures that punctuation is treated as separate tokens.
    s = re.sub(r"([?.!,¿])", r" \1 ", s)

    # Replace consecutive whitespace characters with a single space to normalize spacing.
    # This step consolidates multiple spaces into one.
    s = re.sub(r'[" "]+', " ", s)

    # Remove leading and trailing whitespace from the sentence for clean output.
    s = s.strip()

    return s


In [7]:
# apply pre processing
data['English_clean']=data['English'].apply(preprocess_sentence)
data['Urdu_clean']=data['Urdu'].apply(preprocess_sentence)

In [7]:
def tag_target_sentences(sentences):
    # Use the `map` function to apply a lambda function to each sentence in the input list.
    # The lambda function adds '<sos>' (start of sentence) at the beginning and '<eos>' (end of sentence) at the end.
    # The sentences are joined into a single string with spaces separating the tokens.
    tagged_sentences = map(lambda s: ' '.join(['<sos>', s, '<eos>']), sentences)

    # Convert the map object to a list and return it.
    # The result is a list of sentences with the added start and end tokens.
    return list(tagged_sentences)


In [8]:
# Extract a list of English sentences from the 'English_clean' column of the data.
input_ = list(data['English_clean'])

# Tag each Urdu sentence in the 'Urdu_clean' column of the data with start and end tokens.
# Convert the tagged Urdu sentences into a list.
target = tag_target_sentences(list(data['Urdu_clean']))


# DATA TOKENIZATION

In [9]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Define the pre-trained model and tokenizer
model_name = "google/mt5-small"
# Load the pre-trained MT5 model for conditional generation tasks (e.g., translation).
model = MT5ForConditionalGeneration.from_pretrained(model_name)
# Load the pre-trained tokenizer associated with the MT5 model to convert text into tokens.
tokenizer = MT5Tokenizer.from_pretrained(model_name)

# Add a padding token to the tokenizer
# Padding tokens are used to ensure that all sequences in a batch have the same length.
# This is important for batch processing and maintaining consistent input sizes.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Reload the model with updated tokenizer settings
# Since the tokenizer has been updated to include a padding token,
# the model's token embeddings need to be resized to accommodate the new token.
model = MT5ForConditionalGeneration.from_pretrained(model_name)
# Adjust the model's token embeddings to include the newly added padding token.
model.resize_token_embeddings(len(tokenizer))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Embedding(250101, 512)

In [10]:
# Define your input and target text lists
# These lists contain the English and Urdu sentences that will be used for training or evaluation.
input_texts = list(data['English_clean'])
target_texts = list(data['Urdu_clean'])

# Define a function to tokenize text
def tokenize_data(inputs, targets, tokenizer, max_length=128):

    # Tokenize the input texts
    # Convert the input texts into token IDs with padding and truncation as needed.
    # `return_tensors='pt'` ensures the output is in the PyTorch tensor format.
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')

    # Tokenize the target texts
    # Use the tokenizer's target tokenizer to handle target texts, which may have different tokenization requirements.
    # This ensures that the labels are also padded and truncated appropriately.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')

    # Add the tokenized labels to the model inputs
    # `labels['input_ids']` contains the token IDs for the target texts.
    model_inputs['labels'] = labels['input_ids']

    return model_inputs

# Tokenize the data
# Apply the `tokenize_data` function to the input and target texts using the tokenizer.
# This prepares the data for model training or evaluation.
tokenized_data = tokenize_data(input_texts, target_texts, tokenizer)

# Print sample tokenized data
# Display the token IDs for a sample input and target sequence to verify the tokenization process.
print("Sample input tokens:", tokenized_data['input_ids'][0])
print("Sample target tokens:", tokenized_data['labels'][0])




Sample input tokens: tensor([   339,    466,    348,    772,  73380, 151271,      1, 250100, 250100,
        250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100,
        250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100,
        250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100,
        250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100,
        250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100,
        250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100,
        250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100,
        250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100,
        250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100,
        250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100,
        250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100, 250100,
        250100, 250

In [11]:
from datasets import Dataset
import pandas as pd
# Create a DataFrame from the input and target texts, which includes columns for input and target text.
# This DataFrame is then converted into a Hugging Face Dataset object.
dataset = Dataset.from_pandas(pd.DataFrame({
    'input_text': input_texts,   # Column for input texts (e.g., English sentences)
    'target_text': target_texts  # Column for target texts (e.g., Urdu sentences)
}))

In [12]:
def preprocess_data(examples):
    # Extract input texts and target texts from the batch of examples
    inputs = examples['input_text']
    targets = examples['target_text']

    # Tokenize the input texts
    # Convert the input texts into token IDs with specified maximum length, padding, and truncation.
    # `padding='max_length'` ensures that all tokenized inputs have the same length.
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')

    # Tokenize the target texts
    # Use the tokenizer's target tokenizer to handle target texts, applying the same length, padding, and truncation.
    # This ensures consistency between inputs and targets.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')

    # Add the tokenized labels to the model inputs dictionary
    # `labels['input_ids']` contains the token IDs for the target texts.
    model_inputs['labels'] = labels['input_ids']

    return model_inputs

# Apply the preprocessing function to the entire dataset
# The `map` function is used to apply `preprocess_data` to all examples in the dataset in batches.
# `batched=True` ensures that the function processes multiple examples at once for efficiency.
dataset = dataset.map(preprocess_data, batched=True)

# Set the format of the dataset to PyTorch tensors
# `type='torch'` specifies that the dataset should be in PyTorch format, making it compatible with PyTorch models.
# The specified columns ('input_ids', 'attention_mask', 'labels') will be included in the dataset.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Split the dataset into training and validation sets
# `train_test_split` divides the dataset into training and test (validation) subsets.
# `test_size=0.1` specifies that 10% of the data should be used for validation.
train_test_split = dataset.train_test_split(test_size=0.1)

# Extract the training and validation subsets
# `train_data` contains the training set, while `val_data` contains the validation set.
train_data = train_test_split['train']
val_data = train_test_split['test']


Map:   0%|          | 0/24524 [00:00<?, ? examples/s]

In [13]:
train_data

Dataset({
    features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 22071
})

In [15]:
from transformers import TrainingArguments, Trainer

# Define training arguments (adjust these as needed)
training_args = TrainingArguments(
    output_dir="./outputs",  # Directory where model checkpoints and other outputs will be saved.
    num_train_epochs=5,  # Number of epochs for training. Adjust based on how long you want to train the model.
    per_device_train_batch_size=8,  # Batch size for each device (GPU/CPU). Modify based on available GPU memory and dataset size.
    save_steps=1000,  # Number of steps between saving model checkpoints. Helps in resuming training if interrupted.
    save_total_limit=2,  # Maximum number of checkpoints to keep. Older checkpoints will be deleted to manage storage.
)

# Create a Trainer instance
trainer = Trainer(
    model=model,  # The model to be trained.
    args=training_args,  # Training arguments specified above.
    train_dataset=train_data  # The dataset used for training the model.
)

# Start training the model
# This method initiates the training process based on the provided arguments and dataset.
trainer.train()

# Save the fine-tuned model (optional)
# After training, the model is saved to the specified directory. This allows for later use or deployment.
model.save_pretrained("./en_ur_translator")


Step,Training Loss
500,16.1529
1000,0.9463
1500,0.6629
2000,0.3946
2500,0.3182
3000,0.2836
3500,0.2609
4000,0.2421
4500,0.232
5000,0.2233


In [16]:
tokenizer.save_pretrained("./en_ur_translator")


('./en_ur_translator/tokenizer_config.json',
 './en_ur_translator/special_tokens_map.json',
 './en_ur_translator/spiece.model',
 './en_ur_translator/added_tokens.json')

In [17]:
from transformers import pipeline

translation_pipeline = pipeline('translation', model='./en_ur_translator', tokenizer='./en_ur_translator')

translated_text = translation_pipeline("how are you?")
print(translated_text)


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'translation_text': 'آپ کیوں ہیں؟'}]


In [18]:
# Import the necessary libraries
from google.colab import files
import shutil

# Create a zip archive of the folder
shutil.make_archive("en_ur_translator", 'zip', "en_ur_translator")




'/content/en_ur_translator.zip'

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
import shutil

# Define the source and destination paths
source_zip_file = "./en_ur_translator.zip"
destination_folder = "/content/drive/My Drive/en_ur_translator.zip"

# Move the zip file to Google Drive
shutil.copy(source_zip_file, destination_folder)


'/content/drive/My Drive/en_ur_translator.zip'