# Step 1: Installing the Required Dependencies
Use the `datasets`, `transformers`, and `accelerate` libraries to load the data and fine-tune the model.

In [None]:
!pip install datasets transformers accelerate


# Step 2: Load Dataset
Utilize the Tatoeba dataset for English-to-Urdu translation.

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

# Explanation:
The Tatoeba dataset is a multilingual dataset, and we specifically choose English (en) to Urdu (ur) translations.
The dataset may provide only a `train` split; if no `test` split is present, one is created manually in Step 5.

In [None]:
# Load the Tatoeba dataset for the English ("en") / Urdu ("ur") language pair.
dataset = load_dataset("tatoeba", lang1="en", lang2="ur")

# Step 3: Load Tokenizer and Model
# Explanation:
We use the pre-trained English-to-Urdu model from Hugging Face (`Helsinki-NLP/opus-mt-en-ur`).
The tokenizer prepares text for the model by converting it into numerical token IDs.

In [None]:
# Checkpoint identifier shared by the tokenizer and the model so the two stay in sync.
model_name = "Helsinki-NLP/opus-mt-en-ur"
# Tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pre-trained sequence-to-sequence model loaded from the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Step 4: Freeze Most Layers for Efficient Training
# Explanation:
Freezing layers reduces memory usage and speeds up training.
Only parameters whose names match the listed patterns are fine-tuned, which keeps training feasible on a CPU.


In [None]:
# Name fragments selecting the parameters that stay trainable. The check below is a
# plain substring test, so every parameter whose name contains a fragment is unfrozen.
trainable_layers = ["encoder.layers", "decoder.layers", "lm_head"]

# Single pass over the model: a parameter keeps its gradient only when its name
# contains one of the fragments; every other parameter is frozen.
for name, param in model.named_parameters():
    param.requires_grad = any(fragment in name for fragment in trainable_layers)
    if param.requires_grad:
        print(f"Unfreezing layer: {name}")  # Confirms which parameters were unfrozen


# Step 5: Preprocess the Data
# Explanation:
- `translation` contains both English (en) and Urdu (ur) translations.
- Tokenization converts the text to input IDs.
- Padding ensures all sequences in a batch have the same length, while truncation trims longer sequences.
- `remove_columns` removes the original columns after preprocessing.


In [None]:
# The loaded dataset may not contain a "test" split. When it is missing, hold out
# 20% of "train" for evaluation. The fixed seed keeps the partition identical
# across runs so evaluation results stay comparable.
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)  # 80% train, 20% test

# Preprocess the Data
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of English/Urdu pairs into model inputs and labels.

    Args:
        examples: Batch passed by ``dataset.map(..., batched=True)``; its
            "translation" column is a list of dicts with "en" and "ur" keys.
        max_length: Length every source and target sequence is padded or
            truncated to.

    Returns:
        The tokenizer output for the English sentences with an added "labels"
        entry holding the Urdu token IDs, where padding positions are -100.
    """
    pairs = examples["translation"]
    inputs = [pair["en"] for pair in pairs]
    targets = [pair["ur"] for pair in pairs]

    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    # `text_target` tokenizes the targets in target-language mode; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_length, truncation=True, padding="max_length")

    # Padding must not contribute to the loss: -100 is the label value the loss
    # ignores, so padded positions are masked out instead of being "predicted".
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the dataset in batches; the raw "translation" column is dropped once
# the tokenized fields have been produced.
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["translation"],
)


# Step 6: Define Training Arguments
# Explanation:
Training arguments define how the model is trained, including learning rate, batch size, and evaluation strategy.


In [None]:
# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # Directory to save model checkpoints
    eval_strategy="epoch",  # Evaluate after each epoch (`evaluation_strategy` is the deprecated name)
    learning_rate=5e-4,  # Learning rate for optimization
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,  # Batch size for evaluation
    num_train_epochs=3,  # Number of training epochs
    save_total_limit=2,  # Limit on the number of saved checkpoints
    predict_with_generate=True,  # Enable text generation during evaluation
    fp16=False,  # Disable mixed precision (only relevant for GPUs)
    report_to="none",  # Disable logging to third-party platforms
)



# Step 7: Initialize Trainer
# Explanation:
The Trainer class handles the training loop, evaluation, and saving of the model.


In [None]:
# Wire the model, training configuration and tokenized splits into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # emits a FutureWarning on recent transformers releases.
    processing_class=tokenizer,
)

  trainer = Seq2SeqTrainer(


# Step 8: Train the Model
# Explanation:
The `train` method fine-tunes the model using the specified dataset.
After training, the model is saved in the specified output directory.


In [None]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train()


# Step 9: Save the Model
# Explanation:
Save the fine-tuned model to a directory for later use.

In [None]:
# Write the fine-tuned model to ./fine_tuned_en_ur_model so it can be reloaded later.
trainer.save_model("./fine_tuned_en_ur_model")

# Step 10: Test the Fine-Tuned Model


In [None]:
def translate(sentence):
    """Translate an English sentence to Urdu with the fine-tuned model.

    Args:
        sentence: English text to translate.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    # Training may have placed the model on an accelerator; the input tensors must
    # live on the same device or `generate` fails with a device mismatch.
    inputs = inputs.to(model.device)
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Smoke-test the translation helper on a sample English sentence.
print(translate("Hi. How may I help you Sir?"))

برائے مہربانی کیا میں آپ کی مدد کروں؟
