Install Dependencies

In [47]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install sacrebleu
!pip install sacremoses




Task 1. Load the Dataset

In [48]:
from datasets import load_dataset

# Load the dataset from Hugging Face Hub
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Split the dataset into training and validation subsets
# Using an 80/20 split
split_ratio = 0.8
train_test_split = dataset["train"].train_test_split(test_size=1 - split_ratio, seed=42)

# Rename splits for clarity
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

# Print dataset info
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(dataset)


Train dataset size: 4004
Validation dataset size: 1002
DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})


Task 2. Data Preprocessing

In [49]:
from transformers import AutoTokenizer

# Load a pre-trained tokenizer (T5 in this example)
tokenizer_name = "t5-small"  # Replace with a model suitable for Bengali tasks if available
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Preprocessing function
def preprocess_data(example):
    # Tokenize Banglish input
    input_tokens = tokenizer(
        example["rm"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    # Tokenize Bangla target
    target_tokens = tokenizer(
        example["bn"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    return {
        "input_ids": input_tokens["input_ids"],
        "attention_mask": input_tokens["attention_mask"],
        "labels": target_tokens["input_ids"]
    }

# Apply preprocessing to the training and validation sets
train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)

# Filter overly short or long sentences
def filter_data(example):
    return 2 <= len(example["rm"]) <= 128 and 2 <= len(example["bn"]) <= 128

train_dataset = train_dataset.filter(filter_data)
val_dataset = val_dataset.filter(filter_data)

print(f"Training examples after filtering: {len(train_dataset)}")
print(f"Validation examples after filtering: {len(val_dataset)}")


Training examples after filtering: 3951
Validation examples after filtering: 985


Task 3. Select a Model

In [50]:
from transformers import AutoModelForSeq2SeqLM

# Load the pre-trained model
model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


Task 4. Train the Model

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
import numpy as np
import os

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

# Load the dataset from Hugging Face Hub
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Split the dataset into training and validation subsets
split_ratio = 0.8
train_test_split = dataset["train"].train_test_split(test_size=1 - split_ratio, seed=42)

# Rename splits for clarity
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

# Load a pre-trained tokenizer (T5 in this example)
tokenizer_name = "t5-small"  # Replace with a model suitable for Bengali tasks if available
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Preprocessing function
def preprocess_data(example):
    # Tokenize Banglish input
    input_tokens = tokenizer(
        example["rm"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    # Tokenize Bangla target
    target_tokens = tokenizer(
        example["bn"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    return {
        "input_ids": input_tokens["input_ids"],
        "attention_mask": input_tokens["attention_mask"],
        "labels": target_tokens["input_ids"]
    }

# Apply preprocessing to the training and validation sets
train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)

# Filter overly short or long sentences
def filter_data(example):
    return 5 <= len(example["rm"]) <= 128 and 5 <= len(example["bn"]) <= 128

train_dataset = train_dataset.filter(filter_data)
val_dataset = val_dataset.filter(filter_data)

# Load the pre-trained model
model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load the sacrebleu metric using the evaluate library
metric = evaluate.load("sacrebleu")

# Define the compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Create copies of predictions and labels to avoid modifying the original arrays
    predictions = predictions.copy()
    labels = labels.copy()

    # Debugging: Check the shape and contents of predictions and labels
    print(f"Predictions shape: {predictions.shape}, Labels shape: {labels.shape}")
    print(f"Sample predictions (raw): {predictions[:5]}")
    print(f"Sample labels (raw): {labels[:5]}")

    # Replace invalid token IDs (e.g., values larger than the vocab size)
    # Ensure predictions and labels are within the valid vocabulary range
    predictions = np.clip(predictions, 0, tokenizer.vocab_size - 1)  # Clip to valid range
    labels = np.clip(labels, 0, tokenizer.vocab_size - 1)  # Clip to valid range

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # SacreBLEU expects a list of list of references for each prediction
    decoded_labels = [[label] for label in decoded_labels]

    # Compute the sacrebleu score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}


# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,  # Ensure predictions are generated during evaluation
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    eval_steps=500,
    fp16=True,  # Enable mixed precision if supported by your GPU
)

# Use DataCollatorForSeq2Seq to handle padding correctly
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Initialize the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # From Task 1 and 2
    eval_dataset=val_dataset,  # From Task 1 and 2
    processing_class=tokenizer,  # From Task 2
    data_collator=data_collator,  # Properly handle padding
    compute_metrics=compute_metrics,  # BLEU metric computation
)

# Train the model
trainer.train()

# Save the model and tokenizer
trainer.save_model("./results")
tokenizer.save_pretrained("./results")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Bleu
1,0.0283,0.027054,9.325731
2,0.0287,0.018985,26.759538
3,0.0232,0.016236,50.799913


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Predictions shape: (974, 21), Labels shape: (974, 128)
Sample predictions (raw): [[   0    3    2    3    2    3    2    3    2    3    2    3    2    5
     5    1    0    0    0    0 -100]
 [   0    3    2    3    2    3    2    3    2    1    0    0    0    0
     0    0    0    0    0    0 -100]
 [   0    3    2    3    2    3    2    3    2    3    2    3    2   58
     1    0    0    0    0    0 -100]
 [   0    3    2    3    2    3    2    3    2    1    0    0    0    0
     0    0    0    0    0    0 -100]
 [   0    3    2    3    2    3    2    3    2    3    2    3    2    1
     0    0    0    0    0    0 -100]]
Sample labels (raw): [[ 3  2  3  2  3  2  3  2  3  2  5  5  1  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Predictions shape: (974, 21), Labels shape: (974, 128)
Sample predictions (raw): [[ 0  3  2  3  2  3  2  3  2  3  2  3  2  5  5  1  0  0  0  0  0]
 [ 0  3  2  3  2  3  2  3  2  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3  2  3  2  3  2  3  2 58  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  3  2  3  2  3  2  3  2  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3  2  3  2  3  2  3  2  3  2  3  2  1  0  0  0  0  0  0  0]]
Sample labels (raw): [[ 3  2  3  2  3  2  3  2  3  2  5  5  1  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [ 3  2  3  2  3  2  3  2  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Predictions shape: (974, 21), Labels shape: (974, 128)
Sample predictions (raw): [[   0    3    2    3    2    3    2    3    2    3    2    5    5    1
     0    0    0    0    0    0 -100]
 [   0    3    2    3    2    3    2    3    2    1    0    0    0    0
     0    0    0    0    0    0 -100]
 [   0    3    2    3    2    3    2    3    2   58    1    0    0    0
     0    0    0    0    0    0 -100]
 [   0    3    2    3    2    3    2    3    2    1    0    0    0    0
     0    0    0    0    0    0 -100]
 [   0    3    2    3    2    3    2    3    2    3    2    3    2    1
     0    0    0    0    0    0 -100]]
Sample labels (raw): [[ 3  2  3  2  3  2  3  2  3  2  5  5  1  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  