In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

Load Dataset

In [2]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Load the dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Convert to list format while preserving all columns
data_dict = {
    'bengali': dataset['train']['bn'],
    'romanized': dataset['train']['rm']
}

# Create indices for splitting
indices = list(range(len(dataset['train'])))

# Split the indices
train_indices, val_indices = train_test_split(
    indices,
    test_size=0.1,
    random_state=42
)

# Create the split datasets using the indices
from datasets import Dataset

train_dataset = Dataset.from_dict({
    'bengali': [data_dict['bengali'][i] for i in train_indices],
    'romanized': [data_dict['romanized'][i] for i in train_indices]
})

val_dataset = Dataset.from_dict({
    'bengali': [data_dict['bengali'][i] for i in val_indices],
    'romanized': [data_dict['romanized'][i] for i in val_indices]
})

# Print dataset statistics
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

# Print a few examples to verify the data
print("\nFirst few training examples:")
for i in range(3):
    print(f"Bengali: {train_dataset[i]['bengali']}")
    print(f"Romanized: {train_dataset[i]['romanized']}")
    print()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

Training samples: 4505
Validation samples: 501

First few training examples:
Bengali: ২ মিনিট এর একাউন্ট ব্লক করে দিছে…..পোস্ট ডিলিট করে দিন
Romanized: 2 minute ar account block kore dice…..post delete kore din

Bengali: ভয় কে জয় করুন 
Romanized: Voy ke joy korun

Bengali: আপনার ফোনের নেট স্পিড অ্যাপ এর নাম টা কি আর অ্যাপ্স টা কি লিংক দিতে পারবেন কি 
Romanized: apnar phoner net speed app er nam ta ki ar apps ta ki link dite parben ki



Preprocess and train

In [13]:
from huggingface_hub import create_repo
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch
from datasets import Dataset
import re
import os

class BanglishBanglaDataProcessor:
    def __init__(self, max_length=64):  # Reduced max_length for smaller token size
        model_name = "t5-small"  # Using an even smaller model than flan-t5-small
        self.tokenizer = T5Tokenizer.from_pretrained(model_name, cache_dir=os.getenv("TRANSFORMERS_CACHE", "./cache"))
        self.max_length = max_length

        # Add special tokens for Banglish and Bangla
        special_tokens = {"additional_special_tokens": ["<banglish>", "<bangla>"]}
        self.tokenizer.add_special_tokens(special_tokens)

    def clean_text(self, text):
        # Basic text cleaning
        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
        text = text.strip()
        return text

    def filter_examples(self, example):
        # Filter out examples that are too short or too long
        return (len(example['bengali']) >= 2 and
                len(example['romanized']) >= 2 and
                len(example['bengali']) <= self.max_length and
                len(example['romanized']) <= self.max_length)

    def preprocess_function(self, examples):
        # Prepare input and target texts
        banglish_texts = [self.clean_text(text) for text in examples['romanized']]
        bangla_texts = [self.clean_text(text) for text in examples['bengali']]

        # Tokenize inputs
        model_inputs = self.tokenizer(
            banglish_texts,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # Tokenize targets
        labels = self.tokenizer(
            bangla_texts,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

def setup_training_pipeline(train_dataset, val_dataset, processor):
    # Model initialization with smaller model size
    model = T5ForConditionalGeneration.from_pretrained("t5-small", cache_dir=os.getenv("TRANSFORMERS_CACHE", "./cache"))
    model.resize_token_embeddings(len(processor.tokenizer))

    # Enable gradient checkpointing to save memory
    model.gradient_checkpointing_enable()

    # Process datasets
    train_dataset = train_dataset.map(
        processor.preprocess_function,
        batched=True,
        remove_columns=train_dataset.column_names
    )
    val_dataset = val_dataset.map(
        processor.preprocess_function,
        batched=True,
        remove_columns=val_dataset.column_names
    )

    # Training arguments with smaller batch size and gradient accumulation
    training_args = Seq2SeqTrainingArguments(
        output_dir="./banglish-bangla-translator",
        evaluation_strategy="steps",
        eval_steps=100,
        learning_rate=3e-5,
        per_device_train_batch_size=4,  # Smaller batch size to avoid OOM errors
        per_device_eval_batch_size=4,   # Smaller batch size for evaluation
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=3,
        predict_with_generate=True,
        fp16=True,  # Enable mixed precision
        logging_steps=100,
        save_steps=100,
        gradient_accumulation_steps=4,  # Gradient accumulation to simulate larger batch size
        no_cuda=False,  # Set this to True if you want to force CPU training
    )

    # Data collator
    data_collator = DataCollatorForSeq2Seq(
        processor.tokenizer,
        model=model,
        padding=True
    )

    # Initialize trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=processor.tokenizer,
        data_collator=data_collator,
    )

    return trainer

# Usage example

    # Initialize processor
processor = BanglishBanglaDataProcessor()

    # Assuming train_dataset and val_dataset are already loaded
    # Filter datasets
train_dataset = train_dataset.filter(processor.filter_examples)
val_dataset = val_dataset.filter(processor.filter_examples)

    # Setup and start training
trainer = setup_training_pipeline(train_dataset, val_dataset, processor)
trainer.train()
model_save_dir = "./banglish-bangla-translator"
trainer.model.save_pretrained(model_save_dir)
processor.tokenizer.save_pretrained(model_save_dir)
print(f"Model and tokenizer saved to {model_save_dir}")

    # Push to Hugging Face Hub
repo_name = "banglish-bangla-translator"



Filter:   0%|          | 0/4028 [00:00<?, ? examples/s]

Filter:   0%|          | 0/452 [00:00<?, ? examples/s]

Map:   0%|          | 0/4028 [00:00<?, ? examples/s]

Map:   0%|          | 0/452 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
100,16.224,0.339762
200,1.467,0.097699
300,0.4565,0.058427
400,0.2942,0.051716
500,0.275,0.043942
600,0.2422,0.046107
700,0.2364,0.042652


Model and tokenizer saved to ./banglish-bangla-translator


HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6766e37f-26eaa6de28e4d5c736b4557a;62beb1de-d7df-45b9-843c-399d1254297d)

Invalid username or password.

In [16]:
from huggingface_hub import create_repo
repo_name = "banglish-bangla-translator"

    # Create a new repo on Hugging Face (if not already created)
create_repo(repo_name, exist_ok=True)

    # Push the model and tokenizer to Hugging Face
trainer.model.push_to_hub(repo_name)
processor.tokenizer.push_to_hub(repo_name)

print(f"Model and tokenizer uploaded to Hugging Face Hub at {repo_name}.")

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6766e4cd-46d445e901c1206f1471b106;fbfd6483-b39e-4a4e-9fe0-a501933b422c)

Invalid username or password.

In [None]:
When selecting hyperparameters such as learning rate, batch size, and the number of epochs for training a model, several factors influence the choice to ensure a good balance between training efficiency, model performance, and resource constraints. Here's the justification for each of the choices you've made for the Banglish-Bangla translation model:

1. Learning Rate (3e-5)
Justification:

Lower learning rates are typically preferred for fine-tuning pre-trained models like T5 because it helps prevent drastic updates to the pre-trained weights, which could destabilize the learning process. A learning rate of 3e-5 is commonly used when fine-tuning transformer models and has been shown to work well for tasks like translation and text generation.
Stability and convergence: Fine-tuning with a learning rate that is too high could lead to overshooting the optimal solution, while a very low learning rate might make training unnecessarily slow. A learning rate of 3e-5 strikes a good balance by providing stable convergence without being too slow.
2. Batch Size (4)
Justification:

Memory constraints: Batch size is typically constrained by the GPU memory. Since you're using a smaller model (t5-small), it allows you to use a smaller batch size while still fitting within the available GPU memory. A batch size of 4 is a practical choice for preventing out-of-memory (OOM) errors, especially when training on high-dimensional data or when using gradient accumulation.
Generalization: Smaller batch sizes are often known to help with better generalization. Larger batch sizes can sometimes lead to overfitting because the model can see a larger portion of the data in each iteration, which may reduce the model's ability to generalize well.
Training stability: A smaller batch size also makes it easier to manage noisy gradients, helping to stabilize training. For models like T5, batch sizes in the range of 4 to 8 are quite common, depending on available resources.
3. Gradient Accumulation (4)
Justification:

Simulate larger batch size: To simulate a larger batch size without running into memory issues, gradient accumulation is used. Here, gradients are accumulated over four mini-batches before performing a backward pass. This gives the effect of a larger batch size, but without needing to store all the data in memory at once.
Resource efficiency: This method ensures that the model can still benefit from the advantages of large batch training, such as improved gradient estimates, while adhering to the memory limitations imposed by smaller batch sizes. It's particularly useful for large-scale models like T5.
4. Number of Epochs (3)
Justification:

Overfitting prevention: Training for 3 epochs strikes a balance between underfitting and overfitting. Pre-trained models like t5-small generally require fewer epochs because they already have a substantial understanding of the language due to pre-training on large corpora. Fine-tuning them for 3 epochs typically results in good generalization without overfitting.
Sufficient for convergence: The pre-trained model already has strong knowledge of grammar and sentence structures, so fine-tuning it for just a few epochs is usually sufficient for adapting it to the specific translation task (in this case, Banglish-Bangla). More epochs might lead to overfitting, especially with a relatively small dataset.
5. Evaluation Strategy (steps) and eval_steps = 500
Justification:

Evaluation frequency: Evaluating every 500 steps allows you to monitor the model's progress and performance during training without too much overhead. This helps in detecting potential issues early in the training process.
Early stopping: Frequent evaluations ensure that you can implement early stopping if the model performance stops improving, preventing unnecessary training and overfitting.
6. Mixed Precision Training (fp16=True)
Justification:

Memory efficiency: Using mixed precision training (fp16) reduces the memory footprint and accelerates training by using half-precision floating-point numbers where possible. This allows you to train larger models or use larger batch sizes on GPUs without running out of memory.
Speedup: Mixed precision also enables faster computations on supported hardware, such as NVIDIA GPUs with Tensor Cores, which accelerates training without compromising model quality.
7. Weight Decay (0.01)
Justification:

Preventing overfitting: Weight decay regularizes the model by penalizing large weights, helping prevent overfitting, especially in deep models like T5. A weight decay of 0.01 is commonly used for fine-tuning transformer-based models, offering a good tradeoff between regularization and model flexibility.
8. Evaluation Strategy (steps) and save_steps = 500
Justification:

Saving the model frequently: Saving the model every 500 steps ensures that if training is interrupted, you don't lose too much progress. It also allows you to evaluate the model's performance on validation data at regular intervals, which can be helpful for monitoring improvements or signs of overfitting.