In [5]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [6]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import AutoTokenizer, MBartForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.utils.data import Dataset
import torch

In [7]:
# Load the dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")

df = pd.DataFrame({key: dataset['train'][key] for key in dataset['train'].features})

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

In [8]:
# Filter the dataset
def filter_data(df, src_col, tgt_col, min_len=3, max_len=128):
    return df[(df[src_col].str.len() > min_len) & (df[src_col].str.len() < max_len) &
              (df[tgt_col].str.len() > min_len) & (df[tgt_col].str.len() < max_len)]

train_df = filter_data(train_df, "bn", "rm")
val_df = filter_data(val_df, "bn", "rm")

In [9]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")
tokenizer.src_lang = "bn"
tokenizer.tgt_lang = "rm"

# Tokenize function
def tokenize_data(dataframe, tokenizer, src_col, tgt_col, max_length=128):
    src_texts = dataframe[src_col].tolist()
    tgt_texts = dataframe[tgt_col].tolist()
    tokenized_src = tokenizer(src_texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    tokenized_tgt = tokenizer(tgt_texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    return tokenized_src, tokenized_tgt

# Tokenize datasets
train_src, train_tgt = tokenize_data(train_df, tokenizer, "bn", "rm")
val_src, val_tgt = tokenize_data(val_df, tokenizer, "bn", "rm")

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [10]:
# Dataset class
class BanglishDataset(Dataset):
    def __init__(self, src_encodings, tgt_encodings):
        self.src_encodings = {key: torch.tensor(val) for key, val in src_encodings.items()}
        self.tgt_encodings = {key: torch.tensor(val) for key, val in tgt_encodings.items()}

    def __len__(self):
        return len(self.src_encodings["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.src_encodings["input_ids"][idx],
            "attention_mask": self.src_encodings["attention_mask"][idx],
            "labels": self.tgt_encodings["input_ids"][idx],
        }

train_dataset = BanglishDataset(train_src, train_tgt)
val_dataset = BanglishDataset(val_src, val_tgt)

  self.src_encodings = {key: torch.tensor(val) for key, val in src_encodings.items()}
  self.tgt_encodings = {key: torch.tensor(val) for key, val in tgt_encodings.items()}


In [15]:
# Load model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")

tokenizer.src_lang = "bn"
tokenizer.tgt_lang = "en_XX"

model.config.forced_bos_token_id = tokenizer.lang_code_to_id["en_XX"]

# Training arguments with optimizations
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    predict_with_generate=True,
    logging_steps=500,
    fp16=True,
    gradient_accumulation_steps=2,
    dataloader_num_workers=4,
    load_best_model_at_end=True,
)

# Custom data collator
def data_collator(features):
    input_ids = torch.stack([f["input_ids"] for f in features])
    attention_mask = torch.stack([f["attention_mask"] for f in features])
    labels = torch.stack([f["labels"] for f in features])
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

torch.cuda.empty_cache()

trainer.evaluate()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,0.115237
2,1.561900,0.101411
3,0.144500,0.102991
4,0.039700,0.120615


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


{'eval_loss': 0.10141053795814514,
 'eval_runtime': 19.7498,
 'eval_samples_per_second': 49.975,
 'eval_steps_per_second': 12.506,
 'epoch': 4.990825688073395}

In [24]:
# Set source and target languages
tokenizer.src_lang = "bn"
tokenizer.tgt_lang = "en_XX"

# Test on a sample
sample = ["আমার সোনার বাংলা", "তুমি কোথায় থাকো"]

# Tokenize sample
tokenized_sample = tokenizer(
    sample,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

print("Tokenized Input:", tokenized_sample)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
tokenized_sample = {key: val.to(device) for key, val in tokenized_sample.items()}

# Generate translations
generated_tokens = model.generate(
    **tokenized_sample,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],  # Set the correct target language
    max_length=50,
)

decoded_output = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_tokens]

# Print results
print("Bangla Input:", sample)
print("English Output:", decoded_output)

Tokenized Input: {'input_ids': tensor([[ 61320,  29388, 210749,    999,  38732,      2],
        [ 61320, 122493, 230774, 116578,   9445,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1]])}
Bangla Input: ['আমার সোনার বাংলা', 'তুমি কোথায় থাকো']
English Output: ['bn Amar gold Bangla', 'bn tuntu kothay tosho']
