In [None]:
# Install necessary libraries
!pip install datasets transformers

from datasets import load_dataset

# Load the dataset from Hugging Face
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Split the dataset into training and validation subsets (90/10 split)
dataset = dataset["train"].train_test_split(test_size=0.1)

# Check dataset structure
print("Training examples:", len(dataset["train"]))
print("Validation examples:", len(dataset["test"]))


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

Training examples: 4505
Validation examples: 501


In [None]:
# Print the column names
print("Columns:", dataset["train"].column_names)

# Print the first row of data for the train dataset
print("First row:", {col: dataset["train"][col][0] for col in dataset["train"].column_names[:5]})


Columns: ['bn', 'rm']
First row: {'bn': 'আমার ভাই ৬+১২৮ জিবি। ১৪ হাজার ৮০০ টাকা দিয়ে কিনছি গত কাল', 'rm': 'amr vai 6+128 gb. 14 hajar 800 tk diye kince goto kal'}


In [None]:
# Print all column names in the train dataset
print("All columns:", dataset["train"].column_names)


All columns: ['bn', 'rm']


In [None]:
from transformers import AutoTokenizer

# Load tokenizer for the selected model (e.g., T5)
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

# Tokenization function
def preprocess_function(examples):
    inputs = examples["rm"]
    targets = examples["bn"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

# Apply tokenization
tokenized_dataset = dataset.map(preprocess_function, batched=True)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/4505 [00:00<?, ? examples/s]

Map:   0%|          | 0/501 [00:00<?, ? examples/s]

In [None]:
# Print a few samples before tokenization
print("Before Tokenization:")
print(dataset["train"][0])  # First sample of the training dataset

# Apply tokenization
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Print a few samples after tokenization
print("\nAfter Tokenization:")
print(tokenized_dataset["train"][0])  # First sample of the tokenized training dataset


Before Tokenization:
{'bn': 'আমার ভাই ৬+১২৮ জিবি। ১৪ হাজার ৮০০ টাকা দিয়ে কিনছি গত কাল', 'rm': 'amr vai 6+128 gb. 14 hajar 800 tk diye kince goto kal'}


Map:   0%|          | 0/4505 [00:00<?, ? examples/s]


After Tokenization:
{'bn': 'আমার ভাই ৬+১২৮ জিবি। ১৪ হাজার ৮০০ টাকা দিয়ে কিনছি গত কাল', 'rm': 'amr vai 6+128 gb. 14 hajar 800 tk diye kince goto kal', 'input_ids': [728, 286, 2480, 259, 132593, 16420, 10105, 260, 818, 560, 3879, 6209, 259, 270, 314, 17001, 504, 16106, 259, 117000, 7757, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# Filter examples with overly short or long inputs/outputs
def filter_function(example):
    return 5 <= len(example["rm"]) <= 100 and 5 <= len(example["bn"]) <= 100

filtered_dataset = tokenized_dataset.filter(filter_function)


Filter:   0%|          | 0/4505 [00:00<?, ? examples/s]

Filter:   0%|          | 0/501 [00:00<?, ? examples/s]

In [None]:
# Check the size of the dataset before filtering
print("Before Filtering:")
print(f"Training examples: {len(tokenized_dataset['train'])}")
print(f"Validation examples: {len(tokenized_dataset['test'])}")
print(f"First example: {tokenized_dataset['train'][0]}")

# Apply the filter
filtered_dataset = tokenized_dataset.filter(filter_function)

# Check the size of the dataset after filtering
print("\nAfter Filtering:")
print(f"Training examples: {len(filtered_dataset['train'])}")
print(f"Validation examples: {len(filtered_dataset['test'])}")
print(f"First example: {filtered_dataset['train'][0]}")


Before Filtering:
Training examples: 4505
Validation examples: 501
First example: {'bn': 'আমার ভাই ৬+১২৮ জিবি। ১৪ হাজার ৮০০ টাকা দিয়ে কিনছি গত কাল', 'rm': 'amr vai 6+128 gb. 14 hajar 800 tk diye kince goto kal', 'input_ids': [728, 286, 2480, 259, 132593, 16420, 10105, 260, 818, 560, 3879, 6209, 259, 270, 314, 17001, 504, 16106, 259, 117000, 7757, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
from transformers import MT5ForConditionalGeneration

# Load the mT5 model
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

task-4

In [None]:
import os
import torch
from transformers import MT5ForConditionalGeneration, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Suppress tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use MPS for Metal on Mac M1 or fallback to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load the dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")
dataset = dataset["train"].train_test_split(test_size=0.1)

# Subsample the dataset to reduce size (limit to 100 examples for training and 20 for validation)
train_subset = dataset["train"].select(range(30))  # Take the first 100 examples
test_subset = dataset["test"].select(range(10))     # Take the first 20 examples

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small").to(device)

# Preprocess the dataset
def preprocess_function(examples):
    inputs = tokenizer(examples["rm"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["bn"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_train = train_subset.map(preprocess_function, batched=True)
tokenized_test = test_subset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save model checkpoints
    evaluation_strategy="epoch",    # Evaluate at the end of each epoch
    learning_rate=5e-5,             # Standard learning rate for fine-tuning
    per_device_train_batch_size=4,  # Smaller batch size for M1 memory constraints
    per_device_eval_batch_size=4,
    num_train_epochs=1,             # Reduced epochs for faster training
    save_strategy="epoch",
    logging_dir="./logs",
    fp16=False,                     # Mixed precision not supported on MPS
    optim="adamw_torch",
    report_to="none"
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./banglish-to-bengali-model")
tokenizer.save_pretrained("./banglish-to-bengali-model")
print("Model training complete and saved.")


Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,57.353191


Model training complete and saved.


In [None]:
pip install huggingface_hub




In [None]:
!pip install huggingface_hub




In [None]:
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load your saved model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("./banglish-to-bengali-model")
tokenizer = AutoTokenizer.from_pretrained("./banglish-to-bengali-model")

# Push to Hugging Face Hub
model.push_to_hub("rizon1326/banglish-to-bengali-model")
tokenizer.push_to_hub("rizon1326/banglish-to-bengali-model")


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rizon1326/banglish-to-bengali-model/commit/46c652deb50971cef9becb5062adbfb2fe929fe6', commit_message='Upload tokenizer', commit_description='', oid='46c652deb50971cef9becb5062adbfb2fe929fe6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rizon1326/banglish-to-bengali-model', endpoint='https://huggingface.co', repo_type='model', repo_id='rizon1326/banglish-to-bengali-model'), pr_revision=None, pr_num=None)

In [None]:
!zip -r banglish-to-bengali-model.zip ./banglish-to-bengali-model


  adding: banglish-to-bengali-model/ (stored 0%)
  adding: banglish-to-bengali-model/tokenizer_config.json (deflated 95%)
  adding: banglish-to-bengali-model/tokenizer.json (deflated 76%)
  adding: banglish-to-bengali-model/model.safetensors (deflated 38%)
  adding: banglish-to-bengali-model/generation_config.json (deflated 29%)
  adding: banglish-to-bengali-model/config.json (deflated 47%)
  adding: banglish-to-bengali-model/spiece.model (deflated 46%)
  adding: banglish-to-bengali-model/special_tokens_map.json (deflated 73%)
