In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel (a bare `!pip` uses whichever pip is first on PATH).
# TODO: pin an exact transformers version for reproducibility.
%pip install transformers



In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel. Pinned to the version this notebook was last run with
# (see the recorded install output) so re-runs resolve the same release.
%pip install datasets==3.1.0

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 480.6/480.6 kB 12.8 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 8.3 MB/s eta 0:00:00
Downloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# prompt: read hin_train.json and convert it to a dataframe by iterating over all rows

import pandas as pd
import json

def read_json_to_dataframe(file_path):
    """Load a JSON Lines file (one JSON document per line) into a DataFrame.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``pandas.DataFrame`` built from every line that parsed successfully,
        in file order. Blank lines are ignored; lines that are not valid JSON
        are reported on stdout (with their 1-based line number) and skipped.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (raised by ``open``).
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():
                # Blank/whitespace-only lines (e.g. a trailing newline) hold no
                # record; skip them instead of reporting a bogus decode error.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Best effort: report which line is malformed and keep going.
                print(f"Error decoding JSON line {line_number}: {e}")
    return pd.DataFrame(records)

# Load the training split; 'hin_train.json' is expected in the current directory.
try:
    train_df = read_json_to_dataframe('hin_train.json')
except FileNotFoundError:
    print("Error: hin_train.json not found. Please ensure the file exists in the current directory.")
else:
    # Loading succeeded: preview the first few rows of the DataFrame.
    print(train_df.head())



In [None]:
# prompt: read hin_train.json and convert it to a dataframe by iterating over all rows

import pandas as pd
import json

def read_json_to_dataframe(file_path):
    """Load a JSON Lines file (one JSON document per line) into a DataFrame.

    NOTE(review): an earlier cell of this notebook already defines a function
    with this name; this definition shadows it. Consider defining it once.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``pandas.DataFrame`` built from every line that parsed successfully,
        in file order. Blank lines are ignored; lines that are not valid JSON
        are reported on stdout (with their 1-based line number) and skipped.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (raised by ``open``).
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():
                # Blank/whitespace-only lines (e.g. a trailing newline) hold no
                # record; skip them instead of reporting a bogus decode error.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Best effort: report which line is malformed and keep going.
                print(f"Error decoding JSON line {line_number}: {e}")
    return pd.DataFrame(records)

# Load the test split (assuming 'hin_test.json' is in the current directory)
try:
    test_df = read_json_to_dataframe('hin_test.json')
    print(test_df.head())  # Print the first few rows of the DataFrame
except FileNotFoundError:
    # The message must name the file that was actually requested (the test
    # split), otherwise a missing test file is reported as a missing train file.
    print("Error: hin_test.json not found. Please ensure the file exists in the current directory.")



In [None]:
# Give the training columns script-based names: the native-script word becomes
# 'devnagari' and the English-letter word becomes 'romanized'.
train_df = train_df.rename(
    {'native word': 'devnagari', 'english word': 'romanized'},
    axis="columns",
)
print(train_df.head())  # Preview the renamed columns

In [None]:
# Apply the same column renaming to the test split so both frames share a schema.
test_df = test_df.rename(
    {'native word': 'devnagari', 'english word': 'romanized'},
    axis="columns",
)
print(test_df.head())  # Preview the renamed columns

In [None]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer of the "google/mt5-base" checkpoint. The
# preprocessing function reads this module-level `tokenizer` to encode both
# the romanized inputs and the Devanagari labels.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of romanized -> Devanagari pairs for seq2seq training.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with a "romanized" column (source strings) and
            a "devnagari" column (target strings).

    Returns:
        The tokenizer output for the source strings (padded/truncated to 128
        tokens) with an added "labels" entry: the target token ids
        (padded/truncated to 64 tokens) in which every padding position is
        replaced by -100.
    """
    model_inputs = tokenizer(
        examples["romanized"],
        max_length=128,  # Maximum length for input sequences
        padding="max_length",  # Pad shorter sequences to the maximum length
        truncation=True,  # Truncate longer sequences to the maximum length
    )
    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["devnagari"],
        max_length=64,  # Maximum length for labels
        padding="max_length",
        truncation=True,
    )
    # Labels are padded to a fixed length, so most positions are padding.
    # -100 is the index the loss ignores; without this masking the model is
    # trained mainly to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [None]:
from datasets import Dataset

# Wrap each pandas split in a Hugging Face `Dataset` so it can be mapped and
# handed to the trainer.
train_dataset = Dataset.from_pandas(df=train_df)
test_dataset = Dataset.from_pandas(df=test_df)

In [None]:
# Sanity check: show the first record of the converted training dataset.
print(train_dataset[0])

In [None]:
# Tokenize both splits; `batched=True` hands the preprocessing function whole
# batches of rows instead of one row at a time.
tokenized_train = train_dataset.map(function=preprocess_function, batched=True)
tokenized_test = test_dataset.map(function=preprocess_function, batched=True)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-finetuned",  # Directory to save the model
    # `eval_strategy` is the current name of this option; the old
    # `evaluation_strategy` spelling is deprecated (needs transformers >= 4.41).
    eval_strategy="epoch",  # Evaluate at the end of every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,  # Keep at most three checkpoints on disk
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=50,
)

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer

# Instantiate the pretrained mT5 sequence-to-sequence model to be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")

# Wire the model, tokenizer, hyperparameters and tokenized splits together.
# NOTE(review): the tokenized test split is used as the evaluation set here.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_test,
    train_dataset=tokenized_train,
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Persist the model weights and the tokenizer files into the same directory so
# the pair can be reloaded together with `from_pretrained`.
model.save_pretrained(save_directory="./mt5-transliteration")
tokenizer.save_pretrained(save_directory="./mt5-transliteration")