In [None]:

from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import torch
from transformers import AdamW
    

In [None]:

# 1. Load the Dataset
# Fetch the dataset through `datasets` and turn its 'train' split into a
# pandas DataFrame so it can be passed to scikit-learn's splitter.
dataset = load_dataset("SKNahin/bengali-transliteration-data")
data = dataset["train"].to_pandas()

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition identical from run to run.
data_train, data_val = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)
    

In [None]:

# 2. Data Preprocessing
def preprocess_data(data, tokenizer, max_length):
    """Tokenize the source ('rm') and target ('bn') text columns of ``data``.

    Args:
        data: Table with an ``'rm'`` column (model input text) and a ``'bn'``
            column (target text); each column must support ``.tolist()``.
            Presumably 'rm' is romanized Bengali and 'bn' is Bengali script --
            confirm against the dataset card.
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=...,
            padding=True, truncation=True, return_tensors="pt")`` whose result
            supports item access and assignment on ``'input_ids'``. If it
            exposes ``pad_token_id``, that id is masked out of the labels.
        max_length: Maximum number of tokens kept per sequence (longer
            sequences are truncated).

    Returns:
        ``(inputs, targets)``: the two tokenizer outputs. ``targets['input_ids']``
        is ready to be passed as ``labels``: every padding position holds -100
        so the loss skips it. It is therefore NOT suitable for decoding back to
        text without first restoring the pad id.
    """
    source_texts = data['rm'].tolist()
    target_texts = data['bn'].tolist()

    inputs = tokenizer(source_texts, max_length=max_length, padding=True, truncation=True, return_tensors="pt")
    targets = tokenizer(target_texts, max_length=max_length, padding=True, truncation=True, return_tensors="pt")

    # Padding in the labels must not contribute to the loss: -100 is the index
    # ignored by the cross-entropy loss used inside Hugging Face seq2seq models.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is not None:
        label_ids = targets['input_ids']
        targets['input_ids'] = label_ids.masked_fill(label_ids == pad_token_id, -100)

    return inputs, targets

# Initialize tokenizer
# NOTE(review): 't5-small' ships a SentencePiece vocabulary that may not cover
# Bengali script -- verify that encoded 'bn' targets are not mostly <unk>
# before trusting the reported losses.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Preprocess data
max_length = 128
train_inputs, train_targets = preprocess_data(data_train, tokenizer, max_length)
val_inputs, val_targets = preprocess_data(data_val, tokenizer, max_length)

# Create Dataloaders
# Each dataset item is an (input_ids, attention_mask, target input_ids) triple.
batch_size = 16
train_dataset = torch.utils.data.TensorDataset(
    train_inputs['input_ids'],
    train_inputs['attention_mask'],
    train_targets['input_ids'],
)
val_dataset = torch.utils.data.TensorDataset(
    val_inputs['input_ids'],
    val_inputs['attention_mask'],
    val_targets['input_ids'],
)

# A seeded generator makes the training shuffle order repeatable after a
# kernel restart, in line with the fixed random_state used for the split.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
    

In [None]:

# 3. Select a Model
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained weights named by `model_name` and move them to `device`.
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)
    

In [None]:

# 4. Train the Model
def train_model(model, train_loader, val_loader, epochs, lr):
    """Fine-tune ``model`` and print the mean train/validation loss per epoch.

    Args:
        model: ``torch.nn.Module`` called as ``model(input_ids=...,
            attention_mask=..., labels=...)`` whose output exposes a scalar
            ``.loss`` tensor. Batches are moved to the device of the model's
            first parameter.
        train_loader: Sized iterable yielding ``(input_ids, attention_mask,
            target_ids)`` tensor triples used for optimisation.
        val_loader: Sized iterable with the same batch layout, evaluated
            without gradients after every epoch.
        epochs: Number of full passes over ``train_loader``.
        lr: Learning rate passed to AdamW.

    Returns:
        dict with keys ``"train_loss"`` and ``"val_loss"``, each a list holding
        one mean loss per epoch. An epoch whose loader yielded no batches is
        recorded (and printed) as ``nan`` instead of raising ZeroDivisionError.
    """
    # Follow the model's own placement rather than relying on a notebook global.
    device = next(model.parameters()).device

    # torch.optim.AdamW replaces the deprecated transformers.AdamW;
    # weight_decay=0.0 keeps the default of the optimizer it replaces.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.0)

    history = {"train_loss": [], "val_loss": []}

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            input_ids, attention_mask, target_ids = [b.to(device) for b in batch]

            optimizer.zero_grad()

            # The model computes the loss itself when `labels` is supplied.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        num_train_batches = len(train_loader)
        avg_train_loss = train_loss / num_train_batches if num_train_batches else float("nan")
        history["train_loss"].append(avg_train_loss)
        print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, target_ids = [b.to(device) for b in batch]

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids)
                val_loss += outputs.loss.item()

        num_val_batches = len(val_loader)
        avg_val_loss = val_loss / num_val_batches if num_val_batches else float("nan")
        history["val_loss"].append(avg_val_loss)
        print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss}")

    return history

# Hyperparameters
epochs = 3  # full passes over the training loader
lr = 5e-5   # optimizer learning rate

# Train the model
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=epochs,
    lr=lr,
)
    