In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm


dataset = load_dataset("Aarif1430/english-to-hindi")
train_data = dataset["train"]

def tokenize(sentence):
    return sentence.lower().strip().split()

def build_vocab(sentences, max_vocab_size=10000):
    vocab = {"<pad>":0, "<sos>":1, "<eos>":2, "<unk>":3}
    word_freq = {}
    for sent in sentences:
        for word in tokenize(sent):
            word_freq[word] = word_freq.get(word, 0) + 1
    sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    for word, _ in sorted_words[:max_vocab_size - len(vocab)]:
        vocab[word] = len(vocab)
    return vocab

eng_sentences = [item['english_sentence'] for item in train_data]
hi_sentences  = [item['hindi_sentence'] for item in train_data]
eng_vocab = build_vocab(eng_sentences)
hi_vocab  = build_vocab(hi_sentences)


MAX_LEN = 30

def encode_sentence(sentence, vocab):
    tokens = ["<sos>"] + tokenize(sentence)[:MAX_LEN-2] + ["<eos>"]
    return [vocab.get(token, vocab["<unk>"]) for token in tokens]

def pad_tensor(tokens, max_len=MAX_LEN):
    padded = tokens + [0] * (max_len - len(tokens))
    return torch.tensor(padded[:max_len])

src_data = [pad_tensor(encode_sentence(s, eng_vocab)) for s in eng_sentences[:100000]]
trg_data = [pad_tensor(encode_sentence(s, hi_vocab)) for s in hi_sentences[:100000]]
src_tensor = torch.stack(src_data)
trg_tensor = torch.stack(trg_data)
dataset = TensorDataset(src_tensor, trg_tensor)


class Seq2Seq(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, emb_dim=256, hidden_dim=512, num_layers=1):
        super(Seq2Seq, self).__init__()
        self.encoder_embedding = nn.Embedding(input_vocab_size, emb_dim)
        self.encoder_lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder_embedding = nn.Embedding(output_vocab_size, emb_dim)
        self.decoder_lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_vocab_size)

    def forward(self, src, trg):
        embedded_src = self.encoder_embedding(src)
        _, (hidden, cell) = self.encoder_lstm(embedded_src)
        embedded_trg = self.decoder_embedding(trg)
        output, _ = self.decoder_lstm(embedded_trg, (hidden, cell))
        predictions = self.fc_out(output)
        return predictions


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Seq2Seq(len(eng_vocab), len(hi_vocab)).to(device)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.95)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

epochs = 5
total_steps = epochs * len(train_loader)
data_iter = iter(train_loader)

pbar = tqdm(range(1, total_steps + 1))

for step in pbar:
    try:
        src_batch, trg_batch = next(data_iter)
    except StopIteration:
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for val_src, val_trg in val_loader:
                val_src, val_trg = val_src.to(device), val_trg.to(device)
                val_output = model(val_src, val_trg[:, :-1])
                loss = loss_fn(val_output.reshape(-1, val_output.shape[-1]), val_trg[:, 1:].reshape(-1))
                val_loss += loss.item()
        avg_val_loss = val_loss / len(val_loader)
        current_epoch = (step - 1) // len(train_loader) + 1
        print(f"\n Epoch {current_epoch} validation loss: {avg_val_loss:.4f}")
        torch.save(model.state_dict(), f"model_epoch{current_epoch}.pt")
        print(f" Model saved as model_epoch{current_epoch}.pt\n")
        model.train()
        data_iter = iter(train_loader)
        src_batch, trg_batch = next(data_iter)

    src_batch, trg_batch = src_batch.to(device), trg_batch.to(device)
    optimizer.zero_grad()
    output = model(src_batch, trg_batch[:, :-1])
    loss = loss_fn(output.reshape(-1, output.shape[-1]), trg_batch[:, 1:].reshape(-1))
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    current_epoch = (step - 1) // len(train_loader) + 1
    pbar.set_postfix({'loss': f'{loss.item():.4f}', 'epoch': current_epoch})

 20%|█▉        | 2811/14065 [01:09<04:22, 42.82it/s, loss=4.5258, epoch=2]


 Epoch 2 validation loss: 4.5513
 Model saved as model_epoch2.pt



 40%|████      | 5626/14065 [02:19<03:19, 42.21it/s, loss=3.6555, epoch=3]


 Epoch 3 validation loss: 4.1141
 Model saved as model_epoch3.pt



 60%|██████    | 8441/14065 [03:29<15:56,  5.88it/s, loss=2.8080, epoch=4]


 Epoch 4 validation loss: 3.8948
 Model saved as model_epoch4.pt



 80%|███████▉  | 11251/14065 [04:38<01:06, 42.30it/s, loss=3.1244, epoch=5]


 Epoch 5 validation loss: 3.7765
 Model saved as model_epoch5.pt



100%|██████████| 14065/14065 [05:45<00:00, 40.68it/s, loss=2.7607, epoch=5]


In [32]:
model_path = "model_epoch5.pt"  
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()


id2word_hi = {v: k for k, v in hi_vocab.items()}


def translate_sentence(model, sentence, eng_vocab, hi_vocab, max_len=30):
    model.eval()
    with torch.no_grad():
        src = pad_tensor(encode_sentence(sentence, eng_vocab)).unsqueeze(0).to(device)
        trg = torch.tensor([[hi_vocab["<sos>"]]]).to(device)
        translated = []

        for _ in range(max_len):
            output = model(src, trg)
            next_token = output[0, -1].argmax().item()

            if next_token == hi_vocab["<eos>"]:
                break

            translated.append(next_token)
            trg = torch.cat([trg, torch.tensor([[next_token]]).to(device)], dim=1)

        return " ".join([id2word_hi.get(idx, "<unk>") for idx in translated])


test_sentence = "What is your name?"
translation = translate_sentence(model, test_sentence, eng_vocab, hi_vocab)
print("EN:", test_sentence)
print("HI:", translation)

EN: What is your name?
HI: क्या आप क्या कर सकते हैं ?
