In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import pandas as pd

In [2]:
import zipfile

In [3]:
zip_file_path = '/content/Dataset_English_Hindi.csv.zip'
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('/content/')

In [4]:
data = "/content/Dataset_English_Hindi.csv"
df = pd.read_csv(data)

In [5]:
df_list = df.values.tolist()

In [6]:
df_list = df_list[:5000]

In [7]:
df_list[:10]

[['Help!', 'बचाओ!'],
 ['Jump.', 'उछलो.'],
 ['Jump.', 'कूदो.'],
 ['Jump.', 'छलांग.'],
 ['Hello!', 'नमस्ते।'],
 ['Hello!', 'नमस्कार।'],
 ['Cheers!', 'वाह-वाह!'],
 ['Cheers!', 'चियर्स!'],
 ['Got it?', 'समझे कि नहीं?'],
 ["I'm OK.", 'मैं ठीक हूँ।']]

In [8]:
import re

def preprocess_text(text):
  text = text.lower().strip()
  text = re.sub(r"[^\w\s\u0900-\u097f]", "", text)
  return text

In [9]:
def tokenize(text):
  return preprocess_text(text).split()

In [10]:
def build_vocab(sentences):
  vocab = {"<PAD>": 0, "<BOS>": 1, "<EOS>": 2, "<UNK>":3}
  for sentence in sentences:
    for word in tokenize(sentence):
      if word not in vocab:
        vocab[word] = len(vocab)
  return vocab

In [11]:
def sentence_to_tensor(sentence, vocab):
    tokens = ['<BOS>'] + tokenize(sentence) + ['<EOS>']
    return torch.tensor([vocab.get(token, vocab['<UNK>']) for token in tokens])

In [12]:
src_sentences = [str(list[0]) if pd.notna(list[0]) else "" for list in df_list]
trg_sentences = [str(list[1]) if pd.notna(list[1]) else "" for list in df_list]

In [13]:
src_vocab = build_vocab(src_sentences)
trg_vocab = build_vocab(trg_sentences)

In [14]:
len(src_vocab), len(trg_vocab)

(8416, 10337)

In [15]:
inv_trg_vocab = {idx: word for word, idx in trg_vocab.items()}

In [16]:
data = [(sentence_to_tensor(str(src), src_vocab), sentence_to_tensor(str(trg), trg_vocab)) for src, trg in df_list]

In [17]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_fn(batch):
  src_batch, trg_batch = zip(*batch)
  src_batch = pad_sequence(src_batch, batch_first = True, padding_value=src_vocab["<PAD>"])
  trg_batch = pad_sequence(trg_batch, batch_first = True, padding_value=trg_vocab["<PAD>"])
  return src_batch, trg_batch

In [18]:
loader = DataLoader(data, batch_size = 16, shuffle = True, collate_fn = collate_fn)

In [19]:
import torch.nn as nn

class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.GRU(emb_dim, hid_dim)

  def forward(self, src):
    embedded = self.embedding(src)
    batch_size = src.size(0)
    hidden = torch.zeros(1, batch_size, self.rnn.hidden_size).to(src.device)
    outputs, hidden = self.rnn(embedded)
    return hidden

class Decoder(nn.Module):
  def __init__(self, output_dim, emb_dim, hid_dim):
    super().__init__()
    self.embedding = nn.Embedding(output_dim, emb_dim)
    self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)
    self.fc = nn.Linear(hid_dim, output_dim)

  def forward(self, input, hidden):
    # input is (batch_size)
    batch_size = input.size(0) # Get the current batch size
    input = input.unsqueeze(1) # input is now (batch_size, 1)
    embedded = self.embedding(input) # embedded is (batch_size, 1, emb_dim)

    # Ensure the hidden state has the correct batch size
    # If hidden is None or its batch size doesn't match, initialize it
    if hidden is None or hidden.size(1) != batch_size:
        hidden = torch.zeros(1, batch_size, self.rnn.hidden_size).to(input.device)

    output, hidden = self.rnn(embedded, hidden) # output is (batch_size, 1, hid_dim), hidden is (1, batch_size, hid_dim)

    # Squeeze the sequence length dimension (1) before the linear layer
    prediction = self.fc(output.squeeze(1)) # prediction is (batch_size, output_dim)

    return prediction, hidden

class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self, src, trg, teacher_forcing_ratio = 0.5):
    batch_size = trg.shape[0] # Get batch size from trg shape
    trg_len = trg.shape[1] # Get sequence length from trg shape
    trg_vocab_size = len(trg_vocab)
    outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

    hidden = self.encoder(src)

    # Use the first target token (<bos>) as the initial input to the decoder
    input = trg[:, 0] # Get the first token for all batches

    for t in range(1, trg_len):
      # Pass the hidden state obtained from the previous decoder step
      output, hidden = self.decoder(input, hidden)
      outputs[t] = output
      top1 = output.argmax(1)
      input = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1

    # Permute outputs to be (batch_size, trg_len, trg_vocab_size)
    outputs = outputs.permute(1, 0, 2)

    return outputs

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
device

device(type='cuda')

In [22]:
INPUT_DIM = len(src_vocab)
OUTPUT_DIM = len(trg_vocab)
EMB_DIM = 32
HID_DIM = 64

In [23]:
INPUT_DIM, OUTPUT_DIM

(8416, 10337)

In [24]:
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM)
model = Seq2Seq(enc, dec, device).to(device)

In [25]:
!pip install torchinfo



In [26]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                   Param #
Seq2Seq                                  --
├─Encoder: 1-1                           --
│    └─Embedding: 2-1                    269,312
│    └─GRU: 2-2                          18,816
├─Decoder: 1-2                           --
│    └─Embedding: 2-3                    330,784
│    └─GRU: 2-4                          18,816
│    └─Linear: 2-5                       671,905
Total params: 1,309,633
Trainable params: 1,309,633
Non-trainable params: 0

In [27]:
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=trg_vocab['<PAD>'])

In [28]:
from tqdm.notebook import tqdm
for epoch in range(100):
  model.train()
  total_loss = 0
  for src, trg in tqdm(loader):
    src, trg = src.to(device), trg.to(device)
    optimizer.zero_grad()
    output = model(src, trg)
    output = output[:, 1:, :].reshape(-1, OUTPUT_DIM)
    trg = trg[:, 1:].reshape(-1)
    loss = criterion(output, trg)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Epoch 100 | Loss: 1355.4427


In [29]:
def translate(model, sentence):
  model.eval()
  src_tensor = sentence_to_tensor(sentence, src_vocab).unsqueeze(1).to(device)
  hidden = model.encoder(src_tensor)
  input = torch.tensor([trg_vocab['<BOS>']], device = device)
  result = []

  for _ in range(20):
    output, hidden = model.decoder(input, hidden)
    top1 = output.argmax(1)
    word = inv_trg_vocab[top1.item()]
    if word == "<EOS>":
      break
    result.append(word)
    input = top1

  return " ".join(result)

In [33]:
print(translate(model, "help"))

मैं के में


In [34]:
print(translate(model, "I'm tired now."))

मुझे नहीं पता कि वह की ज़रूरत है।


In [39]:
print(translate(model, "These questions are easy to answer."))

मुझे नहीं पता कि वह की ज़रूरत है।
