#Translator using seq2seq network


In [None]:
from __future__ import unicode_literals,print_function,division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
import  torch.nn.functional as F
from torch import optim

import numpy as np
from torch.utils.data import TensorDataset,DataLoader,RandomSampler
device=torch.device("cuda" if torch.cuda.is_available()else "cpu")

#loading data files



In [None]:
SOS_token=0
EOS_token=1

class Lang:
  def __init__(self,name):
    self.name=name
    self.word2index={}
    self.word2count={}
    self.index2word={0:"SOS",1:"EOS"}
    self.n_words = 2 # Count SOS and EOS

  def addsentence(self,sentence):
    for word in sentence.split(' '):
      self.addword(word)

  def addword(self,word):
    if word not in self.word2index:
      self.word2index[word]=self.n_words
      self.word2count[word]=1
      self.index2word[self.n_words]=word
      self.n_words+=1
    else:
      self.word2count[word]+=1

The files are all in Unicode, to simplify we will turn Unicode characters to ASCII, make everything lowercase, and trim most punctuation.

In [None]:
# Turn a Unicode string to plain ASCII
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD',s)
        if unicodedata.category(c)!='Mn'
    )
# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s=unicodeToAscii(s.lower().strip())
    # Replace common French contractions with space before the letter
    s = re.sub(r"l'", r"l '", s)
    s = re.sub(r"n'", r"n '", s)
    s=re.sub(r"([.!?])",r" \1",s)
    s=re.sub(r"[^a-zA-Z.!? ]+",r"",s)
    return s.strip()

In [None]:
import os

def readlangs(lang1,lang2,reverse=False):
  print("reading lines...")
#read the file and split into lines
  filepath = '/content/data_extracted/data/eng-fra.txt'
  if not os.path.exists(filepath):
      print(f"Error: File not found at {filepath}")
      return None, None, []
  if not os.path.isfile(filepath):
      print(f"Error: Path is a directory, not a file: {filepath}")
      return None, None, []

  lines=open(filepath,encoding='utf-8').\
read().strip().split('\n')

  pairs=[[normalizeString(s)for s in l.split('\t')]for l in lines]



  if reverse:
    pairs=[list(reversed(p))for p in pairs]
    input_lang=Lang(lang2)
    output_lang=Lang(lang1)
  else:
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

  return input_lang, output_lang, pairs

Since there are a lot of example sentences and we want to train something quickly, we'll trim the data set to only relatively short and simple sentences. Here the maximum length is 10 words (that includes ending punctuation) and we're filtering to sentences that translate to the form "I am" or "He is" etc. (accounting for apostrophes replaced earlier).

In [None]:
MAX_LENGTH=20

eng_prefixes=(
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterpair(p):
  return len(p[0].split(' '))<MAX_LENGTH and \
  len(p[1].split(' '))<MAX_LENGTH


def filterpairs(pairs):
    return [pair for pair in pairs if filterpair(pair)]

In [None]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readlangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    print(f"Pairs before filtering: {pairs[:5]}") # Add this line to inspect pairs before filtering
    pairs = filterpairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print(f"Pairs before adding sentences: {pairs[:5]}")
    print("Counting words...")
    for pair in pairs:
        input_lang.addsentence(pair[0])
        output_lang.addsentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

reading lines...
Read 135842 sentence pairs
Pairs before filtering: [['va  !', 'go .'], ['cours !', 'run !'], ['courez !', 'run !'], ['ca alors !', 'wow !'], ['au feu  !', 'fire !']]
Trimmed to 135342 sentence pairs
Pairs before adding sentences: [['va  !', 'go .'], ['cours !', 'run !'], ['courez !', 'run !'], ['ca alors !', 'wow !'], ['au feu  !', 'fire !']]
Counting words...
Counted words:
fra 25861
eng 13334
['quelle est ta date de naissance ?', 'whats your date of birth ?']


# Defining the Models
Now that the data is prepared, we can define our Encoder and Decoder models. We will use the `EncoderRNN` and `AttenDecoder` classes defined earlier.

In [None]:
hidden_size = 256
encoder = EncoderRNN(input_lang.n_words, hidden_size, dropout_p=0.1).to(device)
decoder = AttenDecoder(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

print(encoder)
print(decoder)

EncoderRNN(
  (embedding): Embedding(25861, 256, padding_idx=0)
  (gru): GRU(256, 256, batch_first=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
AttenDecoder(
  (embedding): Embedding(13334, 256, padding_idx=0)
  (attention): BahdanauAttention(
    (Wa): Linear(in_features=256, out_features=256, bias=True)
    (Ua): Linear(in_features=256, out_features=256, bias=True)
    (Va): Linear(in_features=256, out_features=1, bias=True)
  )
  (gru): GRU(512, 256, batch_first=True)
  (out): Linear(in_features=256, out_features=13334, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


# Decoder

In [None]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        output = self.embedding(input).unsqueeze(1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output.squeeze(1))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
import zipfile
import os

zip_file_path = '/content/data.zip'
extract_path = '/content/data_extracted'


os.makedirs(extract_path, exist_ok=True)


with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:

    print("Contents of the zip file:", zip_ref.namelist())
    zip_ref.extractall(extract_path)

print(f"File extracted to: {extract_path}")


extracted_files = os.listdir(extract_path)
print("Extracted files:", extracted_files)

Contents of the zip file: ['data/', 'data/eng-fra.txt', 'data/names/', 'data/names/Arabic.txt', 'data/names/Chinese.txt', 'data/names/Czech.txt', 'data/names/Dutch.txt', 'data/names/English.txt', 'data/names/French.txt', 'data/names/German.txt', 'data/names/Greek.txt', 'data/names/Irish.txt', 'data/names/Italian.txt', 'data/names/Japanese.txt', 'data/names/Korean.txt', 'data/names/Polish.txt', 'data/names/Portuguese.txt', 'data/names/Russian.txt', 'data/names/Scottish.txt', 'data/names/Spanish.txt', 'data/names/Vietnamese.txt']
File extracted to: /content/data_extracted
Extracted files: ['data']


#encoder

In [None]:
import torch.nn as nn

class EncoderRNN(nn.Module):
  def __init__(self,input_size,hidden_size,dropout_p=0.1, padding_idx=0):
    super(EncoderRNN,self).__init__()
    self.hidden_size=hidden_size

    self.embedding=nn.Embedding(input_size,hidden_size, padding_idx=padding_idx)
    self.gru=nn.GRU(hidden_size,hidden_size,batch_first=True)
    self.dropout=nn.Dropout(dropout_p)

  def forward(self,input, input_lengths):
      embedded=self.dropout(self.embedding(input))



      packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu(), batch_first=True, enforce_sorted=True)

      output,hidden=self.gru(packed_embedded)



      output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True, padding_value=self.embedding.padding_idx, total_length=input.size(1))

      return output,hidden

#decoder

In [None]:
class DecoderRNN(nn.Module):
  def __init__(self,hidden_size,output_size):
    super(DecoderRNN,self).__init__()
    self.embedding = nn.Embedding(output_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
    batch_size=encoder_outputs.size(0)
    decoder_input=torch.empty(batch_size,1,dtype=torch.long,device=device).fill(SOS_token)
    decoder_hidden=encoder_hidden
    decoder_outputs=[]

    for i in range(MAX_LENGTH):
        decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
        decoder_outputs.append(decoder_output)

        if target_tensor is not None:
            decoder_input = target_tensor[:, i].unsqueeze(1)
        else:
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze(-1).detach()

    decoder_outputs = torch.cat(decoder_outputs, dim=1)
    decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
    return decoder_outputs, decoder_hidden, None

  def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

#Attention decoder

In [None]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward (self,query,keys):
      scores=self.Va(torch.tanh(self.Wa(query)+self.Ua(keys)))
      scores=scores.squeeze(2).unsqueeze(1)

      weights=F.softmax(scores,dim=1)
      context=torch.bmm(weights,keys)

      return context, weights

class AttenDecoder(nn.Module):
  def __init__(self,hidden_size,output_size,dropout_p=0.1, padding_idx=0):
    super(AttenDecoder,self).__init__()

    self.embedding=nn.Embedding(output_size,hidden_size, padding_idx=padding_idx)
    self.attention=BahdanauAttention(hidden_size)
    self.gru=nn.GRU(2*hidden_size,hidden_size,batch_first=True)
    self.out=nn.Linear(hidden_size,output_size)
    self.dropout=nn.Dropout(dropout_p)

  def forward(self,encoder_outputs,encoder_hidden,target_tensor=None):
    batch_size=encoder_outputs.size(0)
    decoder_input=torch.empty(batch_size,1,dtype=torch.long,device=device).fill_(SOS_token)
    decoder_hidden=encoder_hidden
    decoder_outputs=[]
    attentions=[]

    for i in range(MAX_LENGTH):

      if target_tensor is not None and i >= target_tensor.size(1):
          break

      decoder_output,decoder_hidden,attn_weights=self.forward_step(
          decoder_input,decoder_hidden,encoder_outputs
      )

      decoder_outputs.append(decoder_output)
      attentions.append(attn_weights)

      if target_tensor is not None:
        decoder_input=target_tensor[:,i].unsqueeze(1)
      else:
        _,topi=decoder_output.topk(1)
        decoder_input=topi.squeeze(-1).detach()

    decoder_outputs = torch.cat(decoder_outputs, dim=1)
    decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)

    return decoder_outputs, decoder_hidden, attentions


  def forward_step(self,input,hidden,encoder_outputs):
    embedded=self.dropout(self.embedding(input))

    query=hidden.permute(1,0,2)
    context,attn_weights=self.attention(query,encoder_outputs)
    input_gru=torch.cat((embedded,context),dim=2)

    output,hidden=self.gru(input_gru,hidden)
    output=self.out(output)

    return output,hidden,attn_weights

#Training


In [None]:
#first i will append EOS token to both sequences to get input tensorr and target tensor

def indexesFromSentence(lang,sentence):
  return[lang.word2index[word]for word in sentence.split(' ')]
def tensorFromSentence(lang,sentence):
  indexes=indexesFromSentence(lang,sentence)
  indexes.append(EOS_token)
  return torch.tensor(indexes,dtype=torch.long,device=device)
def tensorsFromPair(pair, input_lang, output_lang):
    input_tensor=tensorFromSentence(input_lang,pair[0])
    target_tensor=tensorFromSentence(output_lang,pair[1])
    return(input_tensor,target_tensor)

def collate_fn(batch):
    input_batch = [item[0] for item in batch]
    target_batch = [item[1] for item in batch]


    input_lengths = torch.tensor([len(seq) for seq in input_batch], dtype=torch.long, device=device)

    # Pad sequences
    input_batch = torch.nn.utils.rnn.pad_sequence(input_batch, batch_first=True, padding_value=0)
    target_batch = torch.nn.utils.rnn.pad_sequence(target_batch, batch_first=True, padding_value=0)

    input_lengths, sorted_idx = input_lengths.sort(descending=True)
    input_batch = input_batch[sorted_idx]
    target_batch = target_batch[sorted_idx]


    return input_batch.to(device), target_batch.to(device), input_lengths

def getdataloader(batch_size, input_lang, output_lang, pairs):
    n=len(pairs)


    train_data = [tensorsFromPair(pair, input_lang, output_lang) for pair in pairs]

    train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=RandomSampler(train_data), collate_fn=collate_fn)
    return train_dataloader


#now its time  for actual training process

In [None]:
def train_epoch(dataloader,encoder,decoder,encoder_optimizer,decoder_optimizer,criterion):
  total_loss=0
  for data in dataloader:
    print(f"Data from dataloader: {data}")
    input_tensor = data[0] # Access input tensor by index
    target_tensor = data[1] # Access target tensor by index
    input_lengths = data[2] # Access input lengths by index

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_outputs,encoder_hidden=encoder(input_tensor, input_lengths) # Pass input_lengths to encoder
    decoder_outputs,_, _ =decoder(encoder_outputs,encoder_hidden,target_tensor)
    loss=criterion(
        decoder_outputs.view(-1,decoder_outputs.size(-1)),
        target_tensor.view(-1)
    )

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    total_loss+=loss.item()

  return total_loss/len(dataloader)

In [None]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
def train(train_dataloader,encoder,decoder,n_epochs,learning_rate=0.01,print_every=100,plot_every=100):

  start=time.time()
  plot_losses=[]

  print_loss_total=0  # Initialized print_loss_total
  plot_loss_total=0   # Initialized plot_loss_total

  encoder_optimizer=optim.Adam(encoder.parameters(),lr=learning_rate)
  decoder_optimizer=optim.Adam(decoder.parameters(),lr=learning_rate)
  criterion=nn.NLLLoss()

  for epoch in range(1,n_epochs+1):
    loss=train_epoch(train_dataloader,encoder,decoder,encoder_optimizer,decoder_optimizer,criterion)
    print_loss_total+=loss
    plot_loss_total+=loss

    if epoch % print_every == 0:
      print_loss_avg=print_loss_total/print_every
      print_loss_total=0
      print('%s (%d %d%%) %4f'% (timeSince(start,epoch/n_epochs),
                                 epoch,epoch/n_epochs*100, print_loss_avg))

    if epoch % plot_every ==0:
      plot_loss_avg=plot_loss_total/plot_every
      plot_losses.append(plot_loss_avg)
      plot_loss_total=0



In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

#evaluation

In [None]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        # Get the input length for the single sentence
        input_length = torch.tensor([input_tensor.size(0)], device=device)

        # Pass input_length to the encoder
        encoder_outputs, encoder_hidden = encoder(input_tensor.unsqueeze(0), input_length)

        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('')
            else:
                decoded_words.append(output_lang.index2word[idx.item()])

        return decoded_words, decoder_attn

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
batch_size=64
hidden_size=128
train_dataloader = getdataloader(batch_size, input_lang, output_lang, pairs)
encoder=EncoderRNN(input_lang.n_words,hidden_size, padding_idx=0).to(device)
decoder=AttenDecoder(hidden_size,output_lang.n_words, padding_idx=0).to(device)
train(train_dataloader,encoder,decoder,80,print_every=5,plot_every=5)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
             0,     0,     0,     0],
        [ 1667,   765,  5957,  1865,    37,     1,     0,     0,     0,     0,
             0,     0,     0,     0],
        [ 2176,  2481,   358,     3,    37,     1,     0,     0,     0,     0,
             0,     0,     0,     0],
        [ 1041,  2240,  2269,     3,    37,     1,     0,     0,     0,     0,
             0,     0,     0,     0],
        [   23,  1424,   815,    14,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [ 1667,   124,  1546,    37,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [   63,  2040,   389,    14,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  198,  8500,    14,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [ 2050,     3,     4,     1,     0,     0,     0,     0,     0,     0,

In [None]:
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

> je veux vraiment que ce travail soit fait .
= i really want to get this work done .
< i really want to do this work .  SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> contemoi ton histoire . je suis tout ouie .
= tell me your story . i am all ears .
< im amazed how hungry felt your wife .  SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> que pensezvous de mon costume  ?
= what do you think of my costume ?
< what do you think about my costume ?  SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> les enfants sont decus .
= the kids are disappointed .
< the new ones below zero .  SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> nous organisames une fete dadieu pour elle .
= we organized a farewell party for her .
< she took a gentle party .  SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> je suis pret a y aller maintenant .
= im ready to go now .
< im ready now in the job .  SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> mange tous tes epinards .
= eat up all your spinach !
< 