# Machine Translation using Sequence to Sequence

In [2]:
# Importing required libraries

In [3]:
import os
import re
import string
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn

In [4]:
# readin dataset

In [5]:
df = pd.read_table("./data/cmn.txt", names=['eng', 'chin', 'info'])
df.head()

Unnamed: 0,eng,chin,info
0,Hi.,嗨。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Hi.,你好。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Run.,你用跑的。,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
3,Wait!,等等！,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
4,Wait!,等一下！,CC-BY 2.0 (France) Attribution: tatoeba.org #1...


In [6]:
# getting shape of dataset
df.shape

(24026, 3)

In [7]:
# we do not need info so only taking eng and chin features

In [8]:
df = df[["eng", "chin"]]

In [9]:
df.head()

Unnamed: 0,eng,chin
0,Hi.,嗨。
1,Hi.,你好。
2,Run.,你用跑的。
3,Wait!,等等！
4,Wait!,等一下！


# Data Preprocessing

We are folowing some of the give preprocessing steps:
* Removing punctuations like . , ! $( ) * % @
* Removing URLs
* Removing Stop words
* Lower casing
* Tokenization
* Stemming
* Lemmatization

#### Removing Punctuation

In [10]:
# getting punctuation form string
set_of_punctuation = set(string.punctuation)
print(set_of_punctuation)

{'|', '$', ';', '@', '\\', '*', '#', '<', '!', '?', '+', '.', ')', '[', '&', "'", '_', '{', '~', '-', '>', '"', ']', '}', ',', ':', '^', '(', '%', '=', '`', '/'}


In [11]:
# define a function to remove punctuation form whole dataset
def remove_punctuation(text):
    punctuation_free_text = ''.join([char for char in text if char not in set_of_punctuation])
    return punctuation_free_text

In [12]:
#example
temp = df["eng"].apply(lambda x: remove_punctuation(x))
temp.head()

0      Hi
1      Hi
2     Run
3    Wait
4    Wait
Name: eng, dtype: object

#### Removing Uneven Spaces

In [13]:
# define a function to remove uneven spaces form text
def remove_uneven_spaces(text):
    uneven_space_free = re.sub(r"\s+", ' ', text.strip())
    return uneven_space_free

In [14]:
# example
text = "  This  is   an example    text   with uneven   spaces.  "
result = df["eng"].apply(lambda x: remove_uneven_spaces(text))
result.head()

0    This is an example text with uneven spaces.
1    This is an example text with uneven spaces.
2    This is an example text with uneven spaces.
3    This is an example text with uneven spaces.
4    This is an example text with uneven spaces.
Name: eng, dtype: object

#### Lower casing

In [15]:
# define a functio to convert text into lower case
def convert_lowercase(text):
    return text.lower()

In [16]:
# example
text = "My Name "

In [17]:
# Implementing all process for dataset

In [18]:
df["eng"] = df["eng"].apply(lambda x: convert_lowercase(x))
df["chin"] = df["chin"].apply(lambda x: convert_lowercase(x))

df["eng"] = df["eng"].apply(lambda x: remove_punctuation(x))
df["chin"] = df["chin"].apply(lambda x: remove_punctuation(x))

df["eng"] = df["eng"].apply(lambda x: remove_uneven_spaces(x))
df["chin"] = df["chin"].apply(lambda x: remove_uneven_spaces(x))

#### Tokenization

Tokenizing the English and the Chinese words in to set all_english_vocabs and all_chinese_vocabs.

In [19]:
all_english_vocabs = set()
for english in df["eng"]:
    words = english.split()
    for word in words:
        if word not in all_english_vocabs:
            all_english_vocabs.add(word)

In [20]:
all_chinese_vocabs = set()
for chinese in df["chin"]:
    words = chinese.split()
    for word in words:
        if word not in all_chinese_vocabs:
            all_chinese_vocabs.add(word)

# Model Building

Here, we are going to build encoder and decoder models seperatly and ensembel them to form seq2seq model


#### Building Encoder

In [21]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_p):
        super(Encoder, self).__init__()
        
        self.hidden_dim = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout_p)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout_p)
        
    
    def forward(self, x):
        # x.shape : (x.length, batch_size)
        embedding = self.dropout(self.embedding(x))
        # embedding : (x.length, batch_size, embed_dim)
        
        output, (hidden, cell)= self.lstm(embedding)
        return hidden, cell

#### Building Decoder

In [22]:
class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, embeddig_size, output_size, num_layers, dropout_p):
        super(Decoder, self).__init__()
        
        self.hidden_dim = hidden_size
        self.input_dim = input_size
        
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout_p)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout_p)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x,hidden, cell):
        # input = [batch size]
        x = x.unsqueeze(0)
        # x: [1, batch_size]
        
        embedding = self.dropout(self.embedding(x))
        #embedded = [1, batch size, emb dim]
        
        output, (hidden, cell) = self.lsmt(embedding)
        # output : [1, batch_size, hidden_size]
        
        prediction = self.fc(output)
        # predction: [1, N, batch_size, length_of_vocab]
        
        prediction = prediction.squeeze(0)
        
        return prediction, hidden, cell

#### Building Seq2Seq model

In [23]:
class Seq2SeqNet(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2SeqNet, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target, teacher_forcing_ratio=0.5):
        
        # getting batch_size, target_len and vocab_length
        batch_size = source[1]
        target_len = source[0]
        target_vocab_len = len(all_english_vocabs)
        
        # variable to hold output
        outputs = torch.zeros(target_len, batch_size)
        
        # Encoder
        hidden, cell = self.encoder(source)
        
        # Decoder
        #first input to the decoder is the <sos> tokens
        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #get the highest predicted token from our predictions
            best_prediction = output.argmax(1)
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            x = target[t] if teacher_force else top1

In [27]:
INPUT_DIM = len(english.vocab)
OUTPUT_DIM = len(chines.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

AttributeError: 'str' object has no attribute 'vocab'

In [28]:
# Hyperparameters

learning_rate = 0.001
step = 0

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

criterion = nn.CrossEntropyLoss()

NameError: name 'INPUT_DIM' is not defined

# Training Model