In [1]:
import pandas as pd
import re
import os
import numpy as np
from string import punctuation
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import random
import tqdm

In [2]:
file_path = '/content/rus.txt'

In [3]:
df = pd.read_csv(file_path, delimiter = '\t', header = None)

In [4]:
df = df.iloc[:,:2]

In [5]:
df.rename(columns={0: "English", 1: "Russian"},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={0: "English", 1: "Russian"},inplace=True)


In [6]:
df.columns

Index(['English', 'Russian'], dtype='object')

In [7]:
df.head()

Unnamed: 0,English,Russian
0,Go.,Марш!
1,Go.,Иди.
2,Go.,Идите.
3,Hi.,Здравствуйте.
4,Hi.,Привет!


In [8]:
def CLEAN_DATA(data):
    # Replace no-break space with space
    data = data.replace("\u202f"," ")

    # converting the uppper case data to the lower case data
    data =  data.lower()

    #removing all punctuations
    for p in punctuation+"«»"+"0123456789":
        data = data.replace(p,"")

    # removing duplicate white space
    data = re.sub("\s+"," ",data)

    # Remove spaces at the beginning and at the end of the string
    data = data.strip()

    #add <sos> and <eos> in the data
    data = "<sos> " + data + " <eos>"

    return data



In [9]:
# applying the above functions to the data
df["English"] = df["English"].astype(str)
df["Russian"] = df["Russian"].astype(str)
df["English"] = df["English"].apply(CLEAN_DATA)
df["Russian"] = df["Russian"].apply(CLEAN_DATA)

In [10]:
df

Unnamed: 0,English,Russian
0,<sos> go <eos>,<sos> марш <eos>
1,<sos> go <eos>,<sos> иди <eos>
2,<sos> go <eos>,<sos> идите <eos>
3,<sos> hi <eos>,<sos> здравствуйте <eos>
4,<sos> hi <eos>,<sos> привет <eos>
...,...,...
363381,<sos> in todays world we have to equip all our...,<sos> в современном мире перед нами стоит зада...
363382,<sos> death is something that were often disco...,<sos> смерть это зачастую то разговоры или даж...
363383,<sos> at a moment when our economy is growing ...,<sos> в тот момент когда наша экономика растёт...
363384,<sos> since there are usually multiple website...,<sos> поскольку сайтов посвящённых какойлибо т...


In [11]:
eng_data = df["English"].values

In [12]:
rus_data = df["Russian"].values

In [13]:
def create_set(data):
    s = set()
    for i in data:
        for j in i.split(" "):
            s.add(j)
    return s

In [14]:
eng_data = create_set(eng_data)

In [15]:
rus_data = create_set(rus_data)

In [16]:
def word_to_idx(data):
    d = {}
    for i, word in enumerate(data):
        d[word] = i
    return d

In [17]:
# Here We are forming word to the idx relation
eng_to_idx = word_to_idx(eng_data)
rus_to_idx = word_to_idx(rus_data)

In [18]:
# for English
def eidx_based_encoding(string):
    array = []
    for i in string.split():
        array.append(eng_to_idx[i])
    return array


In [19]:
# for Russian
def ridx_based_encoding(string):
    array = []
    for i in string.split():
        array.append(rus_to_idx[i])
    return array

In [20]:
df["English"] = df["English"].apply(eidx_based_encoding)

In [21]:
df["Russian"] = df["Russian"].apply(ridx_based_encoding)

In [22]:
class Machine_translation(Dataset):
    def __init__(self, data_source, data_target):
        super(Machine_translation, self).__init__()
        self.data_source = data_source
        self.data_target = data_target

    def __len__(self):
        return len(self.data_source)

    def __getitem__(self, idx):
        source = torch.tensor(self.data_source[idx], dtype = torch.long)
        target = torch.tensor(self.data_target[idx], dtype = torch.long)
        return {"source":source , "target":target}

In [23]:
data_source = df["English"].values
data_target = df["Russian"].values

In [24]:
train_data = Machine_translation(data_source, data_target)

In [25]:
validation_size = int(len(train_data)*0.2)

In [26]:
train_data , validation_data = random_split(train_data, [len(train_data)-validation_size, validation_size])

In [27]:
load_train_data = DataLoader(train_data, batch_size = 1, shuffle = True)
load_validation_data = DataLoader(validation_data, batch_size = 1, shuffle = True)

In [28]:
# Encoder Block
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input_seq, hidden):
        input_seq = input_seq.unsqueeze(0)
        embedded = self.embedding(input_seq)
        output,hidden = self.lstm(embedded, hidden)
        return output, hidden

In [29]:
# Decoder Block
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq, hidden):
        input_seq = input_seq.unsqueeze(0)
        embedded = self.embedding(input_seq)
        output, hidden = self.lstm(embedded, hidden)
        prediction = self.out(output)
        return prediction, hidden

In [30]:
# Seq_to_Seq Model
class seq2seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder =  decoder
        self.device = device

    def forward(self, src, trg, teacher_force_ratio):
        enc_inp_size = len(src[0])
        hidden = (torch.zeros(1, self.encoder.hidden_size).to(self.device), torch.zeros(1, self.encoder.hidden_size).to(self.device))
        dec_inp_size = len(trg[0])
        dec_inp = trg[0][0]

        #tensor to store decoder outputs
        outputs = torch.zeros(1, dec_inp_size).to(self.device)
        outputs[0][0] = dec_inp

        for t in range(enc_inp_size):
            out, hidden = encoder(src[0][t], hidden)

        for t in range(1 , dec_inp_size):
            prediction, hidden = decoder(dec_inp, hidden)

            outputs[0][t] = prediction.argmax(1)

            teacher_force = random.random() < teacher_force_ratio

            top = prediction.argmax(1).squeeze(0)

            dec_inp =  trg[0][t] if teacher_force else top

        return outputs

In [31]:
input_size = len(eng_data)
output_size = len(rus_data)
hidden_size = 256

train_data = load_train_data
validation_data = load_validation_data

encoder = Encoder(input_size=input_size, hidden_size=hidden_size)
decoder = Decoder(hidden_size=hidden_size, output_size=output_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = seq2seq(encoder=encoder, decoder=decoder, device=device).to(device)

optimizer = optim.Adam(model.parameters(), lr = 0.001)

criterion = nn.CrossEntropyLoss()

In [32]:
# Training Function
def train_func(model, train_data, optimizer, criterion, teacher_force_ratio, device):
    model.train()
    epoch_loss = 0
    for i , batch in enumerate(train_data):
        src = batch["source"].to(device)
        trg = batch["target"].to(device)
        output = model(src, trg, teacher_force_ratio)
        # print("out:", output, "trg:", trg)

        optimizer.zero_grad()

        loss = criterion(trg.to(torch.float), output)
        loss.backward

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss/len(train_data)


In [33]:
# Validation Function
def valid_fun(model, validation_data, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(validation_data):
            src = batch["source"].to(device)
            trg = batch["target"].to(device)
            output = model(src, trg, 0)

            loss = criterion(trg.to(torch.float), output)

            epoch_loss += loss.item()
    return epoch_loss/len(validation_data)

In [36]:
Epoch = 10
teacher_force_ratio = 0.75
best_valid_loss = float("inf")
for epoch in tqdm.tqdm(range(Epoch)):
    train_loss = train_func(model, train_data, optimizer, criterion, teacher_force_ratio, device)
    val_loss = valid_fun(model, validation_data, criterion)
    print(f"\tTrain Loss: {train_loss:7.3f}")
    print(f"\tValid Loss: {val_loss:7.3f}")
