In [None]:
# Mount Google Drive at /content/drive/ (relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!cp /content/drive/MyDrive/DL/Project/raw.zip .

In [None]:
!unzip raw.zip

Archive:  raw.zip
   creating: raw/
  inflating: raw/french.txt          
  inflating: raw/english.txt         
  inflating: raw/fra.txt             


In [None]:
import pandas as pd
from typing import List
import torch

In [None]:
# Build a parallel English/French corpus from two line-aligned text files
# (line i of english.txt pairs with line i of french.txt) and persist it as CSV.
# The encoding is given explicitly so accented French text is decoded the same
# way on every platform — assumes the corpus files are UTF-8; TODO confirm.
with open("./raw/english.txt", "r", encoding="utf-8") as e_open, \
        open("./raw/french.txt", "r", encoding="utf-8") as f_open:
    # Remove only the line terminator. Slicing off the last character
    # unconditionally would truncate a final line that has no trailing newline.
    eng = [line.rstrip("\n") for line in e_open]
    fr = [line.rstrip("\n") for line in f_open]

assert len(eng) == len(fr), "Lengths of source and target must be same"

data = {'eng': eng, 'fr': fr}
ds = pd.DataFrame(data=data)

# Write the data to a csv file
ds.to_csv("./eng-fr.csv")

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [None]:
# Use the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Reserved vocabulary indices: Lang maps index 0 to "SOS" and index 1 to "EOS".
SOS_token = 0  # fed to the decoder as its very first input
EOS_token = 1  # appended to every sentence tensor; greedy decoding stops on it

In [None]:
class Lang:
    """Vocabulary for a single language.

    Three tables are kept in sync: word -> index, word -> occurrence count and
    index -> word. Indices 0 and 1 are pre-assigned to the "SOS" and "EOS"
    markers, so the first real word receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Starts at 2 because SOS and EOS already occupy the first two slots.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every token of ``sentence``; tokens are split on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count one occurrence of ``word``, giving it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        next_index = self.n_words
        self.word2index[word] = next_index
        self.word2count[word] = 1
        self.index2word[next_index] = word
        self.n_words = next_index + 1

In [None]:
def unicodeToAscii(s):
    """Strip diacritics from ``s``.

    The string is NFD-decomposed and every combining mark (Unicode category
    ``Mn``) is dropped, so an accented letter collapses to its base letter.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


def normalizeString(s):
    """Lowercase, trim, and remove non-letter characters.

    Steps: lowercase and strip ``s``, remove diacritics, put a space in front
    of each ``.``, ``!`` or ``?``, then replace every run of characters other
    than ASCII letters and those three punctuation marks with a single space.

    Returns:
        The normalized sentence with no leading or trailing whitespace, so
        splitting it on ``' '`` never yields an empty first or last token.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # Leading/trailing non-letters (quotes, digits, brackets, ...) were just
    # turned into spaces; strip them so they do not become '' vocabulary tokens.
    return s.strip()

In [None]:
def readLangs(df, lang1, lang2, reverse=False):
    """Turn a DataFrame of sentence pairs into normalized pairs plus two empty vocabularies.

    Args:
        df: DataFrame whose ``itertuples()`` rows unpack into exactly
            ``(index, first_sentence, second_sentence)``, i.e. two columns.
        lang1: Name given to the ``Lang`` of the first column.
        lang2: Name given to the ``Lang`` of the second column.
        reverse: When true, each pair is flipped and the two languages swap roles.

    Returns:
        ``(input_lang, output_lang, pairs)`` where ``pairs`` is a list of
        two-element lists of normalized sentences. The ``Lang`` objects are
        freshly created and contain no words yet.
    """
    print("Reading lines...")

    # The DataFrame index (first tuple field) is ignored.
    pairs = []
    for _, first_sentence, second_sentence in df.itertuples():
        pairs.append([normalizeString(first_sentence), normalizeString(second_sentence)])

    if not reverse:
        return Lang(lang1), Lang(lang2), pairs

    # Reversed direction: second column becomes the input side.
    pairs = [pair[::-1] for pair in pairs]
    return Lang(lang2), Lang(lang1), pairs

In [None]:
# Token-count cut-off applied by filterPair, and the default `max_length` of
# AttnDecoderRNN, train() and evaluate().
MAX_LENGTH = 30

# English sentence prefixes. In the code visible here they are only mentioned
# in the commented-out condition of filterPair, so they are currently unused.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

In [None]:
def filterPair(p):
    """Return True when both sentences of pair ``p`` have fewer than MAX_LENGTH tokens.

    Tokens are counted by splitting each sentence on single spaces. The
    ``eng_prefixes`` restriction is not applied.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    """Return a new list with only the pairs accepted by filterPair, in their original order."""
    return list(filter(filterPair, pairs))

In [None]:
def prepareData(df, lang1, lang2, reverse=False):
    """Read, filter and index the sentence pairs in ``df``.

    Pairs are produced by readLangs, trimmed with filterPairs, and every
    surviving sentence is added to the vocabulary of its side. Progress and
    final vocabulary sizes are printed along the way.

    Returns:
        ``(input_lang, output_lang, pairs)`` with both vocabularies populated
        and ``pairs`` holding only the pairs that passed the filter.
    """
    input_lang, output_lang, pairs = readLangs(df, lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))

    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...")
    for source_sentence, target_sentence in pairs:
        input_lang.addSentence(source_sentence)
        output_lang.addSentence(target_sentence)

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)

    return input_lang, output_lang, pairs


In [None]:
# Build both vocabularies and the filtered pair list from `ds`
# (first column named 'eng', second 'fra', direction not reversed).
input_lang, output_lang, pairs = prepareData(ds, 'eng', 'fra', False)
# Spot-check one randomly chosen pair.
print(random.choice(pairs))

Reading lines...
Read 170651 sentence pairs
Trimmed to 170600 sentence pairs
Counting words...
Counted words:
eng 13737
fra 22586
['tom is here for the weekend .', 'tom est la pour le week end .']


In [None]:
# _, _, pairs=readLangs(x,'eng', 'fra')

Reading lines...


In [None]:
pairs[:10]

[['go .', 'va !'],
 ['hi .', 'salut !'],
 ['hi .', 'salut .'],
 ['run !', 'cours !'],
 ['run !', 'courez !'],
 ['who ?', 'qui ?'],
 ['wow !', 'ca alors !'],
 ['fire !', 'au feu !'],
 ['help !', 'a l aide !'],
 ['jump .', 'saute .']]

In [None]:
filterPair(pairs[0])

True

In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per forward call.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Width of both the embeddings and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Embedding width equals the GRU input width, so embeddings feed the GRU directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input``, reshape it to (1, 1, -1) and advance the GRU by one step.

        Returns:
            The ``(output, hidden)`` tuple produced by the GRU.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(*state_shape, device=device)

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed-length buffer of encoder outputs.

    Args:
        hidden_size: Width of the embeddings and of the GRU state.
        output_size: Size of the output vocabulary.
        dropout_p: Dropout probability applied to the embedded input token.
        max_length: Number of attention slots, i.e. the number of rows the
            ``encoder_outputs`` argument of ``forward`` must have.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Scores one weight per encoder position from [embedded token ; hidden state].
        self.attn = nn.Linear(2 * hidden_size, max_length)
        # Projects [embedded token ; attention context] back to hidden_size.
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Returns:
            ``(log_probs, hidden, attn_weights)``: log-softmax scores over the
            output vocabulary, the updated GRU hidden state, and the softmaxed
            attention weights used for this step.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))
        token_vec = embedded[0]

        attn_scores = self.attn(torch.cat((token_vec, hidden[0]), 1))
        attn_weights = F.softmax(attn_scores, dim=1)
        # Weighted sum of the encoder outputs using the attention weights.
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = self.attn_combine(torch.cat((token_vec, context[0]), 1))
        gru_input = F.relu(combined.unsqueeze(0))
        output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(*state_shape, device=device)

In [None]:
def indexesFromSentence(lang, sentence):
    """Map each space-separated token of ``sentence`` to its index in ``lang.word2index``.

    Raises:
        KeyError: If a token is not present in ``lang.word2index``.
    """
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lang.word2index[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a long tensor of token indices followed by EOS_token.

    The result lives on ``device`` and is reshaped with ``view(-1, 1)``, i.e.
    one index per row.
    """
    token_ids = indexesFromSentence(lang, sentence)
    token_ids.append(EOS_token)
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Encode a sentence pair as ``(input_tensor, target_tensor)``.

    ``pair[0]`` is encoded with the global ``input_lang`` and ``pair[1]`` with
    the global ``output_lang``.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (
        tensorFromSentence(input_lang, source_sentence),
        tensorFromSentence(output_lang, target_sentence),
    )

In [None]:
# Probability with which train() feeds the ground-truth target token (instead of
# the decoder's own prediction) as the next decoder input for a given sample.
teacher_forcing_ratio = 0.5

In [None]:
# Despite the plural name, this is the (input_tensor, target_tensor) tuple of ONE random pair.
training_pairs = tensorsFromPair(random.choice(pairs))

In [None]:
training_pairs

(tensor([[ 47],
         [643],
         [227],
         [562],
         [276],
         [238],
         [337],
         [  3],
         [  1]], device='cuda:0'), tensor([[ 102],
         [  88],
         [  14],
         [ 165],
         [  76],
         [5823],
         [ 750],
         [   5],
         [   1]], device='cuda:0'))

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# NOTE(review): `training_pairs` is assigned above as the 2-tuple of tensors for a
# single sentence pair, so this loader wraps a dataset of length 2. With
# batch_size=64 and drop_last=True it would yield no batches. Nothing in the
# visible code iterates `train_loader`; training runs through trainIters()/train()
# one pair at a time. Confirm whether this cell is still needed.
batch_size = 64
train_loader = DataLoader(
    training_pairs,
    sampler=RandomSampler(training_pairs),
    batch_size=batch_size,
    drop_last=True
)


In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) sentence pair.

    The input is encoded token by token, then the decoder is unrolled over the
    target. With probability ``teacher_forcing_ratio`` the ground-truth token
    is fed back at each step; otherwise the decoder's own top prediction is fed
    back and decoding stops as soon as it predicts ``EOS_token``.

    Args:
        input_tensor: Token indices of the source sentence, one per row.
        target_tensor: Token indices of the target sentence, one per row.
        encoder: Encoder module exposing ``initHidden()`` and ``hidden_size``.
        decoder: Decoder module called as ``decoder(input, hidden, encoder_outputs)``.
        encoder_optimizer: Optimizer over the encoder parameters.
        decoder_optimizer: Optimizer over the decoder parameters.
        criterion: Loss applied to each decoder output and its target token.
        max_length: Number of rows in the encoder-output buffer; the input must
            not have more rows than this.

    Returns:
        The summed loss divided by the number of decoder steps actually scored.
    """
    # Ensure training behaviour (e.g. dropout) even if the modules were
    # switched to eval mode elsewhere.
    encoder.train()
    decoder.train()

    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Fixed-size buffer; rows past input_length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    scored_steps = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])
        scored_steps += 1

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input.
            decoder_input = target_tensor[di]
        else:
            # Free running: feed back the prediction, detached from the graph.
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Average over the steps that contributed to the loss. Dividing by
    # target_length would under-report the loss whenever decoding stopped early.
    return loss.item() / scored_steps

In [None]:
import time
import math


def asMinutes(s):
    """Format ``s`` seconds as ``'<minutes>m <seconds>s'`` using integer formatting for both parts."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work completed so far; the total duration is
            estimated as elapsed / percent. Must be non-zero.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on ``n_iters`` randomly drawn sentence pairs, one pair per step.

    New SGD optimizers and an NLLLoss criterion are created on every call.
    The running average loss is printed every ``print_every`` steps, recorded
    every ``plot_every`` steps, and the recorded averages are plotted with
    showPlot at the end.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # running sum, cleared after each progress line
    plot_loss_total = 0  # running sum, cleared after each plotted point

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    # All samples are drawn with random.choice and tensorised before the loop starts.
    training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(f'{timeSince(start, step / n_iters)} ({step} -- {step / n_iters * 100}%) {print_loss_avg:.4f}')

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot ``points`` as a line chart with major y-axis ticks every 0.2."""
    # plt.subplots() already creates the figure. A preceding plt.figure()
    # would only open a second, empty figure that is never drawn on.
    _, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate ``sentence`` and collect the attention weights.

    Both modules are put in eval mode for the duration of the call, so dropout
    is disabled and decoding is deterministic. Their previous training/eval
    mode is restored afterwards.

    Args:
        encoder: Encoder module exposing ``initHidden()`` and ``hidden_size``.
        decoder: Decoder module called as ``decoder(input, hidden, encoder_outputs)``.
        sentence: Space-separated source sentence whose tokens are all in ``input_lang``.
        max_length: Size of the encoder-output buffer and maximum number of decoding steps.

    Returns:
        ``(decoded_words, attentions)``: the predicted words (ending with
        ``'<EOS>'`` when EOS was predicted within ``max_length`` steps) and one
        row of attention weights per decoding step.

    Raises:
        KeyError: If ``sentence`` contains a token missing from ``input_lang``.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            # Fixed-size buffer; rows past input_length stay zero.
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])

                # Greedy decoding: feed the top prediction back in.
                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        # Restore whatever mode the caller had the modules in.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Translate ``n`` randomly chosen pairs and print source (>), reference (=) and output (<)."""
    for _ in range(n):
        sample = random.choice(pairs)
        source_sentence, reference_sentence = sample[0], sample[1]
        print('>', source_sentence)
        print('=', reference_sentence)
        output_words, _ = evaluate(encoder, decoder, source_sentence)
        print('<', ' '.join(output_words))
        print('')

In [None]:
# Width shared by the embeddings and GRU state of both encoder and decoder.
hidden_size = 256
# Vocabulary sizes come from the Lang objects built by prepareData.
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# 170600 matches the "Trimmed to 170600 sentence pairs" count in the recorded
# output above. NOTE(review): pairs are drawn with random.choice inside
# trainIters, so this is not one exact pass over `pairs`.
trainIters(encoder1, attn_decoder1, 170600, print_every=5000)

1m 57s (- 64m 45s) (5000 -- 2.9308323563892147%) 4.7653
3m 47s (- 60m 57s) (10000 -- 5.861664712778429%) 4.2278
5m 39s (- 58m 37s) (15000 -- 8.792497069167643%) 3.9511
7m 31s (- 56m 42s) (20000 -- 11.723329425556859%) 3.7830
9m 24s (- 54m 49s) (25000 -- 14.654161781946073%) 3.6443
11m 18s (- 53m 1s) (30000 -- 17.584994138335286%) 3.5431
13m 12s (- 51m 11s) (35000 -- 20.5158264947245%) 3.4790
15m 7s (- 49m 23s) (40000 -- 23.446658851113718%) 3.4222
17m 0s (- 47m 28s) (45000 -- 26.377491207502928%) 3.3284
18m 54s (- 45m 37s) (50000 -- 29.308323563892145%) 3.2946
20m 48s (- 43m 44s) (55000 -- 32.23915592028136%) 3.2390
22m 43s (- 41m 52s) (60000 -- 35.16998827667057%) 3.2077
24m 36s (- 39m 58s) (65000 -- 38.10082063305979%) 3.2214
26m 30s (- 38m 5s) (70000 -- 41.031652989449%) 3.1747
28m 24s (- 36m 12s) (75000 -- 43.96248534583822%) 3.1458
30m 18s (- 34m 19s) (80000 -- 46.893317702227435%) 3.1426
32m 12s (- 32m 26s) (85000 -- 49.824150058616645%) 3.1093
34m 5s (- 30m 32s) (90000 -- 52.754

In [None]:
# Two further training rounds of 85000 sampled iterations each, labelled "Epoch 2" and "Epoch 3".
# NOTE(review): every trainIters call creates new SGD optimizers and draws its
# pairs at random, so an "epoch" here is not a full pass over `pairs`.
for epoch in range(2,4):
    print(f"\n\nEpoch {epoch}")
    trainIters(encoder1, attn_decoder1, 85000, print_every=5000)



Epoch 2
1m 55s (- 30m 44s) (5000 -- 5.88235294117647%) 3.6847
3m 44s (- 28m 6s) (10000 -- 11.76470588235294%) 3.7953
5m 34s (- 26m 1s) (15000 -- 17.647058823529413%) 3.6921
7m 24s (- 24m 3s) (20000 -- 23.52941176470588%) 3.6455
9m 13s (- 22m 8s) (25000 -- 29.411764705882355%) 3.6099
11m 2s (- 20m 14s) (30000 -- 35.294117647058826%) 3.5232
12m 51s (- 18m 22s) (35000 -- 41.17647058823529%) 3.4861
14m 40s (- 16m 30s) (40000 -- 47.05882352941176%) 3.5124
16m 31s (- 14m 41s) (45000 -- 52.94117647058824%) 3.5377
18m 20s (- 12m 50s) (50000 -- 58.82352941176471%) 3.4783
20m 9s (- 10m 59s) (55000 -- 64.70588235294117%) 3.4910
21m 59s (- 9m 9s) (60000 -- 70.58823529411765%) 3.4526
23m 48s (- 7m 19s) (65000 -- 76.47058823529412%) 3.4311
25m 39s (- 5m 29s) (70000 -- 82.35294117647058%) 3.4595
27m 31s (- 3m 40s) (75000 -- 88.23529411764706%) 3.4872
29m 21s (- 1m 50s) (80000 -- 94.11764705882352%) 3.3878
31m 12s (- 0m 0s) (85000 -- 100.0%) 3.4026


Epoch 3
1m 54s (- 30m 26s) (5000 -- 5.88235294117

In [None]:
evaluateRandomly(encoder1, attn_decoder1)

> he bores everybody .
= il ennuie tout le monde .
< il tout le monde . <EOS>

> i don t go to school anymore .
= je ne vais plus a l ecole .
< je ne vais pas avec moi . <EOS>

> i was impressed with her work .
= je fus impressionne par son travail .
< j ai ete avec son travail . <EOS>

> there is no hope .
= il n y a pas d espoir .
< il n y pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas

> tom came yesterday .
= tom est venu hier .
< tom hier de hier . <EOS>

> i m very grateful for what you ve done .
= je suis tres reconnaissante de ce que tu as fait .
< je vous ai fait pour vous vous vous vous vous vous vous vous vous vous vous vous vous vous vous vous vous vous vous vous vous vous vous vous vous

> she wants to meet him again .
= elle veut le rencontrer a nouveau .
< elle veut de nouveau . <EOS>

> this rug was made without the use of child labor .
= ce tapis est confectionne sans employer d enfants .
< ce que etait sans 

In [None]:
# List each entry of the decoder's state dict with the size of its tensor.
for entry_name, entry_value in attn_decoder1.state_dict().items():
    print(entry_name, "\t", entry_value.size())

embedding.weight 	 torch.Size([22586, 256])
attn.weight 	 torch.Size([30, 512])
attn.bias 	 torch.Size([30])
attn_combine.weight 	 torch.Size([256, 512])
attn_combine.bias 	 torch.Size([256])
gru.weight_ih_l0 	 torch.Size([768, 256])
gru.weight_hh_l0 	 torch.Size([768, 256])
gru.bias_ih_l0 	 torch.Size([768])
gru.bias_hh_l0 	 torch.Size([768])
out.weight 	 torch.Size([22586, 256])
out.bias 	 torch.Size([22586])


In [None]:
# Persist the encoder's state dict to fr_encoder.pth.
torch.save(encoder1.state_dict(), "fr_encoder.pth")

In [None]:
# Persist the attention decoder's state dict. This file must hold
# attn_decoder1's weights, not a second copy of the encoder's.
torch.save(attn_decoder1.state_dict(), "fr_decoder.pth")

In [None]:
!cp fr_* "/content/drive/MyDrive/DL/Project/weights/MTL_FR" 

In [None]:
!ls fr_*

fr_decoder.pth	fr_encoder.pth
