In [21]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
import re

In [3]:
# Pick the compute device once: CUDA when PyTorch reports it available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [12]:
# Read the whole corpus file into memory as UTF-8 text; the context manager
# closes the handle as soon as the read finishes.
with open('paraphrase_data.txt', encoding="utf-8") as corpus_file:
    data = corpus_file.read()

In [24]:
def normalize(s):
    """Lower-case ``s`` and reduce it to letter-only words separated by spaces.

    The text is lower-cased, each ``.``, ``!`` or ``?`` is padded with a
    leading space, and then every run of characters outside ``a-z``/``A-Z``
    (digits, punctuation, whitespace) is collapsed into a single space.
    A leading or trailing run of such characters therefore leaves a single
    leading or trailing space in the result.

    Args:
        s: Raw sentence text.

    Returns:
        The normalized string.
    """
    lowered = s.lower()
    padded = re.sub(r"([.!?])", r" \1", lowered)
    letters_only = re.sub(r"[^a-zA-Z]+", r" ", padded)
    return letters_only

In [25]:
# Build (first sentence, second sentence) pairs from the tab-separated corpus
# held in `data`. The first line is skipped as a header, and rows whose first
# field is '0' are dropped (presumably the "not a paraphrase" label - confirm
# against the dataset description). The last two fields of each remaining row
# are normalized and stored as a tuple.
sentences = []

for line in data.split('\n')[1:]:
    fields = line.split('\t')
    # A blank line (such as the empty string left after a trailing newline)
    # has fewer than two fields, so fields[-2] would raise IndexError.
    if len(fields) < 2:
        continue
    if fields[0] == '0':
        continue

    sentences.append((normalize(fields[-2]), normalize(fields[-1])))

In [26]:
# Preview a handful of pairs instead of dumping the whole list into the cell output.
sentences[:5]

[('amrozi accused his brother whom he called the witness of deliberately distorting his evidence ',
  'referring to him as only the witness amrozi accused his brother of deliberately distorting his evidence '),
 ('they had published an advertisement on the internet on june offering the cargo for sale he added ',
  'on june the ship s owners had published an advertisement on the internet offering the explosives for sale '),
 ('the stock rose or about percent to close friday at on the new york stock exchange ',
  'pg e corp shares jumped or percent to on the new york stock exchange on friday '),
 ('revenue in the first quarter of the year dropped percent from the same period a year earlier ',
  'with the scandal hanging over stewart s company revenue the first quarter of the year dropped percent from the same period a year earlier '),
 ('the dvd cca then appealed to the state supreme court ',
  'the dvd cca appealed that decision to the u s supreme court '),
 ('he said the foodservice pi

In [28]:
# Vocabulary lookup tables in both directions. Indices 0 and 1 are reserved for
# the 'SOS' and 'EOS' markers; every other word gets the next free index in
# order of first appearance (first sentence of a pair, then the second).
word_to_ix = {'SOS': 0, 'EOS': 1}
ix_to_word = {0: 'SOS', 1: 'EOS'}

for pair in sentences:
    for text in (pair[0], pair[1]):
        for token in text.split():
            if token not in word_to_ix:
                # Both tables grow together, so one index serves both.
                next_ix = len(word_to_ix)
                word_to_ix[token] = next_ix
                ix_to_word[next_ix] = token

In [55]:
class Encoder(nn.Module):
    """Single-layer LSTM encoder that consumes one token index per call.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Width of both the token embedding and the LSTM state.
    """

    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.net = nn.LSTM(hidden_size, hidden_size)

    @staticmethod
    def _as_lstm_state(hidden):
        """Promote a bare hidden tensor to the ``(h, c)`` pair ``nn.LSTM`` expects."""
        if isinstance(hidden, torch.Tensor):
            # initHidden() returns a single tensor; use it as h and start the
            # cell state at zero with the same shape, dtype and device.
            return hidden, torch.zeros_like(hidden)
        # Tuples (and None) are already in a form nn.LSTM accepts.
        return hidden

    def forward(self, input, hidden):
        """Encode one token.

        Args:
            input: Tensor holding a single token index.
            hidden: Either the tensor returned by ``initHidden`` or the
                ``(h, c)`` tuple returned by a previous call.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            ``(1, 1, hidden_size)`` and ``hidden`` is the LSTM's ``(h, c)``
            tuple, ready to be passed to the next call.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size) for the LSTM.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.net(embedded, self._as_lstm_state(hidden))
        return output, hidden

    def initHidden(self):
        """Return a zero tensor of shape ``(1, 1, hidden_size)``.

        The tensor is created on the device that holds this module's
        parameters, so it always matches the weights it is used with.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

In [31]:
class Decoder(nn.Module):
    """Attention decoder built around a single-layer LSTM, one token per call.

    Args:
        hidden_size: Width of the embedding, the LSTM state and each encoder
            output vector.
        output_size: Number of rows in the embedding table and number of
            output classes.
        dropout_p: Dropout probability applied to the embedded input token.
        max_length: Number of attention slots; ``encoder_outputs`` passed to
            ``forward`` must have exactly this many rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=256):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.net = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    @staticmethod
    def _split_state(hidden):
        """Return ``(h, c)`` from either a bare hidden tensor or an LSTM tuple."""
        if isinstance(hidden, torch.Tensor):
            # initHidden() returns a single tensor; use it as h and start the
            # cell state at zero with the same shape, dtype and device.
            return hidden, torch.zeros_like(hidden)
        h, c = hidden
        return h, c

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Args:
            input: Tensor holding a single token index.
            hidden: Either the tensor returned by ``initHidden`` or an
                ``(h, c)`` LSTM state tuple, each of shape
                ``(1, 1, hidden_size)``.
            encoder_outputs: Tensor of shape ``(max_length, hidden_size)``.

        Returns:
            ``(output, hidden, attn_weights)``: log-probabilities of shape
            ``(1, output_size)``, the updated ``(h, c)`` tuple, and the
            attention weights of shape ``(1, max_length)``.
        """
        h, c = self._split_state(hidden)

        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention scores come from the embedded token and the hidden state h
        # only; h[0] is 2-D, matching embedded[0] for the concatenation.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], h[0]), 1)), dim=1)

        # Weighted sum of the encoder outputs: (1, 1, L) x (1, L, H) -> (1, 1, H).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.net(output, (h, c))

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return a zero tensor of shape ``(1, 1, hidden_size)``.

        The tensor is created on the device that holds this module's
        parameters, so it always matches the weights it is used with.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

In [56]:
# Encoder and decoder share a hidden width of 128 and size their embedding
# tables from the vocabulary. Both are moved to `device` so their parameters
# are on the device selected earlier in the notebook.
enc = Encoder(len(word_to_ix), 128).to(device)
dec = Decoder(128, len(word_to_ix)).to(device)

In [57]:
# Encode the second sentence of the first pair as a list of vocabulary indices,
# then wrap it in an int64 tensor of shape (1, number_of_tokens).
sample = [word_to_ix[token] for token in sentences[0][1].split()]

ten = torch.tensor(sample, dtype=torch.int64).view(1, -1)

In [58]:
# Show the vocabulary indices of the sample sentence built in the previous cell.
sample

[15, 16, 17, 18, 19, 9, 10, 2, 3, 4, 5, 11, 12, 13, 4, 14]

In [54]:
# Create the initial encoder state here so the cell also runs on a fresh
# kernel; `encoder_hidden` is not assigned anywhere earlier in the notebook.
encoder_hidden = enc.initHidden()
encoder_hidden.shape

torch.Size([1, 1, 128])