In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

## Load file

In [19]:
# Locations of the Quora Question Pairs CSV files, relative to the working directory.
train_file = 'quora-question-pairs/train.csv'
test_file = 'quora-question-pairs/test.csv'

In [20]:
# Read both CSV files into DataFrames.
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

In [21]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Tokenization

In [24]:
def normalizeString(s):
    """Turn an arbitrary value into a list of lowercase word tokens.

    The value is stringified, lowercased and stripped. A space is inserted
    before every '.', '!' or '?', and each run of characters other than ASCII
    letters and those three marks is collapsed into a single space. The text
    is then split on whitespace and every token of length 1 is discarded,
    which removes isolated punctuation as well as one-letter words such as
    "a" and "i".
    """
    text = str(s).lower().strip()
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    tokens = []
    for token in text.split():
        if len(token) > 1:
            tokens.append(token)
    return tokens

In [25]:
# Sanity check of the tokenizer on one example sentence.
x = 'RNN is a network that operates on a sequence and uses its own output as input for subsequent steps.'
normalizeString(x)

['rnn',
 'is',
 'network',
 'that',
 'operates',
 'on',
 'sequence',
 'and',
 'uses',
 'its',
 'own',
 'output',
 'as',
 'input',
 'for',
 'subsequent',
 'steps']

In [26]:
# Tokenize both question columns; each new column holds one list of tokens per row.
train_df['words1'] = train_df['question1'].apply(normalizeString)
train_df['words2'] = train_df['question2'].apply(normalizeString)

In [27]:
# Check the two new token-list columns.
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,words1,words2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[what, is, the, story, of, kohinoor, koh, noor...","[what, would, happen, if, the, indian, governm..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[how, can, increase, the, speed, of, my, inter...","[how, can, internet, speed, be, increased, by,..."
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[why, am, mentally, very, lonely, how, can, so...","[find, the, remainder, when, math, math, is, d..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]"


## Generate Vocab

In [35]:
# Pull the per-row token lists out of the frame as plain Python lists.
q1 = train_df['words1'].tolist()
q2 = train_df['words2'].tolist()

In [48]:
import itertools

# Flatten the per-row token lists of each question column into one flat list,
# then join both into a single list holding every token of the training data.
w1 = list(itertools.chain.from_iterable(q1))
w2 = list(itertools.chain.from_iterable(q2))
ws = [*w1, *w2]

In [49]:
# Total number of tokens in both question columns combined.
len(ws)

8424308

In [50]:
# Count how often each token occurs across both question columns.
word_count = Counter(ws)

In [51]:
# Number of distinct tokens before pruning.
len(word_count)

79967

In [52]:
# delete if occurs < 5 times and it is not in our pretrained embeddings
for word in list(word_count):
    if word_count[word] < 5 and word not in word_vecs:
        del word_count[word]

In [53]:
# Number of distinct tokens left after pruning.
len(word_count)

60168

In [55]:
# Index 0 is the empty string (the value used for padding positions) and
# index 1 is "UNK"; the remaining words follow in word_count's iteration order.
words = ["", "UNK", *word_count]
vocab2index = {word: index for index, word in enumerate(words)}

In [56]:
# Vocabulary size, including the two special entries "" and "UNK".
len(words)

60170

## Load pre-trained embeddings

In [33]:
def loadGloveModel(gloveFile="glove.6B/glove.6B.50d.txt"):
    """Load word vectors from a GloVe text file into a dictionary.

    Each non-blank line is expected to be a word followed by its vector
    components, separated by whitespace.

    Args:
        gloveFile: path of the GloVe text file to read.

    Returns:
        dict mapping each word to a 1-D numpy float array of its components.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a component cannot be parsed as a float.
    """
    word_vecs = {}
    # The context manager closes the file even when parsing fails part-way.
    # The encoding is pinned so the result does not depend on the platform
    # default (GloVe files contain non-ASCII tokens).
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word = splitLine[0]
            word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [34]:
# Load the GloVe vectors from the function's default path (glove.6B/glove.6B.50d.txt).
word_vecs = loadGloveModel()

In [44]:
# Number of words that have a pre-trained vector.
len(word_vecs.keys())

400000

## Pre-trained weights for the embedding layer

In [60]:
def random_word_vector(D=50):
    """Create a random word vector with D components.

    Every component is drawn independently from a uniform distribution on
    [-0.25, 0.25). The 0.25 bound is chosen so that these vectors have
    (approximately) the same variance as the pre-trained ones.
    """
    bound = 0.25
    return np.random.uniform(low=-bound, high=bound, size=D)

In [61]:
def create_embedding_matrix(word_vecs, vocab2index, words, D=50):
    """Build the (len(words), D) float32 embedding matrix for a vocabulary.

    Row 0 is left as all zeros. Every other row i holds the pre-trained vector
    of words[i] when word_vecs contains that word, and a vector from
    random_word_vector(D) otherwise.

    Args:
        word_vecs: mapping from word to a vector with D components.
        vocab2index: not used by this function; kept so existing calls
            keep working.
        words: sequence of vocabulary words; position i selects row i.
        D: number of components per embedding vector.

    Returns:
        numpy float32 array of shape (len(words), D).
    """
    V = len(words)
    W = np.zeros((V, D), dtype="float32")
    for i in range(1, V):
        word = words[i]
        if word in word_vecs:
            W[i] = word_vecs[word]
        else:
            # D must be passed through: the helper's default length of 50
            # does not fit into a row when the matrix is built with D != 50.
            W[i] = random_word_vector(D)
    return W

In [62]:
# Build the embedding matrix for the vocabulary and check its shape.
embedding_matrix = create_embedding_matrix(word_vecs, vocab2index, words)
embedding_matrix.shape

(60170, 50)

## Dataset

In [75]:
def encode_sentence(sentence, vocab2index, N=20, padding_start=True):
    """Encode a sentence as a fixed-length int32 array of vocabulary indices.

    The sentence is tokenized with normalizeString; tokens missing from
    vocab2index map to the index of "UNK". At most the first N tokens are
    kept and unused positions stay 0.

    Note on padding_start: when it is True the token indices are written at
    the START of the array (zeros fill the end); when it is False the token
    indices are right-aligned, so the zeros come first.

    Returns:
        numpy int32 array of length N.
    """
    tokens = normalizeString(sentence)
    ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in tokens])
    kept = min(N, len(ids))
    encoded = np.zeros(N, dtype=np.int32)
    offset = 0 if padding_start else N - kept
    encoded[offset:offset + kept] = ids[:kept]
    return encoded

In [76]:
# Take question1 of the row labelled 0 as an example.
sample = train_df.loc[0,'question1']
sample

'What is the step by step guide to invest in share market in india?'

In [77]:
# Encode the example as 20 vocabulary indices.
encode_sentence(sample, vocab2index, N=20)

array([ 2,  3,  4,  5,  6,  5,  7,  8,  9, 10, 11, 12, 10, 13,  0,  0,  0,
        0,  0,  0], dtype=int32)

In [80]:
class QuoraDataset(Dataset):
    """Labelled question pairs encoded as fixed-length index arrays.

    Row i of X is the N-index encoding of question1 followed by the N-index
    encoding of question2 (2*N values in total); y holds the matching
    is_duplicate labels. Encoding goes through encode_sentence with the
    notebook-level vocab2index.
    """

    def __init__(self, df, N=20, padding_start=True):
        first = [
            encode_sentence(question, vocab2index, N, padding_start)
            for question in df['question1'].tolist()
        ]
        second = [
            encode_sentence(question, vocab2index, N, padding_start)
            for question in df['question2'].tolist()
        ]
        # One row per pair: question1 indices followed by question2 indices.
        self.X = np.array([list(a) + list(b) for a, b in zip(first, second)])
        self.y = df['is_duplicate'].values

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        features = self.X[idx]
        label = self.y[idx]
        return features, label

In [81]:
# Encode the whole training frame with the class defaults (N=20 per question, padding_start=True).
train_ds = QuoraDataset(train_df)

In [91]:
# Number of question pairs in the dataset.
len(train_ds)

404290

In [87]:
# Inspect one item: (indices of question1 followed by indices of question2, label).
train_ds[1]

(array([   2,    3,    4,   14,   15,   16,   17,   18,   19,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    2,  108,
         820,  302,    4,  427,  197, 4396,    4,   16,   17,   18,   19,
         243,    0,    0,    0,    0,    0,    0], dtype=int32), 0)

In [88]:
# Hold out 20% of the labelled pairs for validation. The generator is seeded
# so the same rows land in each split on every run of the notebook.
train_size = int(0.8 * len(train_ds))
test_size = len(train_ds) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    train_ds,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

In [89]:
# Size of the training split.
len(train_dataset)

323432

In [90]:
# Size of the held-out split.
len(test_dataset)

80858

In [92]:
# Batch both splits; only the training loader is shuffled.
# val_dl wraps test_dataset, i.e. the split held out from the labelled training data.
batch_size = 10000
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(test_dataset, batch_size=batch_size)

## LSTM model

In [147]:
class LSTMModel(torch.nn.Module):
    """Single-logit classifier: frozen embeddings -> dropout -> LSTM -> linear.

    The embedding layer is initialised from `embedding_matrix` and excluded
    from gradient updates; index 0 is registered as the padding index. The
    final hidden state of the LSTM is mapped to one raw logit per sequence
    (no sigmoid is applied here).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        pretrained = torch.from_numpy(embedding_matrix)
        self.embedding.weight.data.copy_(pretrained)
        # Freeze the embeddings so the optimizer never updates them.
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return one logit per sequence for a batch of index sequences.

        `x` is an integer index tensor; because the LSTM is batch_first it is
        read as (batch, seq_len). The result has shape (batch, 1).
        """
        embedded = self.dropout(self.embedding(x))
        _, (final_hidden, _) = self.lstm(embedded)
        # final_hidden[-1] is the final hidden state of the last LSTM layer.
        return self.linear(final_hidden[-1])

In [148]:
def val_metrics_v0(model, valid_dl):
    """Compute the mean loss and the accuracy of `model` over `valid_dl`.

    The model is switched to eval mode (and left in it) and all forward
    passes run under torch.no_grad(), so no autograd graph is built while
    evaluating.

    Args:
        model: callable that maps a long tensor batch to logits of shape
            (batch, 1) and provides an eval() method.
        valid_dl: iterable yielding (x, y) batches of tensors.

    Returns:
        (mean_loss, accuracy). mean_loss is a Python float: the per-batch
        binary-cross-entropy-with-logits losses averaged with batch-size
        weights. accuracy is a 0-dim tensor: the fraction of samples whose
        thresholded prediction equals the label.

    Raises:
        ZeroDivisionError: if valid_dl yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    with torch.no_grad():
        for x, y in valid_dl:
            x = x.long()
            y = y.float().unsqueeze(1)
            y_hat = model(x)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            # A logit above 0 corresponds to a sigmoid probability above 0.5.
            y_pred = y_hat > 0
            correct += (y_pred.float() == y).float().sum()
            total += y.shape[0]
            # loss is a batch mean; weight it by the batch size before summing.
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [149]:
def train_epocs_v0(model, epochs=10, lr=0.001):
    """Train `model` with Adam on the notebook-level `train_dl`.

    Only parameters with requires_grad=True are handed to the optimizer. The
    loss is binary cross-entropy on the model's logits. After every epoch the
    model is evaluated on the notebook-level `val_dl` through val_metrics_v0;
    the metrics line is printed only for epochs whose 0-based index leaves a
    remainder of 1 when divided by 5 (epochs 1, 6, 11, ...).

    Args:
        model: the module to train; it is updated in place.
        epochs: number of passes over train_dl.
        lr: learning rate for Adam.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)
    for epoch in range(epochs):
        model.train()
        running_loss, seen = 0.0, 0
        for x, y in train_dl:
            inputs = x.long()
            targets = y.float()
            logits = model(inputs)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(logits, targets.unsqueeze(1))
            loss.backward()
            optimizer.step()
            batch_len = targets.shape[0]
            # loss is a batch mean; weight it by the batch size before summing.
            running_loss += loss.item()*batch_len
            seen += batch_len
        val_loss, val_acc = val_metrics_v0(model, val_dl)
        if epoch % 5 != 1:
            continue
        print("train loss %.3f val loss %.3f and val accuracy %.3f" % (running_loss/seen, val_loss, val_acc))

In [150]:
# The embedding layer gets one row per vocabulary entry.
vocab_size = len(words)
vocab_size

60170

In [151]:
# Has to match the (vocab_size, embedding_dim) pair passed to the model.
embedding_matrix.shape

(60170, 50)

In [152]:
# embedding_dim=50 (the width of embedding_matrix), hidden_dim=25.
model = LSTMModel(vocab_size, 50, 25, embedding_matrix)

In [153]:
# 20 epochs at lr=0.01; a metrics line is printed for epochs 1, 6, 11 and 16 (0-based).
train_epocs_v0(model, epochs=20, lr=0.01)

train loss 0.627 val loss 0.610 and val accuracy 0.661
train loss 0.571 val loss 0.575 and val accuracy 0.689
train loss 0.555 val loss 0.564 and val accuracy 0.702
train loss 0.549 val loss 0.547 and val accuracy 0.716


In [None]:
# Continue training the same model for 10 more epochs at a lower learning rate.
train_epocs_v0(model, epochs=10, lr=0.005)

## Prediction

In [154]:
class TestDataset(Dataset):
    """Unlabelled question pairs encoded as fixed-length index arrays.

    Row i of X is the N-index encoding of question1 followed by the N-index
    encoding of question2 (2*N values in total). Encoding goes through
    encode_sentence with the notebook-level vocab2index. Items are the
    feature rows only; there is no label.
    """

    def __init__(self, df, N=20, padding_start=True):
        first = [
            encode_sentence(question, vocab2index, N, padding_start)
            for question in df['question1'].tolist()
        ]
        second = [
            encode_sentence(question, vocab2index, N, padding_start)
            for question in df['question2'].tolist()
        ]
        # One row per pair: question1 indices followed by question2 indices.
        self.X = np.array([list(a) + list(b) for a, b in zip(first, second)])

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx]

In [155]:
# Encode the test questions with the same padding layout the model was trained
# on: the training set was built with QuoraDataset(train_df), i.e. the default
# padding_start=True, so the default is used here as well.
test_ds = TestDataset(test_df)
batch_size = 10000
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [None]:
# Score the test set. eval() switches dropout off and no_grad() avoids building
# an autograd graph for every batch. The model emits raw logits (it is trained
# with binary_cross_entropy_with_logits), so they are passed through a sigmoid
# and stored as plain Python floats: one duplicate probability per test row.
model.eval()
prediction = []
with torch.no_grad():
    for x in test_dl:
        logits = model(x.long())
        prediction.extend(torch.sigmoid(logits).squeeze(1).tolist())

In [None]:
# Attach the model outputs to the test frame as the is_duplicate column.
test_df['is_duplicate'] = prediction
