In [1]:
import pandas as pd
import numpy as np

from nltk import word_tokenize
from sklearn.model_selection import train_test_split

from collections import defaultdict

In [2]:
# Load the question-pair table and lower-case both question columns so that
# tokens differing only in case map to the same vocabulary entry later on.
dataset = pd.read_csv('questions.csv')
dataset['question1'] = dataset['question1'].str.lower()
dataset['question2'] = dataset['question2'].str.lower()

# Hold out 20% of the rows as the test set. The seed is fixed so the partition
# (and everything derived from it) is identical after Restart & Run All.
# NOTE: outputs recorded in later cells came from an unseeded run and may
# differ slightly once the notebook is re-executed.
train, test_set = train_test_split(
    dataset, test_size=0.2, shuffle=True, random_state=42
)

In [3]:
# Row counts on each side of the initial 80/20 split.
print("Train and val set size : {}".format(train.shape[0]))
print("Test set size : {}".format(test_set.shape[0]))

Train and val set size : 323480
Test set size : 80871


In [4]:
# Keep only the rows whose is_duplicate flag equals 1.
data_to_train_on = train.loc[train['is_duplicate'].eq(1)]

print("Total data to train and validate on : {}".format(data_to_train_on.shape[0]))

Total data to train and validate on : 119506


In [5]:
# Split the duplicate-only rows 80/20 into training and validation sets.
# A fixed seed keeps the split stable across kernel restarts.
train_set, val_set = train_test_split(
    data_to_train_on, test_size=0.2, shuffle=True, random_state=42
)

print("Train set size : {}".format(len(train_set)))
print("Val set size : {}".format(len(val_set)))

Train set size : 95604
Val set size : 23902


In [6]:
def preprocess_sentence(sent):
    """Tokenize one question into a list of word tokens.

    Args:
        sent: The question text. Anything that is not a ``str`` (for example
            ``None`` or a float NaN standing in for a missing CSV cell) is
            treated as an empty question.

    Returns:
        list: Tokens produced by ``word_tokenize``; an empty list when
        ``sent`` is not a string.
    """
    # word_tokenize expects text; guard so one missing value does not abort
    # a whole pass over the data.
    if not isinstance(sent, str):
        return []
    return word_tokenize(sent)

In [7]:
# Lower-cased question strings of the training split, as parallel arrays.
q1 = train_set['question1'].values
q2 = train_set['question2'].values

In [8]:
# Tokenized questions, index-aligned with q1 / q2.
q1_list = []
q2_list = []

# Token -> integer id. '<pad>' takes id 1 and real tokens get 2, 3, ... in
# order of first appearance. Id 0 is never assigned here; it is the value the
# defaultdict returns for a token it has not seen.
vocab_dict = defaultdict(int)
vocab_dict['<pad>'] = 1

for sentence1, sentence2 in zip(q1, q2):
    # Go through the shared helper so any preprocessing change applies to the
    # vocabulary as well.
    tokens1 = preprocess_sentence(sentence1)
    tokens2 = preprocess_sentence(sentence2)

    q1_list.append(tokens1)
    q2_list.append(tokens2)

    for word in tokens1 + tokens2:
        # Explicit membership test: reading vocab_dict[word] would insert the
        # key as a side effect. len() + 1 gives the same ids as before.
        if word not in vocab_dict:
            vocab_dict[word] = len(vocab_dict) + 1

In [9]:
# Number of keys in the vocabulary mapping (the '<pad>' entry is counted too).
vocab_size = len(vocab_dict)
print("Number of words in vocabulary : {}".format(vocab_size))

Number of words in vocabulary : 27975


In [10]:
from torch.utils.data import Dataset, DataLoader

In [11]:
class DQDataset(Dataset):
    """Map-style dataset of tokenized question pairs encoded as vocabulary ids.

    Args:
        q1: Sequence of token lists, one per first question.
        q2: Sequence of token lists, indexed in parallel with ``q1``.
        vocab: Mapping from token to integer id.
        unk_idx: Id used for tokens that are not in ``vocab``. Defaults to 0,
            the value a ``defaultdict(int)`` vocabulary returns for a missing
            key.
    """

    def __init__(self, q1, q2, vocab, unk_idx=0):
        self.q1 = q1
        self.q2 = q2
        self.vocab = vocab
        self.unk_idx = unk_idx

    def __len__(self):
        """Return the number of question pairs (length of ``q1``)."""
        return len(self.q1)

    def _encode(self, tokens):
        """Translate a token list into a list of ids without touching the vocab."""
        # .get() rather than [] so that a defaultdict vocabulary is not grown
        # by every unseen token, and a plain dict does not raise KeyError.
        return [self.vocab.get(word, self.unk_idx) for word in tokens]

    def __getitem__(self, idx):
        """Return ``{'q1': ids, 'q2': ids}`` for the pair at position ``idx``."""
        input1 = self._encode(self.q1[idx])
        input2 = self._encode(self.q2[idx])

        return {'q1': input1, 'q2': input2}

In [12]:
# Wrap the tokenized training questions and the vocabulary in a Dataset.
dset = DQDataset(q1=q1_list, q2=q2_list, vocab=vocab_dict)

In [13]:
import torch

In [14]:
def collate_fn(batch, pad_idx=1):
    """Pad a list of encoded question pairs into two equally shaped id tensors.

    Both outputs share one width: the longest sequence found in either the
    'q1' or the 'q2' entries of the batch.

    Args:
        batch: List of dicts, each with 'q1' and 'q2' lists of integer ids.
        pad_idx: Id written into positions past the end of a sequence.
            Defaults to 1, the id given to '<pad>' in the vocabulary.

    Returns:
        dict: ``{'q1': tensor, 'q2': tensor}``, both int64 tensors of shape
        ``(len(batch), max_len)``. An empty batch yields two ``(0, 0)`` tensors.
    """
    if not batch:
        # max() below would raise on an empty sequence.
        return {'q1': torch.empty((0, 0), dtype=torch.long),
                'q2': torch.empty((0, 0), dtype=torch.long)}

    q1 = [questions['q1'] for questions in batch]
    q2 = [questions['q2'] for questions in batch]

    bs = len(batch)
    max_len = max(max(len(seq) for seq in q1), max(len(seq) for seq in q2))

    # Allocate in torch with an explicit 64-bit dtype. np.long is missing from
    # NumPy 1.24-1.26 and is the platform-dependent C long in NumPy 2.x.
    q1_batch = torch.full((bs, max_len), pad_idx, dtype=torch.long)
    q2_batch = torch.full((bs, max_len), pad_idx, dtype=torch.long)

    for i, (in1, in2) in enumerate(zip(q1, q2)):
        # Skip empty sequences; their row stays all padding.
        if in1:
            q1_batch[i, :len(in1)] = torch.tensor(in1, dtype=torch.long)
        if in2:
            q2_batch[i, :len(in2)] = torch.tensor(in2, dtype=torch.long)

    return {'q1': q1_batch, 'q2': q2_batch}

In [15]:
# Batches of 32 pairs, padded by collate_fn; no shuffle argument is passed.
dl = DataLoader(
    dataset=dset,
    batch_size=32,
    collate_fn=collate_fn,
)

In [16]:
# Pull a single batch to eyeball the padded id tensors.
batch_iter = iter(dl)
next(batch_iter)

{'q1': tensor([[  2,   3,   4,   5,   6,   7,   8,   9,  10,   4,   5,  11,  12,  13,
           14,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1],
         [ 18,  19,  20,  21,  22,  14,   1,   1,   1,   1,   1,   1,   1,   1,
            1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1],
         [ 25,  26,  27,  28,  29,  23,  30,  31,  14,   1,   1,   1,   1,   1,
            1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1],
         [ 25,  32,  27,  33,  34,  35,  12,  34,  36,  37,  14,   1,   1,   1,
            1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1],
         [ 25,  43,  27,  46,  47,  48,  14,   1,   1,   1,   1,   1,   1,   1,
            1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1],
         [ 13,  50,  51,  52,  27,  16,  41,  53,  54,  55,  53,  56,  57,  58,
           14,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1],
         [  2,  61,  62,  63,  64,  12,  65,   9,  25,  59,  56,  66,  67,  68,
        