In [0]:
import numpy as np 
import pandas as pd 
import re
from collections import Counter
from nltk.corpus import stopwords 
from sklearn.model_selection import train_test_split
import itertools

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 

In [2]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


# Preprocessing

In [0]:
# File paths
TRAIN_CSV = './data/train.csv'
TEST_CSV = './data/test.csv'
EMBEDDING_FILE = './data/glove.6B.50d.txt'
MODEL_SAVING_DIR = './model/'

In [0]:
# Load training and test set
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
stops = set(stopwords.words('english'))

In [0]:
def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

In [0]:
# Prepare embedding
vocabulary = dict()
# '<unk>' will never be used, it is only a placeholder for the [0, 0, ....0] embedding
inverse_vocabulary = ['<unk>']

In [0]:
def loadGloveModel(gloveFile=EMBEDDING_FILE):
    """ Loads word vectors into a dictionary."""
    f = open(gloveFile,'r')
    words = []
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        words.append(word)
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs, words

In [0]:
word_vecs,vocab = loadGloveModel()

In [0]:
questions_cols = ['question1', 'question2']

In [0]:
# Iterate over the questions only of both training and test datasets
for dataset in [train_df, test_df]:
    for index, row in dataset.iterrows():

        # Iterate through the text of both questions of the row
        for question in questions_cols:

            q2n = []  # q2n -> question numbers representation
            for word in text_to_word_list(row[question]):

                # Check for unwanted words
                if word in stops and word not in vocab:
                    continue

                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    q2n.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    q2n.append(vocabulary[word])

            # Replace questions as word to question as number representation
            dataset.at[index, question] = q2n

In [0]:
embedding_dim = 50
# This will be the embedding matrix
embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)
embeddings[0] = 0  # So that the padding will be ignored

# Build the embedding matrix
for word, index in vocabulary.items():
    if word in vocab:
        embeddings[index] = word_vecs[word]

In [0]:
max_seq_length = max(train_df.question1.map(lambda x: len(x)).max(),
                     train_df.question2.map(lambda x: len(x)).max(),
                     test_df.question1.map(lambda x: len(x)).max(),
                     test_df.question2.map(lambda x: len(x)).max())

In [15]:
max_seq_length

245

In [0]:
# Split to train validation
validation_size = 40000
training_size = len(train_df) - validation_size

X = train_df[questions_cols]
Y = train_df['is_duplicate']

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size)

In [17]:
training_size

364290

In [0]:
# Split to dicts
X_train = {'left': X_train.question1, 'right': X_train.question2}
X_validation = {'left': X_validation.question1, 'right': X_validation.question2}
X_test = {'left': test_df.question1, 'right': test_df.question2}

In [0]:
X_test = {'left': test_df.question1, 'right': test_df.question2}

In [0]:
# Convert labels to their numpy representations
Y_train = Y_train.values
Y_validation = Y_validation.values

In [0]:
# Zero padding
for dataset, side in itertools.product([X_train, X_validation], ['left', 'right']):
    dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length)

In [0]:
# Make sure everything is ok
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

# Dataset and dataloader

In [0]:
class QuoraDataset(Dataset):
    def __init__(self,X,y):
        self.y = y
        self.X = X
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x1 = self.X['left'][idx]
        x2 = self.X['right'][idx]
        return [x1,x2], self.y[idx]

In [0]:
train_ds = QuoraDataset(X_train, Y_train)
valid_ds = QuoraDataset(X_validation, Y_validation)

In [25]:
train_ds[0]

([array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,

# Model and Training

In [0]:
import sys

In [0]:
class Manhattan_LSTM(nn.Module):
    def __init__(self, hidden_size, embedding, use_embedding=False, train_embedding=True):
        super(Manhattan_LSTM, self).__init__()
        self.use_cuda = torch.cuda.is_available()
        self.hidden_size = hidden_size

        if use_embedding:
            self.embedding = nn.Embedding(embedding.shape[0], embedding.shape[1])
            self.embedding.weight.data.copy_(torch.from_numpy(embedding))
            # self.embedding.weight = nn.Parameter(embedding)
            self.input_size = embedding.shape[1] # V - Size of embedding vector

        else:
            self.embedding = nn.Embedding(embedding[0], embedding[1])
            self.input_size = embedding[1]

        self.embedding.weight.requires_grad = train_embedding

        self.lstm = nn.LSTM(self.input_size, self.hidden_size, num_layers=1, bidirectional=False)
        # self.linear = nn.Linear(245, 1)

    def exponent_neg_manhattan_distance(self, x1, x2):
        return torch.exp(-torch.sum(torch.abs(x1 - x2), dim=1))

    def forward(self, input1, input2):
        '''
        input           -> (2 x Max. Sequence Length (per batch) x Batch Size)
        hidden          -> (2 x Num. Layers * Num. Directions x Batch Size x Hidden Size)
        '''

        embedded_1 = self.embedding(input1) # L, B, V
        embedded_2 = self.embedding(input2) # L, B, V
        outputs_1, hidden_1 = self.lstm(embedded_1.permute([1, 0, 2]))
        outputs_2, hidden_2 = self.lstm(embedded_2.permute([1, 0, 2]))
        tmp = hidden_1[0][0].size()[0]
        similarity_scores = self.exponent_neg_manhattan_distance(hidden_1[0][0].view(tmp, -1),
                                                                 hidden_2[0][0].view(tmp, -1))
        return similarity_scores

In [0]:
def update_optimizer(optimizer, lr):
    for i, param_group in enumerate(optimizer.param_groups):
        param_group["lr"] = lr

In [0]:
def train_epocs_v0(model, optimizer, train_dl, val_dl, epochs=10):
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y in train_dl:
            x1 = x[0].long().cuda()
            x2 = x[1].long().cuda()
            y = y.float().cuda()
            y_pred = model(x1,x2)
            optimizer.zero_grad()
            loss = F.mse_loss(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics_v0(model, val_dl)
        if i % 2 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [0]:
def val_metrics_v0(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, y in valid_dl:
        x1 = x[0].long().cuda()
        x2 = x[1].long().cuda()
        y = y.float().cuda()
        y_pred = model(x1,x2)
        loss = F.mse_loss(y_pred, y)
        y_pred_01 = np.where(y_pred.cpu() > 0.5, 1, 0).astype(float)
        correct += (y_pred_01 == y.cpu().detach().numpy()).astype(float).sum().item()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [0]:
batch_size = 4000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)

In [0]:
hidden_size = 10

In [0]:
model = Manhattan_LSTM(hidden_size, embeddings, use_embedding=True, train_embedding=True).cuda()
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

In [216]:
train_epocs_v0(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.124 val loss 0.127 and val accuracy 0.822
train loss 0.096 val loss 0.119 and val accuracy 0.837
train loss 0.084 val loss 0.116 and val accuracy 0.843
train loss 0.076 val loss 0.114 and val accuracy 0.844
train loss 0.071 val loss 0.113 and val accuracy 0.846
train loss 0.067 val loss 0.113 and val accuracy 0.848
train loss 0.063 val loss 0.112 and val accuracy 0.849
train loss 0.060 val loss 0.111 and val accuracy 0.851
train loss 0.058 val loss 0.111 and val accuracy 0.851
train loss 0.056 val loss 0.111 and val accuracy 0.851
train loss 0.054 val loss 0.110 and val accuracy 0.851
train loss 0.053 val loss 0.110 and val accuracy 0.853
train loss 0.051 val loss 0.111 and val accuracy 0.852
train loss 0.050 val loss 0.110 and val accuracy 0.854
train loss 0.049 val loss 0.110 and val accuracy 0.853


In [0]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [0]:
# save_model(model, "853.pth")

In [222]:
update_optimizer(optimizer, 0.005)
train_epocs_v0(model, optimizer, train_dl, valid_dl, epochs=10)

train loss 0.044 val loss 0.108 and val accuracy 0.855
train loss 0.043 val loss 0.109 and val accuracy 0.855
train loss 0.042 val loss 0.109 and val accuracy 0.855
train loss 0.042 val loss 0.109 and val accuracy 0.855
train loss 0.041 val loss 0.109 and val accuracy 0.855


# Prediction

In [89]:
X_test['left'].shape

(2345796, 245)

In [0]:
for dataset, side in itertools.product([X_test], ['left', 'right']):
    dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length)

In [38]:
X_test['right'][0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [0]:
class QuoraDataset_test(Dataset):
    def __init__(self,X):
        self.X = X
        
    def __len__(self):
        return self.X['right'].shape[0]
    
    def __getitem__(self, idx):
        x1 = self.X['left'][idx]
        x2 = self.X['right'][idx]
        return x1,x2

In [0]:
test_ds = QuoraDataset_test(X_test)

In [99]:
len(test_ds)

2345796

In [0]:
batch_size_test = 4000
test_dl = DataLoader(test_ds, batch_size=batch_size_test, shuffle=False)

In [43]:
model = Manhattan_LSTM(hidden_size, embeddings, use_embedding=True, train_embedding=True).cuda()
model.load_state_dict(torch.load('/content/853.pth'))
model.eval()

Manhattan_LSTM(
  (embedding): Embedding(121322, 50)
  (lstm): LSTM(50, 10)
)

In [0]:
def get_prediction(model, test_dl):
    y_pred_total = []
    model.eval()
    for x in test_dl:
        x1 = x[0].long().cuda()
        x2 = x[1].long().cuda()
        y_pred = model(x1,x2).cpu().detach().numpy()
        y_pred_total.append(y_pred)
    return y_pred_total

In [0]:
y_pred_total = get_prediction(model,test_dl)

In [106]:
type(y_pred_total)

list

In [0]:
test = np.concatenate(y_pred_total,axis=0)

In [111]:
test.shape

(2345796,)

In [0]:
id = np.arange(0,2345796).tolist()

In [0]:
sample_df = pd.DataFrame({'test_id':id, 'is_duplicate':test})

In [122]:
sample_df.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.400789
1,1,0.519538
2,2,0.218805
3,3,0.000826
4,4,0.050586


In [0]:
sample_df.to_csv("submission.csv",index=False)