In [36]:
import numpy as np
import numpy.random as rnd

import pandas as pd
import os
import sys
import codecs
import csv
import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [97]:
import tensorflow as tf
import datetime

# Load and Preprocessing

In [47]:
TRAIN_PATH = './data/train.csv'
TEST_PATH = './data/test.csv'
EMBEDDING = './data/GoogleNews-vectors-negative300.bin'

In [65]:
MAX_SEQUENCE_LENGTH = 30
EMBEDDING_DIM = 300
MAX_NB_WORDS = 200000
VALIDATION_SPLIT = 0.1

In [30]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    text = text.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    return(text)

In [39]:
def read_train_data(path):
    texts_1 = [] 
    texts_2 = []
    labels = []
    with codecs.open(path, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=',')
        header = next(reader)
        for values in reader:
            texts_1.append(text_to_wordlist(values[3]))
            texts_2.append(text_to_wordlist(values[4]))
            labels.append(int(values[5]))
    print('Found %s texts in train.csv' % len(texts_1))
    return (texts_1, texts_2, labels)

In [45]:
def read_test_data(path):
    test_texts_1 = []
    test_texts_2 = []
    test_ids = []
    with codecs.open(path, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=',')
        header = next(reader)
        for values in reader:
            test_texts_1.append(text_to_wordlist(values[1]))
            test_texts_2.append(text_to_wordlist(values[2]))
            test_ids.append(values[0])
    print('Found %s texts in test.csv' % len(test_texts_1))
    return (test_texts_1, test_texts_2, test_ids)

In [40]:
train_1, train_2, labels = read_train_data(TRAIN_PATH)

Found 404290 texts in train.csv


In [46]:
test_1, test_2, ids = read_test_data(TEST_PATH)

Found 2345796 texts in test.csv


In [48]:
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING, binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

Found 3000000 word vectors of word2vec


In [56]:
def prepare_data(train_1, train_2, test_1, test_2, labels, ids, num_words=200000):
    
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(train_1 + train_2 + test_1 + test_2)
    
    sequences_1 = tokenizer.texts_to_sequences(train_1)
    sequences_2 = tokenizer.texts_to_sequences(train_2)
    
    test_sequences_1 = tokenizer.texts_to_sequences(test_1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_2)

    data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
    data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
    labels = np.array(labels)
    print('Shape of data tensor:', data_1.shape)
    print('Shape of label tensor:', labels.shape)

    test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
    ids = np.array(ids)
    
    return (tokenizer, {'train_1' : data_1, 'train_2' : data_2, 'labels' : labels,\
                             'test_1' : test_data_1, 'test_2' : test_data_2, 'ids' : ids})

In [57]:
tokenizer, data = prepare_data(train_1, train_2, test_1, test_2, labels, ids)

Found 120499 unique tokens
Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)


In [69]:
def build_emb_matrix(data, tokenizer):
    word_index = tokenizer.word_index
    nb_words = min(MAX_NB_WORDS, len(word_index))+1

    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if word in word2vec.vocab:
            embedding_matrix[i] = word2vec.word_vec(word)
    print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix, nb_words

In [70]:
emb, nb_words = build_emb_matrix(data, tokenizer)

Null word embeddings: 61789


In [67]:
def split_data(data):
    perm = np.random.permutation(len(data['train_1']))
    idx_train = perm[:int(len(data['train_1'])*(1-VALIDATION_SPLIT))]
    idx_val = perm[int(len(data['train_1'])*(1-VALIDATION_SPLIT)):]

    data_1_train = np.vstack((data['train_1'][idx_train], data['train_2'][idx_train]))
    data_2_train = np.vstack((data['train_2'][idx_train], data['train_2'][idx_train]))
    labels_train = np.concatenate((data['labels'][idx_train], data['labels'][idx_train]))

    data_1_val = np.vstack((data['train_1'][idx_val], data['train_2'][idx_val]))
    data_2_val = np.vstack((data['train_2'][idx_val], data['train_1'][idx_val]))
    labels_val = np.concatenate((data['labels'][idx_val], data['labels'][idx_val]))
    return {'data_1_train' : data_1_train, 'data_2_train' : data_2_train, 'labels_train' : labels_train, \
           'data_1_val' : data_1_val, 'data_2_val' : data_2_val, 'labels_val' : labels_val}

In [68]:
split_data = split_data(data)

In [72]:
params = {'emb_matrix' : emb, 'nb_words' : nb_words, 'num_lstm' : 100, 'drop_out' : 0.2, 'num_dense' : 120}

# Model (Siamese Net)

In [75]:
def build_model(params):
    
    embedding_layer = Embedding(nb_words, EMBEDDING_DIM, weights=[params['emb_matrix']],
        input_length=MAX_SEQUENCE_LENGTH, trainable=False)
    lstm_layer = LSTM(params['num_lstm'], dropout=params['drop_out'], recurrent_dropout=params['drop_out'])

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    y1 = lstm_layer(embedded_sequences_2)

    merged = concatenate([x1, y1])
    merged = Dropout(params['drop_out'])(merged)
    merged = BatchNormalization()(merged)

    merged = Dense(params['num_dense'], activation='relu')(merged)
    merged = Dropout(params['drop_out'])(merged)
    merged = BatchNormalization()(merged)

    preds = Dense(1, activation='sigmoid')(merged)
    
    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [76]:
model = build_model(params)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 30, 300)      36150000    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 100)          160400      embedding_2[0][0]                
          

In [79]:
early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = './models/best.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

In [None]:
model.fit([split_data['data_1_train'], split_data['data_2_train']], split_data['labels_train'], \
        validation_data=([split_data['data_1_val'], split_data['data_2_val']], split_data['labels_val']), \
        epochs=100, batch_size=1024, shuffle=True, callbacks=[early_stopping, model_checkpoint])

In [None]:
model.load_weights(bst_model_path)

preds = model.predict([data['test_1'], data['test_2']], batch_size=1024, verbose=1)

submission = pd.DataFrame({'test_id':data['ids'], 'is_duplicate':preds.ravel()})
submission.to_csv('./submissions/' + str(datetime.datetime.now()).split('.')[0] + '.csv', index=False)