In [25]:
########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from string import punctuation
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import StratifiedKFold

import sys

In [13]:
########################################
## set directories and parameters
########################################
# --- Input files ------------------------------------------------------
# NOTE(review): EMBEDDING_FILE is not referenced by the visible cells; the
# vector-loading cell uses its own hard-coded path.
EMBEDDING_FILE = 'wiki.en.bin'
TRAIN_DATA_FILE = 'data/clean_train.csv'
TEST_DATA_FILE = 'data/clean_test.csv'

# --- Text / embedding settings ----------------------------------------
MAX_SEQUENCE_LENGTH = 30    # `maxlen` used when padding every question
MAX_NB_WORDS = 300000       # `num_words` cap handed to the tokenizer
EMBEDDING_DIM = 300         # number of columns in the embedding matrix
N_FOLDS = 5
VALIDATION_SPLIT = 0.02     # fraction of training rows held out for validation

# --- Network hyper-parameters -----------------------------------------
num_lstm, num_dense = 250, 300
rate_drop_lstm, rate_drop_dense = 0.5, 0.4
act = 'relu'

# Whether to re-weight classes to fit the 17.5% share in test set.
re_weight = True

# Run identifier derived from the hyper-parameters above.
STAMP = 'lstm_%d_%d_%.2f_%.2f' % (
    num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)

In [10]:
########################################
## index word vectors
########################################
print('Indexing word vectors')

# NOTE(review): this import sits in the middle of the notebook instead of the
# import cell at the top, so `FastText` only exists once this cell has run.
from gensim.models.wrappers import FastText
# Load the vectors stored at 'data/wiki.en.vec'. The path is hard-coded here;
# the EMBEDDING_FILE constant from the configuration cell is not used.
embeddings_index = FastText.load_word2vec_format('data/wiki.en.vec')
# `.vocab` is read only to report how many vectors were loaded.
print('Found %s word vectors of word2vec' % len(embeddings_index.vocab))

Indexing word vectors
Found 2519370 word vectors of word2vec


In [9]:
########################################
## process texts in datasets
########################################
print('Processing text dataset')

# --- training pairs ---------------------------------------------------
# Missing cells become empty strings and both question columns are forced to str
# so the tokenizer only ever sees text.
train_df = pd.read_csv(TRAIN_DATA_FILE, index_col='id').fillna('')
train_df = train_df.astype({'clean_question1': str, 'clean_question2': str})

texts_1 = train_df['clean_question1'].tolist()
texts_2 = train_df['clean_question2'].tolist()
labels = np.array(train_df['is_duplicate'])
print('Found %s texts in train.csv' % len(texts_1))

# --- test pairs -------------------------------------------------------
test_df = pd.read_csv(TEST_DATA_FILE, index_col='test_id').fillna('')
test_df = test_df.astype({'clean_question1': str, 'clean_question2': str})

test_texts_1 = test_df['clean_question1'].tolist()
test_texts_2 = test_df['clean_question2'].tolist()
test_ids = np.array(test_df.index)
print('Found %s texts in test.csv' % len(test_texts_1))

# --- tokenisation -----------------------------------------------------
# A single vocabulary is fitted on all four question lists (train and test).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# --- fixed-length integer matrices -------------------------------------
data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)

Processing text dataset
Found 404290 texts in train.csv
Found 2345796 texts in test.csv
Found 120499 unique tokens
Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)


In [12]:
########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

# One row per kept token index plus one extra row for index 0.
nb_words = min(MAX_NB_WORDS, len(word_index)) + 1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # word_index lists every token that was seen, not only those within the
    # MAX_NB_WORDS cap, so indices past the matrix are skipped rather than
    # raising IndexError on assignment.
    if i >= nb_words:
        continue
    try:
        embedding_vector = embeddings_index[word]
    except KeyError:
        # Word has no pre-trained vector: its row stays all-zero.
        continue
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Count rows that are entirely zero. Testing the row *sum* against zero would
# also count a non-zero vector whose components happen to cancel out.
print('Null word embeddings: %d' % np.count_nonzero(~embedding_matrix.any(axis=1)))

Preparing embedding matrix
Null word embeddings: 27016


In [11]:
def load_features(data_type, features, typ="clean", feature_dir="features"):
    """Load several feature CSV files and join them column-wise.

    Each feature is read from ``<feature_dir>/<typ>_<data_type>_<feature>.csv``.
    Every column is renamed to ``<feature><original column name>`` so columns
    from different files cannot collide.

    Parameters
    ----------
    data_type : str
        Used in the file name. When it equals ``'test'`` the id column is
        ``'test_id'``; otherwise it is ``'id'``.
    features : iterable of str
        Feature-set names, one CSV file each.
    typ : str, default "clean"
        File-name prefix.
    feature_dir : str, default "features"
        Directory holding the CSV files.

    Returns
    -------
    pandas.DataFrame
        The per-feature frames concatenated along ``axis=1``.

    Raises
    ------
    FileNotFoundError
        If a feature file does not exist.
    ValueError
        If ``features`` is empty (raised by ``pd.concat``).
    """
    id_col = 'test_id' if data_type == 'test' else 'id'

    frames = []
    for feature in features:
        path = os.path.join(feature_dir, '{}_{}_{}.csv'.format(typ, data_type, feature))
        try:
            feat = pd.read_csv(path, index_col=id_col)
        except (ValueError, KeyError):
            # The file has no id column: fall back to the default positional
            # index. Other failures (e.g. a missing file) are not retried.
            feat = pd.read_csv(path)
        feat.columns = [feature + col for col in feat.columns]
        frames.append(feat)

    return pd.concat(frames, axis=1)

In [38]:
# Assemble the hand-crafted training features. Each load_features call covers
# one preprocessing variant (`typ`); the frames are joined side by side in the
# order clean, stem, raw ("magic"), lem. Loading inline leaves no temporary
# frames behind to delete.
# (disabled) oof_features = load_features('train',['oof_2'], typ='raw')
x_train = pd.concat(
    [
        load_features('train', ['distance',
                                'word_idf_features',
                                'word_share_features',
                                'gensim_sent_features',
                                'error_features',
                                'sent']),
        load_features('train', ['tfidf_lsa_50_features'], typ='stem'),
        load_features('train', ['magic_features_freq',
                                'magic_features_graph',
                                'magic_features_graph_cc',
                                'magic_features_wm',
                                'magic_features_pr',
                                'locations'], typ='raw'),
        load_features('train', ['word_pos_features'], typ='lem'),
    ],
    axis=1,
)

In [42]:
# Assemble the hand-crafted test features with the same feature sets and the
# same column order as the training frame: clean, stem, raw ("magic"), lem.
# Loading inline leaves no temporary frames behind to delete.
x_test = pd.concat(
    [
        load_features('test', ['distance',
                               'word_idf_features',
                               'word_share_features',
                               'gensim_sent_features',
                               'error_features',
                               'sent']),
        load_features('test', ['tfidf_lsa_50_features'], typ='stem'),
        load_features('test', ['magic_features_freq',
                               'magic_features_graph',
                               'magic_features_graph_cc',
                               'magic_features_wm',
                               'magic_features_pr',
                               'locations'], typ='raw'),
        load_features('test', ['word_pos_features'], typ='lem'),
    ],
    axis=1,
)

In [43]:
from sklearn.preprocessing import StandardScaler

# Keep the training column labels: the scaler hands back plain arrays, and both
# frames are rebuilt with these labels afterwards.
data_columns = x_train.columns

x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

# Positive infinity in the two WMD columns is replaced by a fixed per-column
# value, first for train and then for test.
for wmd_col, inf_fill in (('gensim_sent_featuresclean_norm_wmd', 1.48),
                          ('gensim_sent_featuresclean_wmd', 6.2)):
    x_train[wmd_col] = x_train[wmd_col].replace(np.inf, inf_fill)
    x_test[wmd_col] = x_test[wmd_col].replace(np.inf, inf_fill)
del wmd_col, inf_fill


# Standardise using statistics computed over train and test rows together.
ss = StandardScaler().fit(np.vstack((x_train, x_test)))

x_train = pd.DataFrame(data=ss.transform(x_train), columns=data_columns)
x_test = pd.DataFrame(data=ss.transform(x_test), columns=data_columns)

In [44]:
########################################
## sample train/validation data
########################################

# Fixed seed so the train/validation split is the same on every run.
np.random.seed(1234)
perm = np.random.permutation(len(data_1))
# Compute the split point once; the first part trains, the remainder validates.
n_train = int(len(data_1) * (1 - VALIDATION_SPLIT))
idx_train = perm[:n_train]
idx_val = perm[n_train:]

# Every pair is used twice, as (q1, q2) and as (q2, q1), so each split has
# twice as many rows as selected indices.
data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))

# The hand-crafted features are simply repeated for the swapped copy.
data_3_train = np.vstack((x_train.values[idx_train], x_train.values[idx_train]))
data_3_val = np.vstack((x_train.values[idx_val], x_train.values[idx_val]))

labels_train = np.concatenate((labels[idx_train], labels[idx_train]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))


# Per-sample validation weights; all ones unless re-weighting is enabled.
weight_val = np.ones(len(labels_val))

# Always bind class_weight so later cells that pass it to model.fit do not hit
# a NameError when re_weight is False (None means "no class weighting").
class_weight = None

if re_weight:
    weight_val *= 0.472001959
    weight_val[labels_val == 0] = 1.309028344
    class_weight = {0: 1.309028344, 1: 0.472001959}

In [45]:
########################################
## define the model structure
########################################
#os.environ["CUDA_VISIBLE_DEVICES"]="1"

# Embedding lookup initialised from the pre-built embedding_matrix and kept
# frozen (trainable=False).
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
# A single bidirectional LSTM instance; it is applied to both question inputs
# below, so the two branches use the same layer object.
lstm_layer =  Bidirectional(LSTM(num_lstm, \
                                 dropout=rate_drop_lstm, \
                                 recurrent_dropout=rate_drop_lstm))

# Branch 1: first question (padded integer sequence).
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

# Branch 2: second question, through the same embedding and LSTM layers.
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

# Branch 3: hand-crafted feature vector, one value per column of x_train,
# projected by a dense layer of num_dense//2 units.
z1 = Input(shape=(x_train.shape[1],), dtype='float32')
z1_dense = Dense(num_dense//2, activation=act)(z1)

# Join the three branches, then batch-normalise and apply dropout.
merged = concatenate([x1, y1, z1_dense])
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

# One hidden dense block with the same normalise-then-dropout pattern.
merged = Dense(num_dense, activation=act)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

# Single sigmoid output unit.
# NOTE(review): the prediction cell later rebinds `preds` to a NumPy array, so
# this cell must be re-run before the Model(...) cell can be re-run.
preds = Dense(1, activation='sigmoid')(merged)

In [48]:
########################################
## train the model
########################################
# Three-input model: the two question sequences and the feature vector.
model = Model(inputs=[sequence_1_input, sequence_2_input, z1],
              outputs=preds)

# NOTE(review): the hyper-parameters spelled out in this file name do not match
# STAMP (printed below) — confirm which is intended before renaming.
bst_model_path = 'lstms/FastTest_lstm_250_350_0.55_0.45_74dims.h5'
# Create the checkpoint directory up front so the first checkpoint write
# cannot fail on a missing directory after an epoch has already been trained.
os.makedirs(os.path.dirname(bst_model_path), exist_ok=True)

model.compile(loss='binary_crossentropy',
              optimizer='nadam',
              metrics=['acc'])
#model.summary()
print(STAMP)

# Stop once val_loss has not improved for 3 epochs; only the weights of the
# best epoch (by val_loss) are kept on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss',
                                   save_best_only=True, save_weights_only=True)

# class_weight is only looked up when re-weighting is on, so this cell also
# runs when re_weight is False and no class weights were defined.
hist = model.fit([data_1_train, data_2_train, data_3_train], labels_train,
                 validation_data=([data_1_val, data_2_val, data_3_val], labels_val, weight_val),
                 epochs=25, batch_size=1356, shuffle=True,
                 class_weight=class_weight if re_weight else None,
                 callbacks=[early_stopping, model_checkpoint])

lstm_250_300_0.50_0.40
Train on 792408 samples, validate on 16172 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200


In [None]:
# Restore the weights saved by the checkpoint callback.
model.load_weights(bst_model_path)

# Lowest validation loss recorded during training. It was previously never
# defined, so building the file name below raised NameError.
bst_val_score = min(hist.history['val_loss'])

# Predict with the questions in both orders and average the two results.
# NOTE(review): this rebinds `preds`, which the model-definition cell uses for
# the output tensor.
preds = model.predict([test_data_1, test_data_2, x_test.values], batch_size=1200, verbose=1)
preds += model.predict([test_data_2, test_data_1, x_test.values], batch_size=1200, verbose=1)
preds /= 2

# File-name suffix. The leading underscore follows the trailing one of the
# '%.4f_' score, so the written name contains a double underscore.
STAMP = '_lstm_%d_%d_%.2f_%.2f' % (num_lstm, num_dense, rate_drop_lstm,
                                   rate_drop_dense)

# NOTE(review): this replaces the ids read from the test file with 0..n-1 —
# confirm that matches the expected submission ids.
test_ids = np.arange(len(preds))
submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
# Make sure the target directory exists so the write cannot fail after the
# (long) prediction step.
os.makedirs('submissions', exist_ok=True)
submission.to_csv('submissions/'+'FastText_44dims_'+ '%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

In [50]:
# Write the submission again with a hand-typed score in the file name.
# NOTE(review): '%.4f_' applied to the integer 1637 renders as "1637.0000_";
# if a validation loss was meant, the literal should probably be 0.1637 — confirm.
submission.to_csv(
    path_or_buf='submissions/' + 'FastText_44dims_' + '%.4f_' % (1637) + STAMP + '.csv',
    index=False,
)