## This notebook was run on a 30k-row sample. The results in the report, however, were generated from a 100k-row sample on the alan server.

In [2]:
# import packages
from random import sample
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import random
import time
from tqdm import tqdm
import math

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


from nltk.stem.snowball import SnowballStemmer
import nltk
from nltk.corpus import stopwords 
import string
from gensim.models import KeyedVectors
from wordcloud import WordCloud, STOPWORDS
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Sequential, Model
from keras import initializers, regularizers, constraints, optimizers, layers
stemmer = SnowballStemmer("english")


def stemmed_words(doc):
    """Tokenise ``doc`` and return a generator of Snowball-stemmed tokens.

    Tokenisation is delegated to the module-level ``analyzer``, which is
    looked up when this function is called (it is not defined in this cell),
    so ``analyzer`` must exist before the first call.
    """
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)

Using TensorFlow backend.


In [3]:
# some configuration
# B: number of rows to draw from the full training frame (used as `n` in train.sample).
# NOTE(review): the recorded output of the sampling cell shows 30000 rows, so the saved
# outputs appear to come from a run with B = 30000 -- confirm before comparing numbers.
B = 100000
# seed: random_state used when drawing the subsample.
seed = 1024

In [4]:
# Read the training data from 'new_train.csv' (path is relative to the working directory).
# NOTE(review): presumably this file already carries the engineered columns
# (num_words, num_unique_words, ...) that later cells select -- confirm.
train = pd.read_csv('new_train.csv')

In [5]:
# Draw a random subsample of B rows; random_state=seed fixes which rows are chosen.
train_sub = train.sample(n = B, random_state = seed)
# Show (rows, columns) of the subsample as a sanity check.
print(train_sub.shape)

(30000, 12)


In [6]:
# Build the TF-IDF representation of the sampled questions.
# `analyzer` is the default scikit-learn analyzer; `stemmed_words` reads it as a
# global and stems every token it produces.
analyzer = TfidfVectorizer().build_analyzer()
tfidf_vectorizer = TfidfVectorizer(analyzer=stemmed_words)

# The vectorizer is fitted on the whole subsample (before the train/validation split).
Tfidf_train_vector = tfidf_vectorizer.fit_transform(train_sub["question_text"])

# Dense copy with one named column per stemmed term, then the plain ndarray
# that is concatenated with the hand-crafted features.
Tfidf_train_df = pd.DataFrame(
    Tfidf_train_vector.toarray(),
    columns=tfidf_vectorizer.get_feature_names(),
)
Tfidf_train = Tfidf_train_df.to_numpy()


In [7]:
# Append the hand-crafted per-question features (built in the EDA part)
# as extra columns to the right of the TF-IDF matrix.
new_feature_train = train_sub[
    [
        "num_words",
        "num_unique_words",
        "num_punctuations",
        "num_words_upper",
        "num_words_title",
        "mean_word_len",
    ]
].to_numpy()

train_X = np.hstack((Tfidf_train, new_feature_train))


In [8]:
# Train / validation split: hold out 30% of the rows (test_size=0.3) for validation.
# Only two partitions are produced here; no separate test set is created.
X_tr, X_val, y_tr, y_val = train_test_split(train_X, train_sub['target'], test_size=0.3, random_state=0)
# Last expression of the cell: display the shapes of the four pieces.
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((21000, 20167), (9000, 20167), (21000,), (9000,))

In [9]:
# Candidate values for LogisticRegression's `C` parameter.
C = np.array([0.1, 1, 10, 50, 100])
# One result slot per candidate: validation accuracy and validation F1.
C_score, C_f1score = np.zeros(C.size), np.zeros(C.size)

In [10]:
# Fit one L1-penalised logistic regression per candidate C and record its
# validation accuracy and F1 in the matching slot of C_score / C_f1score.
for i, c in enumerate(C):
    lreg = LogisticRegression(solver = 'liblinear', penalty='l1', C=c).fit(X_tr, y_tr)
    # Predict once and reuse the labels for both metrics
    # (lreg.score would run a second, identical prediction pass).
    val_pred = lreg.predict(X_val)
    C_score[i] = metrics.accuracy_score(y_val, val_pred)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    C_f1score[i] = f1_score(y_val, val_pred)

In [11]:
# Report the validation F1 recorded for each C. The list of C values in the
# message is hard-coded text and has to be kept in sync with the C array by hand.
print("The F1 score for Logistic Regression with C = [0.1, 1, 10, 50, 100], is {}".format(C_f1score))

The F1 score for Logistic Regression with C = [0.1, 1, 10, 50, 100], is [0.14710485 0.41826923 0.44989775 0.41722488 0.40607211]


# Gaussian Naive Bayes

In [20]:
from sklearn.naive_bayes import GaussianNB

In [21]:
# Gaussian Naive Bayes baseline on the same train/validation matrices.
# X_tr / X_val must be numeric feature matrices here: if this cell is run after
# the BiLSTM cell has rebound X_tr to the raw DataFrame, GaussianNB fails with
# "could not convert string to float".
gnb = GaussianNB().fit(X_tr, y_tr)
# scikit-learn metrics take (y_true, y_pred) in that order.
f1score_GNB = f1_score(y_val, gnb.predict(X_val))
print("The F1 score for Gaussian Naive Bayes is {}".format(f1score_GNB))

ValueError: could not convert string to float: '622ae9e6e2f4c8e2fb71'

## Unpretrained BiLSTM

In [14]:
## Settings for the neural models
embed_size = 300      # length of each word vector
max_features = 10000  # vocabulary size kept by the tokenizer (rows of the embedding matrix)
maxlen = 100          # number of tokens kept per question

## Train / validation split on the raw subsample.
## Note: this rebinds X_tr, X_val, y_tr, y_val (and train_X below).
X_tr, X_val, y_tr, y_val = train_test_split(
    train_sub, train_sub['target'], test_size=0.3, random_state=0
)
print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape)

## Raw question strings, with missing entries replaced by a placeholder token
train_X = X_tr["question_text"].fillna("_na_").values
val_X = X_val["question_text"].fillna("_na_").values

## Fit the vocabulary on the training questions only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

## Convert text to integer id sequences and pad them to a fixed length of maxlen
train_X = pad_sequences(tokenizer.texts_to_sequences(train_X), maxlen=maxlen)
val_X = pad_sequences(tokenizer.texts_to_sequences(val_X), maxlen=maxlen)

## Targets as plain arrays
train_y, val_y = y_tr.values, y_val.values

(21000, 12) (9000, 12) (21000,) (9000,)


In [15]:
# Jue's newly constructed NN model
# Embedding learned from scratch -> BiLSTM -> max over time -> small dense head.
model = Sequential([
    Embedding(max_features, embed_size, input_length=maxlen),
    Bidirectional(LSTM(40, dropout=0.2, recurrent_dropout=0.3, return_sequences=True)),
    GlobalMaxPool1D(),
    Dropout(0.3),
    Dense(20, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(train_X, train_y, batch_size=1024, epochs=4, validation_data=(val_X, val_y))

# Validation probabilities, then F1 for every decision threshold from 0.10 to 0.60.
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.round(np.arange(0.1, 0.601, 0.01), 2):
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_noemb_val_y>thresh).astype(int))))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          3000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 80)           109120    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 80)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 80)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1620      
_________________________________________________________________
dropout_2 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 21000 samples, validate on 9000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
F1 score at threshold 0.1 is 0.3698296836982968
F1 score at threshold 0.11 is 0.2868965517241379
F1 score at threshold 0.12 is 0.20512820512820512
F1 score at threshold 0.13 is 0.13759999999999997
F1 score at threshold 0.14 is 0.11111111111111112
F1 score at threshold 0.15 is 0.07679465776293824
F1 score at threshold 0.16 is 0.04753820033955857
F1 score at threshold 0.17 is 0.027538726333907054
F1 score at threshold 0.18 is 0.006968641114982579
F1 score at threshold 0.19 is 0.006980802792321117
F1 score at threshold 0.2 is 0.003496503496503496
F1 score at threshold 0.21 is 0.003496503496503496
F1 score at threshold 0.22 is 0.0
F1 score at threshold 0.23 is 0.0
F1 score at threshold 0.24 is 0.0
F1 score at threshold 0.25 is 0.0
F1 score at threshold 0.26 is 0.0
F1 score at threshold 0.27 is 0.0
F1 score at threshold 0.28 is 0.0
F1 score at threshold 0.29 is 0.0
F1 score at threshold 0.3 is 0.0
F1 sc

## Pretrained BiRNN

In [16]:
# BiLSTM whose embedding layer is initialised from the GoogleNews Word2Vec vectors.
EMBEDDING_FILE = './embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary= True)

word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of V words
# needs V + 1 rows. Without the +1, a vocabulary smaller than max_features would
# index past the end of the matrix and mismatch the Embedding layer's shape.
nb_words = min(max_features, len(word_index) + 1)
# Rows for words missing from Word2Vec keep this small uniform init in [-0.1, 0.1).
embedding_matrix = (np.random.rand(nb_words, embed_size) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= nb_words: continue
    if word in embeddings_index:
        embedding_vector = embeddings_index.get_vector(word)
        embedding_matrix[i] = embedding_vector


inp = Input(shape=(maxlen,))
# input_dim must equal the number of rows in embedding_matrix.
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(40,dropout=0.2, recurrent_dropout=0.3, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.3)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(train_X, train_y, batch_size=1024, epochs=4, validation_data=(val_X, val_y))
# Validation probabilities, then F1 for every decision threshold from 0.10 to 0.50.
pred_word2vec_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("For Word2Vec, F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_word2vec_val_y>thresh).astype(int))))

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          3000000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 80)           109120    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 80)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 80)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 20)                1620      
_________________________________________________________________
dropout_4 (Dropout)          (None, 20)                0   

In [17]:
# BiLSTM whose embedding layer is initialised from the GloVe 840B 300-d vectors.
EMBEDDING_FILE = './embeddings/glove.840B.300d/glove.840B.300d.txt'
def get_coefs(word,*arr):
    """Turn one split embedding line into a (word, float32 vector) pair."""
    return word, np.asarray(arr, dtype='float32')
# Read inside a context manager so the file handle is closed, and decode as
# UTF-8 explicitly instead of relying on the platform default encoding.
with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)

# np.stack needs a real sequence; passing a dict view is deprecated and is
# rejected by newer NumPy releases.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of V words
# needs V + 1 rows. Without the +1, a vocabulary smaller than max_features would
# index past the end of the matrix and mismatch the Embedding layer's shape.
nb_words = min(max_features, len(word_index) + 1)
# Rows for words missing from GloVe keep a random init drawn with the
# mean / std of the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

inp = Input(shape=(maxlen,))
# input_dim must equal the number of rows in embedding_matrix.
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(40,dropout=0.2, recurrent_dropout=0.3, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.3)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(train_X, train_y, batch_size=1024, epochs=4, validation_data=(val_X, val_y))
# Validation probabilities, then F1 for every decision threshold from 0.10 to 0.50.
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("For Glove, F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_glove_val_y>thresh).astype(int))))

  if (await self.run_code(code, result,  async_=asy)):


Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 300)          3000000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 80)           109120    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 80)                0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 80)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 20)                1620      
_________________________________________________________________
dropout_6 (Dropout)          (None, 20)                0   

In [18]:
# BiLSTM whose embedding layer is initialised from the wiki-news fastText vectors.
EMBEDDING_FILE = './embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
def get_coefs(word,*arr):
    """Turn one split embedding line into a (word, float32 vector) pair."""
    return word, np.asarray(arr, dtype='float32')
# Read inside a context manager so the file handle is closed, and decode as
# UTF-8 explicitly instead of relying on the platform default encoding.
# Lines of 100 characters or fewer are skipped (too short to hold a full vector).
with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file if len(o)>100)

# np.stack needs a real sequence; passing a dict view is deprecated and is
# rejected by newer NumPy releases.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of V words
# needs V + 1 rows. Without the +1, a vocabulary smaller than max_features would
# index past the end of the matrix and mismatch the Embedding layer's shape.
nb_words = min(max_features, len(word_index) + 1)
# Rows for words missing from fastText keep a random init drawn with the
# mean / std of the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

inp = Input(shape=(maxlen,))
# input_dim must equal the number of rows in embedding_matrix.
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(40,dropout=0.2, recurrent_dropout=0.3, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.3)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_y, batch_size=1024, epochs=4, validation_data=(val_X, val_y))
# Validation probabilities, then F1 for every decision threshold from 0.10 to 0.50.
pred_fasttext_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("For fasttext, F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_fasttext_val_y>thresh).astype(int))))

Train on 21000 samples, validate on 9000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
For fasttext, F1 score at threshold 0.1 is 0.449438202247191
For fasttext, F1 score at threshold 0.11 is 0.4606879606879607
For fasttext, F1 score at threshold 0.12 is 0.4726536124240379
For fasttext, F1 score at threshold 0.13 is 0.4857768052516411
For fasttext, F1 score at threshold 0.14 is 0.48857368006304175
For fasttext, F1 score at threshold 0.15 is 0.49289891395154556
For fasttext, F1 score at threshold 0.16 is 0.48586572438162545
For fasttext, F1 score at threshold 0.17 is 0.47955390334572495
For fasttext, F1 score at threshold 0.18 is 0.4683794466403162
For fasttext, F1 score at threshold 0.19 is 0.44720496894409945
For fasttext, F1 score at threshold 0.2 is 0.4312026002166847
For fasttext, F1 score at threshold 0.21 is 0.4173318129988598
For fasttext, F1 score at threshold 0.22 is 0.3933253873659118
For fasttext, F1 score at threshold 0.23 is 0.3615960099750623
For fasttext, F1 score at t

In [19]:
# BiLSTM whose embedding layer is initialised from the paragram_300_sl999 vectors.
EMBEDDING_FILE = './embeddings/paragram_300_sl999/paragram_300_sl999.txt'
def get_coefs(word,*arr):
    """Turn one split embedding line into a (word, float32 vector) pair."""
    return word, np.asarray(arr, dtype='float32')
# Read inside a context manager so the file handle is closed after parsing.
# Undecodable bytes are dropped (errors='ignore'); lines of 100 characters or
# fewer are skipped (too short to hold a full vector).
with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file if len(o)>100)

# np.stack needs a real sequence; passing a dict view is deprecated and is
# rejected by newer NumPy releases.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of V words
# needs V + 1 rows. Without the +1, a vocabulary smaller than max_features would
# index past the end of the matrix and mismatch the Embedding layer's shape.
nb_words = min(max_features, len(word_index) + 1)
# Rows for words missing from paragram keep a random init drawn with the
# mean / std of the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

inp = Input(shape=(maxlen,))
# input_dim must equal the number of rows in embedding_matrix.
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(40,dropout=0.2, recurrent_dropout=0.3, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.3)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_y, batch_size=1024, epochs=4, validation_data=(val_X, val_y))
# Validation probabilities, then F1 for every decision threshold from 0.10 to 0.50.
pred_paragram_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("For paragram, F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_paragram_val_y>thresh).astype(int))))

Train on 21000 samples, validate on 9000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
For paragram, F1 score at threshold 0.1 is 0.4797092671108419
For paragram, F1 score at threshold 0.11 is 0.4778987828315182
For paragram, F1 score at threshold 0.12 is 0.4903397734843438
For paragram, F1 score at threshold 0.13 is 0.49582172701949856
For paragram, F1 score at threshold 0.14 is 0.4974765681326604
For paragram, F1 score at threshold 0.15 is 0.494415487714073
For paragram, F1 score at threshold 0.16 is 0.5042405551272167
For paragram, F1 score at threshold 0.17 is 0.5094936708860759
For paragram, F1 score at threshold 0.18 is 0.5094806265457542
For paragram, F1 score at threshold 0.19 is 0.5161290322580646
For paragram, F1 score at threshold 0.2 is 0.5132275132275133
For paragram, F1 score at threshold 0.21 is 0.510948905109489
For paragram, F1 score at threshold 0.22 is 0.5004686035613871
For paragram, F1 score at threshold 0.23 is 0.4932821497120921
For paragram, F1 score at thresh