In [57]:
import numpy as np
import pandas as pd

### Exploratory Data Analysis

In [58]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

In [None]:
train_df

In [59]:
train_df['target'].values

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [27]:
from sklearn.metrics import f1_score

In [12]:
train_df.question_text.str.len().describe().round(1)

count    1306122.0
mean          70.7
std           38.8
min            1.0
25%           45.0
50%           60.0
75%           85.0
max         1017.0
Name: question_text, dtype: float64

In [49]:
test_df.question_text.str.len().describe().round(1)

count    56370.0
mean        70.5
std         38.7
min         11.0
25%         45.0
50%         60.0
75%         85.0
max        588.0
Name: question_text, dtype: float64

In [33]:
from nltk import word_tokenize

In [35]:
train_df[train_df.question_text.str.len() > 100].target.value_counts()

0    187558
1     32373
Name: target, dtype: int64

In [36]:
train_df['char_length'] = train_df.question_text.str.len()

In [53]:
test_df[test_df.question_text.str.len() < 20].question_text.head()

16        What is fapping?
632    What is NuAge Skin?
732          Is UPSE good?
907        How can I post?
973    Who created idioms?
Name: question_text, dtype: object

In [62]:
positive = train_df[train_df.target == 1]

In [20]:
positive.sample(1).iloc[0][1]

'Why are israelis destroying churches/mosques in Israel? Where is the outrage?'

In [52]:
positive.head()

Unnamed: 0,qid,question_text,target,char_length
22,0000e91571b60c2fb487,Has the United States become the largest dicta...,1,67
30,00013ceca3f624b09f42,Which babies are more sweeter to their parents...,1,86
110,0004a7fcb2bf73076489,If blacks support school choice and mandatory ...,1,102
114,00052793eaa287aff1e1,I am gay boy and I love my cousin (boy). He is...,1,137
115,000537213b01fd77b58a,Which races have the smallest penis?,1,36


In [43]:
train_df.iloc[107]

qid                                           000485e6dd4b149fe051
question_text    Can I start freelancing after finishing Udacit...
target                                                           0
char_length                                                     75
Name: 107, dtype: object

In [44]:
train_df[train_df.question_text.str.startswith('What is the meaning of')].head()

Unnamed: 0,qid,question_text,target,char_length
401,0012d9af133219e1b370,What is the meaning of relationship with a gir...,0,62
853,0028f67ccff988aee531,What is the meaning of nudge in hike?,0,37
1341,004299fc61797c2c9895,What is the meaning of life according to Chris...,0,74
1958,006192347b2ba501281b,"What is the meaning of the word ""Analhak""?",0,42
3586,00b30616acc5be7a2b19,"What is the meaning of ""ask of""?",0,32


In [28]:
del train_df, test_df

### Baseline CNN

In [29]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
import time

In [30]:
max_features=95000
maxlen=70
embed_size=300

In [77]:
def data_preparation(predict=False, sub_train=True, data_dir=".", random_state=2018):
    """Read the question CSVs and convert the text into padded word-index sequences.

    Steps: load ``train.csv`` (optionally keeping a 30% subsample), split it
    into train / validation parts (8% validation), fit a Keras ``Tokenizer``
    on the training text only, convert every text to index sequences and pad
    them with ``pad_sequences(..., maxlen=maxlen)``.

    Relies on the notebook-level settings ``max_features`` and ``maxlen`` and
    on ``Tokenizer``, ``pad_sequences`` and ``train_test_split`` being imported.

    Parameters
    ----------
    predict : bool
        When True, ``test.csv`` is loaded and encoded as well, and the fitted
        tokenizer and the validation frame are returned too.
    sub_train : bool
        When True, only a 30% sample of the training rows is used.
    data_dir : str
        Directory that contains ``train.csv`` (and ``test.csv``).
    random_state : int or None
        Seed for both the subsample and the train/validation split, so that
        repeated runs see the same rows. Pass None for a different draw each run.

    Returns
    -------
    tuple
        ``(train_X, val_X, train_y, val_y, word_index)`` when ``predict`` is
        False; ``(train_X, val_X, test_X, train_y, val_y, word_index,
        tokenizer, val_df)`` when ``predict`` is True.
    """
    import os

    start_time = time.time()
    train_df = pd.read_csv(os.path.join(data_dir, "train.csv"))
    if sub_train:
        # Seeded: an unseeded sample makes every later number irreproducible.
        train_df = train_df.sample(frac=0.3, random_state=random_state)
    print("Train shape : ",train_df.shape)

    test_X = None
    if predict:
        test_df = pd.read_csv(os.path.join(data_dir, "test.csv"))
        print("Test shape : ",test_df.shape)
        test_X = test_df["question_text"].fillna("_##_").values

    ## split to train and val
    train_df, val_df = train_test_split(train_df, test_size=0.08, random_state=random_state)

    ## fill up the missing values (same placeholder as the test text; a NaN
    ## would otherwise reach the tokenizer as a float and break it)
    train_X = train_df["question_text"].fillna("_##_").values
    val_X = val_df["question_text"].fillna("_##_").values

    ## Tokenize the sentences (vocabulary comes from the training part only)
    tokenizer = Tokenizer(num_words=max_features, filters='')
    print('fitting text to tokenizer..')
    check_point1 = time.time()
    tokenizer.fit_on_texts(list(train_X))
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    check_point2 = time.time()
    print('fitting took {:.2f} seconds to finish'.format(check_point2 - check_point1))

    print('transforming text to sequence of word indices..')
    train_X = tokenizer.texts_to_sequences(train_X)
    val_X = tokenizer.texts_to_sequences(val_X)
    check_point3 = time.time()
    print('transforming took {:.2f} seconds to finish'.format(check_point3 - check_point2))
    if predict:
        test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences
    print('padding sentence to the same length..')
    train_X = pad_sequences(train_X, maxlen=maxlen)
    val_X = pad_sequences(val_X, maxlen=maxlen)
    check_point4 = time.time()
    print('padding took {:.2f} seconds to finish'.format(check_point4 - check_point3))

    if predict:
        test_X = pad_sequences(test_X, maxlen=maxlen)

    print('it took {:.2f} seconds to finish data prepartation'.format(time.time() - start_time))

    ## Get the target values
    train_y = train_df['target'].values
    val_y = val_df['target'].values

    if predict:
        return train_X, val_X, test_X, train_y, val_y, word_index, tokenizer, val_df
    return train_X, val_X, train_y, val_y, word_index

In [78]:
train_X, val_X, test_X, train_y, val_y, word_index, tokenizer, val_df = data_preparation(predict=True)

Train shape :  (391837, 3)
Test shape :  (56370, 2)
fitting text to tokenizer..
Found 202140 unique tokens.
fitting took 9.27 seconds to finish
transforming text to sequence of word indices..
transforming took 8.00 seconds to finish
padding sentence to the same length..
padding took 4.37 seconds to finish
it took 27.01 seconds to finish data prepartation


In [133]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [54]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [98]:
def load_glove(word_index, embedding_fname='glove.840B.300d.txt', embedding_dir='./glove.840B.300d/'):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Every line of the file is parsed as ``word v1 v2 ... vN`` (space
    separated). Rows of the returned matrix are first drawn from a normal
    distribution with the mean / std of all loaded vectors, then overwritten
    with the pretrained vector for every word that has one.

    Relies on the notebook-level ``max_features`` setting and on ``tqdm``.

    Parameters
    ----------
    word_index : dict
        Mapping word -> integer index (e.g. ``tokenizer.word_index``).
    embedding_fname : str
        File name of the embeddings file inside ``embedding_dir``.
    embedding_dir : str
        Directory that holds the embeddings file.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nb_words, embed_dim)`` where ``nb_words`` is
        ``min(max_features, len(word_index) + 1)`` and row ``i`` belongs to
        the word whose index is ``i``.
    """
    import os

    EMBEDDING_FILE = os.path.join(embedding_dir, embedding_fname)

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager so the (multi-GB) file handle is closed deterministically.
    with open(EMBEDDING_FILE, encoding='utf-8') as emb_file:
        embeddings_index = dict(get_coefs(*line.split(" ")) for line in emb_file)

    # np.stack needs a real sequence; a dict view is rejected by newer NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_dim = all_embs.shape[1]

    # Word indices start at 1 (0 is the padding index), so a vocabulary of N
    # words needs N + 1 rows. Without the +1 the last word overflowed the
    # matrix whenever the vocabulary was smaller than max_features.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_dim))
    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [104]:
embedding_matrix1 = load_glove(word_index)
# , embedding_fname='glove.6B.50d.txt'

A Jupyter Widget




In [38]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate, Lambda
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

In [88]:
def model_cnn(embedding_matrix):
    """Build and compile a multi-width convolutional text classifier.

    Architecture: an Embedding layer initialised with ``embedding_matrix``,
    four parallel Conv2D branches whose kernels span 1, 2, 3 and 5 tokens
    across the full embedding width, max-pooling over the sequence in each
    branch, concatenation, dropout (0.1) and a single sigmoid output unit.

    The vocabulary size and embedding width are read from
    ``embedding_matrix`` itself, so the layer dimensions always agree with
    the supplied weights. Input length comes from the notebook-level
    ``maxlen`` setting.

    Parameters
    ----------
    embedding_matrix : array-like, shape (vocab_size, embed_dim)
        Initial weights for the Embedding layer.

    Returns
    -------
    keras Model
        Compiled with binary cross-entropy loss, the ``adam`` optimizer and
        the accuracy metric.
    """
    filter_sizes = [1, 2, 3, 5]
    num_filters = 36

    # Derive the dimensions from the weights instead of the notebook globals:
    # a matrix of any other shape would otherwise fail when the weights are set.
    vocab_size, embed_dim = np.shape(embedding_matrix)

    inp = Input(shape=(maxlen,))
    x = Embedding(vocab_size, embed_dim, weights=[embedding_matrix])(inp)
    # Add a trailing channel axis so the sequence can be fed to Conv2D.
    x = Reshape((maxlen, embed_dim, 1))(x)

    maxpool_pool = []
    for width in filter_sizes:
        conv = Conv2D(num_filters, kernel_size=(width, embed_dim),
                      kernel_initializer='he_normal', activation='elu')(x)
        # A kernel of `width` tokens leaves maxlen - width + 1 positions;
        # pooling over all of them keeps one value per filter.
        maxpool_pool.append(MaxPool2D(pool_size=(maxlen - width + 1, 1))(conv))

    z = Concatenate(axis=1)(maxpool_pool)
    z = Flatten()(z)
    z = Dropout(0.1)(z)

    outp = Dense(1, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [108]:
# Train the CNN for two single-epoch rounds; after each round, search for the
# decision threshold that maximises F1 on the validation set.
model = model_cnn(embedding_matrix1)

for e in range(2):
    model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))
    pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)

    # Reset the search each round; candidates are 0.10, 0.11, ..., 0.50.
    best_thresh = 0.5
    best_score = 0.0
    for thresh in np.arange(0.1, 0.501, 0.01):
        # Round away floating-point drift from np.arange (e.g. 0.30000000000000004).
        thresh = np.round(thresh, 2)
        score = f1_score(val_y, (pred_val_y > thresh).astype(int))
        # Strict '>' keeps the lowest threshold among ties.
        if score > best_score:
            best_thresh = thresh
            best_score = score

    print("Val F1 Score: {:.4f}".format(best_score))

# pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)

Train on 360490 samples, validate on 31347 samples
Epoch 1/1
Val F1 Score: 0.6191
Train on 360490 samples, validate on 31347 samples
Epoch 1/1




Val F1 Score: 0.6239


NameError: name 'test_X' is not defined

In [80]:
pred_val_y = model2.predict([val_X], batch_size=1024, verbose=0)

In [81]:
mask = [each[0] for each in (pred_val_y > 0.19)]

In [83]:
val_df[mask].to_csv('val_pred.csv')

In [112]:
model.predict(test_X)

array([[ 0.00393321],
       [ 0.00129482],
       [ 0.00010186],
       ..., 
       [ 0.0008012 ],
       [ 0.00826971],
       [ 0.01254148]], dtype=float32)

In [45]:
from keras.models import load_model

# model.save('my_model2.h5', 'w') 

In [46]:
model2 = load_model('my_model2.h5')

In [49]:
res = model2.predict(test_X)

In [65]:
mask = [each[0] for each in (res > 0.19)]

In [53]:
res.shape

(56370, 1)

In [43]:
test_X.shape

(56370, 70)

### Best decision threshold found on the validation set

In [134]:
best_thresh

0.19

### Working function for prediction (it is not fast, though)

In [54]:
# Tokenizer and model are loaded once and reused by predict_label; re-running
# this cell clears the cache.
_PREDICT_LABEL_CACHE = {}


def predict_label(query='What is happiness?', maxlen = 70, best_thresh=0.19):
    """Classify a single question with the saved tokenizer and saved model.

    The tokenizer is read from ``tokenizer.pickle`` and the model from
    ``my_model2.h5`` (both relative to the working directory). Loading them
    dominates the run time, so they are cached after the first call instead
    of being reloaded for every query.

    Parameters
    ----------
    query : str
        The question text to classify.
    maxlen : int
        Length the index sequence is padded to before prediction.
    best_thresh : float
        Scores strictly above this value are labelled 1.

    Returns
    -------
    int
        1 if the model score exceeds ``best_thresh``, otherwise 0. The
        elapsed time and the raw score array are printed as a side effect.
    """
    from keras.models import load_model
    from keras.preprocessing.sequence import pad_sequences
    import pickle
    import time
    start = time.time()

    if 'tokenizer' not in _PREDICT_LABEL_CACHE:
        # NOTE(security): pickle.load can execute arbitrary code; only load a
        # tokenizer file that this notebook wrote itself.
        with open('tokenizer.pickle', 'rb') as handle:
            _PREDICT_LABEL_CACHE['tokenizer'] = pickle.load(handle)
    if 'model' not in _PREDICT_LABEL_CACHE:
        _PREDICT_LABEL_CACHE['model'] = load_model('my_model2.h5')
    tokenizer = _PREDICT_LABEL_CACHE['tokenizer']
    model = _PREDICT_LABEL_CACHE['model']

    test_X = tokenizer.texts_to_sequences([query])
    test_X = pad_sequences(test_X, maxlen=maxlen)
    score = model.predict(test_X)
    print('took {:.2f} seconds to finish'.format(time.time() - start))
    print(score)
    # score holds one row with one value for the single query.
    return 1 if score[0][0] > best_thresh else 0

In [55]:
predict_label('Which races have the smallest penis?	')

took 6.27 seconds to finish
[[ 0.00302711]]


0

## highlight words

### TF-IDF

In [59]:
train_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [63]:
corpus = list(positive.question_text)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [64]:
feature_names = vectorizer.get_feature_names()

In [65]:
vectorizer.vocabulary_['fuck']

13814

In [66]:
from random import randint

In [67]:
# Pick a random row of the TF-IDF matrix and list its terms with their weights.
# random.randint includes BOTH endpoints, so the upper bound has to be the last
# valid row index; it is taken from X rather than hard-coded so it cannot go stale.
doc = randint(0, X.shape[0] - 1)
feature_index = X[doc,:].nonzero()[1]
tfidf_scores = zip(feature_index, [X[doc, x] for x in feature_index])

for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
    print(w, s)

are 0.06724498987414163
to 0.17567827433455765
why 0.05241173128141058
is 0.0636212093984737
do 0.06419208940178942
it 0.08115724057937664
considered 0.16891314396138082
so 0.09124197458508662
now 0.14474915181836895
people 0.08397287609637323
many 0.11790505761367612
think 0.1156672194710421
good 0.1430831844359977
old 0.14881925841170807
health 0.19960560754904966
year 0.3061701228726695
26 0.49483210960155244
olds 0.254549686239098
officially 0.2525913039829496
idea 0.1904335041601847
teenagers 0.22978386340895543
allow 0.17707908131054775
insurance 0.23538596701197168
stick 0.22774087517639086
mommy 0.27860047670037996


### Logistic Regression

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = list(train_df.question_text)
cvectorizer = CountVectorizer(ngram_range=(1,2))
XL = cvectorizer.fit_transform(corpus)

In [69]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression(C=1.0)
lg.fit(XL, list(train_df.target))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [70]:
len(lg.coef_[0])

3195271

In [71]:
features = cvectorizer.get_feature_names()

In [72]:
feature_index = XL[397,:].nonzero()[1]

In [73]:
lg_scores = list(zip(features, lg.coef_[0]))

In [74]:
positive.sample(1).iloc[0]['question_text']

'Do all Muslims think non-Muslims go to hell?'

In [75]:
train_df.iloc[22]['question_text']

'Has the United States become the largest dictatorship in the world?'

In [76]:
[lg_scores[x] for x in feature_index]

[('do', -0.05765709028742363),
 ('do moms', 0.4288182345969702),
 ('have', 0.03596763207339017),
 ('have sex', 0.38521374112419804),
 ('moms', 1.0969324090070018),
 ('moms have', 0.3057235643949789),
 ('sex', 1.009243853334876),
 ('sex with', 1.6025979871460407),
 ('sons', 0.6369177775802709),
 ('their', 0.40886481490242),
 ('their sons', 0.6909477098510638),
 ('with', 0.005402496547504425),
 ('with their', -0.008730347028346705)]

## logistic dictionary

In [77]:
score_dict = dict(lg_scores)

In [78]:
cvectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [79]:
import pickle

with open('lg_score.pickle', 'wb') as handle:
    pickle.dump(score_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [80]:
with open('count_vectorizer.pickle', 'wb') as handle:
    pickle.dump(cvectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [81]:
with open('lg_coef.pickle', 'wb') as handle:
    pickle.dump(lg.coef_, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [82]:
# Reload the pickled CountVectorizer and logistic-regression coefficients, then
# look up the coefficients of the n-grams that occur in one example question.
# NOTE(security): pickle.load can execute arbitrary code - only load files that
# were written by this notebook.
import pickle
with open('count_vectorizer.pickle', 'rb') as handle:
    cv = pickle.load(handle)
with open('lg_coef.pickle', 'rb') as handle:
    lg_coef = pickle.load(handle)
    
transformed_query = cv.transform(['Has the United States become the largest dictatorship in the world?'])
# Column indices (vocabulary ids) of the n-grams present in the query.
indices = transformed_query.nonzero()[1]
# Coefficient of each of those n-grams (row 0 of the coefficient array).
coef_list = lg_coef[0][indices]
# Positions within coef_list of the three largest coefficients, largest first.
largest_n = coef_list.argsort()[-3:][::-1]

In [87]:
np.array(cv.get_feature_names())[indices]

array(['become', 'become the', 'dictatorship', 'dictatorship in', 'has',
       'has the', 'in', 'in the', 'largest', 'largest dictatorship',
       'states', 'states become', 'the', 'the largest', 'the united',
       'the world', 'united', 'united states', 'world'], dtype='<U236')

In [None]:
import pickle

In [88]:
def scoring_words(query='Has the United States become the largest dictatorship in the world?',
                  score_path='../data/lg_score1.pickle', threshold=1.0):
    """Find the positions of the words in ``query`` that carry a high score.

    The lower-cased query is tokenized with ``nltk.word_tokenize``; every
    token is looked up in the pickled ``{term: score}`` dictionary (missing
    terms score 0). Tokens scoring at least ``threshold`` are then matched
    back to the whitespace-separated words of the query.

    Only single-token scores are used; bigram entries in the dictionary are
    not consulted.

    Parameters
    ----------
    query : str
        The question to analyse.
    score_path : str
        Path of the pickled score dictionary.
    threshold : float
        Minimum score for a token to be highlighted.

    Returns
    -------
    list of int
        Ascending, duplicate-free indices into ``query.lower().split()`` of
        the words to highlight. The per-token scores and the highlighted
        tokens are printed as a side effect.
    """
    import pickle
    from nltk import word_tokenize

    tokens = word_tokenize(query.lower())

    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # score file that was produced by this notebook.
    with open(score_path, 'rb') as handle:
        score_dict = pickle.load(handle)

    scores = [score_dict.get(token, 0) for token in tokens]
    print(scores)

    # Pair each token with its own score. Looking positions up by score value
    # would map equal scores to the first token only and drop the others.
    words = []
    for token, score in zip(tokens, scores):
        if score >= threshold and token not in words:
            words.append(token)
    print(words)

    ans = []
    for ind, chunk in enumerate(query.lower().split()):
        # Allow one extra character so that "terrorists," still matches
        # "terrorists"; any() records each position at most once.
        if any(word in chunk and len(chunk) - len(word) <= 1 for word in words):
            ans.append(ind)
    return ans

In [89]:
scoring_words("Why don't USA citizens realize that Trump is rapidly doing what terrorists could not, i.e., push the country towards irrevocable catastrophe?")

[0.516075785049279, -0.05765709028742363, 0, 0.2537544305507007, 0.5821064684668563, 0.43686076674634394, 0.10958945321485326, 1.7259926004064412, -0.016849850760210776, 0.3445836623080487, -0.18207464419112712, -0.8467612428737705, 1.8839439282359793, -0.34902953204465775, 0.12487766041081051, 0, 0, 0, 0.050726782377775885, 0.2005054389120148, 0.3349759906450282, -0.1460054169221432, -0.04507660825735099, -0.43625239740076927, 0]
['terrorists', 'trump']


[6, 11]

In [None]:
scoring_words("Why don't poor countries print more money to use for paying for education, etc.?")