In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
tqdm.pandas(desc='progress-bar')

import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
def read():
    df = pd.read_csv('D:/Datasets/hackerearth/hm_train.csv')
    df.drop(['reflection_period', 'num_sentence'], axis=1, inplace=True)
    labels = df.predicted_category
    df.drop(['predicted_category'], axis=1, inplace=True)
    return df, labels

In [3]:
df, labels = read()

In [4]:
df.head()

Unnamed: 0,hmid,cleaned_hm
0,27673,I went on a successful date with someone I fel...
1,27674,I was happy when my son got 90% marks in his e...
2,27675,I went to the gym this morning and did yoga.
3,27676,We had a serious talk with some friends of our...
4,27677,I went with grandchildren to butterfly display...


In [5]:
print(len(df), len(labels))

60321 60321


In [6]:
def tokenize(sentence):
    try:
        tokens = nltk.word_tokenize(sentence.lower())
        return tokens
    except:
        return 'NC'

In [7]:
def postprocess(data):
    data['tokens'] = data['cleaned_hm'].progress_map(tokenize)
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    return data

In [8]:
df.head()

Unnamed: 0,hmid,cleaned_hm
0,27673,I went on a successful date with someone I fel...
1,27674,I was happy when my son got 90% marks in his e...
2,27675,I went to the gym this morning and did yoga.
3,27676,We had a serious talk with some friends of our...
4,27677,I went with grandchildren to butterfly display...


In [9]:
df2 = postprocess(df)

progress-bar: 100%|████████████████████████████████████████████████████████████| 60321/60321 [00:08<00:00, 7074.49it/s]


In [10]:
df2.tail()

Unnamed: 0,index,hmid,cleaned_hm,tokens
60316,60316,88299,I got together with my best friend and baked c...,"[i, got, together, with, my, best, friend, and..."
60317,60317,88300,I went to a restaurant with friends,"[i, went, to, a, restaurant, with, friends]"
60318,60318,88301,The other day on Mechanical Turk I made over f...,"[the, other, day, on, mechanical, turk, i, mad..."
60319,60319,88302,Finished the semester today and aced majority ...,"[finished, the, semester, today, and, aced, ma..."
60320,60320,88303,An event that made me happy in the past 3 mont...,"[an, event, that, made, me, happy, in, the, pa..."


In [11]:
len(df2)

60321

In [12]:
df_to_train = df2.drop(['index', 'cleaned_hm'], axis=1)

In [13]:
df_to_train.head()

Unnamed: 0,hmid,tokens
0,27673,"[i, went, on, a, successful, date, with, someo..."
1,27674,"[i, was, happy, when, my, son, got, 90, %, mar..."
2,27675,"[i, went, to, the, gym, this, morning, and, di..."
3,27676,"[we, had, a, serious, talk, with, some, friend..."
4,27677,"[i, went, with, grandchildren, to, butterfly, ..."


In [14]:
df_to_train['labels'] = labels

In [15]:
df_to_train.head()

Unnamed: 0,hmid,tokens,labels
0,27673,"[i, went, on, a, successful, date, with, someo...",affection
1,27674,"[i, was, happy, when, my, son, got, 90, %, mar...",affection
2,27675,"[i, went, to, the, gym, this, morning, and, di...",exercise
3,27676,"[we, had, a, serious, talk, with, some, friend...",bonding
4,27677,"[i, went, with, grandchildren, to, butterfly, ...",affection


In [16]:
x_train, x_test, y_train, y_test = train_test_split(df_to_train, labels, test_size=0.2)

In [17]:
x_train.drop(['labels'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [18]:
x_train.head()

Unnamed: 0,hmid,tokens
54087,82037,"[working, hard, to, develop, my, android, deve..."
2139,29826,"[my, daughter, started, saying, my, baby, daug..."
50328,78265,"[i, found, out, i, got, an, a, in, one, of, my..."
25845,53658,"[i, got, a, surprise, gift, from, my, manger, .]"
5178,32892,"[i, went, to, a, three, hour, painting, class,..."


In [19]:
x_test.drop(['labels'], axis=1, inplace=True)

In [20]:
x_test.head()

Unnamed: 0,hmid,tokens
11140,38881,"[i, met, one, of, my, childhood, friends.we, m..."
400,28073,"[i, listened, to, some, music, and, heard, an,..."
55682,83640,"[we, found, out, that, our, very, sick, cat, w..."
19962,47746,"[made, plans, to, meet, with, a, business, own..."
8942,36674,"[i, saw, a, loon, as, i, was, taking, my, afte..."


In [21]:
y_train.head()

54087         achievement
2139            affection
50328         achievement
25845           affection
5178     enjoy_the_moment
Name: predicted_category, dtype: object

In [22]:
y_train.unique()

array(['achievement', 'affection', 'enjoy_the_moment', 'nature',
       'leisure', 'bonding', 'exercise'], dtype=object)

In [23]:
label_to_cats = {'achievement':      (1, 0, 0, 0, 0, 0, 0),
              'affection':        (0, 1, 0, 0, 0, 0, 0),
              'enjoy_the_moment': (0, 0, 1, 0, 0, 0, 0),
              'nature':           (0, 0, 0, 1, 0, 0, 0),
              'exercise':         (0, 0, 0, 0, 1, 0, 0),
              'bonding':          (0, 0, 0, 0, 0, 1, 0),
              'leisure':          (0, 0, 0, 0, 0, 0, 1)}

In [24]:
cats_to_labels = dict()
for k, v in label_to_cats.items():
    cats_to_labels[v] = k

In [25]:
cats_to_labels

{(1, 0, 0, 0, 0, 0, 0): 'achievement',
 (0, 1, 0, 0, 0, 0, 0): 'affection',
 (0, 0, 1, 0, 0, 0, 0): 'enjoy_the_moment',
 (0, 0, 0, 1, 0, 0, 0): 'nature',
 (0, 0, 0, 0, 1, 0, 0): 'exercise',
 (0, 0, 0, 0, 0, 1, 0): 'bonding',
 (0, 0, 0, 0, 0, 0, 1): 'leisure'}

In [26]:
y_train_temp = []
for label in y_train:
    y_train_temp.append(label_to_cats[label])

In [27]:
y_train_temp

[(1, 0, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 0, 1, 0, 0, 0, 0),
 (0, 0, 0, 1, 0, 0, 0),
 (0, 0, 1, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 0, 1, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 0, 0, 1, 0, 0, 0),
 (0, 0, 0, 0, 0, 0, 1),
 (0, 0, 0, 0, 0, 1, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 0, 1, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 1, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 0, 1, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0,

In [28]:
y_test_temp = []
for label in y_test:
    y_test_temp.append(label_to_cats[label])

In [29]:
y_test_temp

[(0, 0, 0, 0, 0, 1, 0),
 (0, 0, 0, 0, 0, 0, 1),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 0, 0, 1, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 0, 1, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 1, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 0, 1, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 1, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 0, 1, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 1, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 0, 0, 1, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 0, 1),
 (0, 0, 0, 0, 0, 0, 1),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 0, 1, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (1, 0, 0, 0, 0, 0, 0),
 (0, 1, 0, 0, 0, 0, 0),
 (0, 0, 1, 0, 0,

In [30]:
y_test.head()

11140        bonding
400          leisure
55682      affection
19962    achievement
8942          nature
Name: predicted_category, dtype: object

In [31]:
def labelize_sentences(sentences, label_type):
    labelized = []
    for i, v in tqdm(enumerate(sentences)):
        label = '%s_%s'%(label_type, i)
        labelized.append(gensim.models.doc2vec.TaggedDocument(v, [label]))
    return labelized

In [32]:
x_train.head()

Unnamed: 0,hmid,tokens
54087,82037,"[working, hard, to, develop, my, android, deve..."
2139,29826,"[my, daughter, started, saying, my, baby, daug..."
50328,78265,"[i, found, out, i, got, an, a, in, one, of, my..."
25845,53658,"[i, got, a, surprise, gift, from, my, manger, .]"
5178,32892,"[i, went, to, a, three, hour, painting, class,..."


In [33]:
w2v_train_corpus = labelize_sentences(x_train.tokens, 'TRAIN')

48256it [00:00, 401196.32it/s]


In [34]:
w2v_test_corpus = labelize_sentences(x_test.tokens, 'TEST')

12065it [00:00, 604923.59it/s]


In [35]:
w2v_train_corpus[0]

TaggedDocument(words=['working', 'hard', 'to', 'develop', 'my', 'android', 'development', 'skills', 'and', 'improve', 'my', 'skill', 'sets', '.'], tags=['TRAIN_0'])

In [36]:
sentence_w2v = Word2Vec(size=200, min_count=3)
sentence_w2v.build_vocab([x.words for x in tqdm(w2v_train_corpus)])
sentence_w2v.train([x.words for x in tqdm(w2v_train_corpus)], total_examples=sentence_w2v.corpus_count, epochs=sentence_w2v.iter)

100%|███████████████████████████████████████████████████████████████████████| 48256/48256 [00:00<00:00, 1592449.52it/s]
100%|███████████████████████████████████████████████████████████████████████| 48256/48256 [00:00<00:00, 2454646.52it/s]
  This is separate from the ipykernel package so we can avoid doing imports until


(3219173, 4941735)

In [37]:
sentence_w2v['working']

  """Entry point for launching an IPython kernel.


array([-0.13627969,  1.0862536 , -1.1832186 ,  0.4868417 , -0.45651573,
        0.2273736 , -1.0656551 ,  0.43759307, -1.0174847 ,  0.25040784,
       -0.37821603,  0.99358493,  0.3007687 , -0.8228288 ,  0.15910502,
       -0.36192277,  0.3156928 ,  0.38336524, -0.74570936, -1.255692  ,
       -0.09377553, -0.72960746, -0.68019855, -0.01661401, -0.3356975 ,
       -0.49957553,  1.4781868 , -0.31053254,  0.2745221 ,  0.47255257,
        0.18497135, -0.18629429,  0.43700284, -0.21834546,  0.19094546,
       -0.18759674,  0.08675408,  0.23170385,  1.1836896 , -0.23421043,
       -0.46637207, -0.69913113, -0.8573378 ,  1.0482931 ,  0.00552048,
       -0.798969  ,  0.31599748, -1.0713449 , -0.24025531, -0.73963904,
        0.63099277,  1.0611733 , -0.9779639 , -0.04044543,  0.92379934,
       -0.1322192 , -0.5472527 , -0.0219525 ,  0.63157874,  0.3639805 ,
       -0.08550104,  0.13490126,  0.8172621 ,  0.18593118,  0.10748109,
       -0.6168246 ,  0.50000685, -0.25596866, -0.5163714 ,  0.25

In [38]:
sentence_w2v.most_similar('good')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('great', 0.8449149131774902),
 ('nice', 0.7065261602401733),
 ('wonderful', 0.6651213765144348),
 ('cool', 0.6419702768325806),
 ('bad', 0.6374666690826416),
 ('productive', 0.6317644119262695),
 ('fantastic', 0.6310884952545166),
 ('enjoyable', 0.6261249780654907),
 ('interesting', 0.6142030954360962),
 ('amazing', 0.6123086214065552)]

In [39]:
sentence_w2v.most_similar('facebook')

  """Entry point for launching an IPython kernel.


[('tinder', 0.7666617035865784),
 ('reddit', 0.7337743639945984),
 ('twitter', 0.7270952463150024),
 ('crush', 0.7212194204330444),
 ('youtube', 0.7147806882858276),
 ('skype', 0.7118726968765259),
 ('clearance', 0.6969553232192993),
 ('horse', 0.6950178146362305),
 ('radio', 0.6880637407302856),
 ('guitar', 0.6875177025794983)]

In [40]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, tools='pan, wheel_zoom, box_zoom, reset, hover, previewsave', x_axis_type=None, y_axis_type=None, min_border=1)

word_vectors = [sentence_w2v[w] for w in list(sentence_w2v.wv.vocab.keys())[:5000]]

from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(sentence_w2v.wv.vocab.keys())[:5000]

plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={'word': '@words'}
show(plot_tfidf)

  


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.040s...
[t-SNE] Computed neighbors for 5000 samples in 9.390s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.136670
[t-SNE] KL divergence after 250 iterations with early exaggeration: 81.509506
[t-SNE] KL divergence after 1000 iterations: 2.323405


In [41]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in w2v_train_corpus])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

In [42]:
len(tfidf)

4017

In [43]:
def build_word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += sentence_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError:
            continue
        break
    
    if count != 0:
        vec /= count
    return vec

In [44]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([build_word_vector(z, 200) for z in tqdm(map(lambda x: x.words, w2v_train_corpus))])
trian_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([build_word_vector(z, 200) for z in tqdm(map(lambda x: x.words, w2v_test_corpus))])
test_vecs_w2v = scale(test_vecs_w2v)

  
48256it [00:00, 67059.10it/s]
  
12065it [00:00, 70930.96it/s]


In [45]:
len(test_vecs_w2v)

12065

In [46]:
len(train_vecs_w2v)

48256

In [47]:
y_train_temp_temp = []
for i in y_train_temp:
    y_train_temp_temp.append(list(i))

In [48]:
y_test_temp_temp = []
for i in y_test_temp:
    y_test_temp_temp.append(list(i))

In [57]:
y_train_temp_temp = np.array(y_train_temp_temp)

In [54]:
y_test_temp_temp = np.array(y_test_temp_temp)

In [55]:
y_test_temp_temp

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [58]:
y_train_temp_temp

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [59]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(7, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(train_vecs_w2v, y_train_temp_temp, epochs=9, batch_size=32, verbose=2)

Epoch 1/9
 - 10s - loss: 0.3289 - acc: 0.8657
Epoch 2/9
 - 6s - loss: 0.3192 - acc: 0.8686
Epoch 3/9
 - 6s - loss: 0.3184 - acc: 0.8689
Epoch 4/9
 - 6s - loss: 0.3185 - acc: 0.8694
Epoch 5/9
 - 6s - loss: 0.3185 - acc: 0.8693
Epoch 6/9
 - 6s - loss: 0.3189 - acc: 0.8696
Epoch 7/9
 - 6s - loss: 0.3195 - acc: 0.8695
Epoch 8/9
 - 6s - loss: 0.3195 - acc: 0.8697
Epoch 9/9
 - 6s - loss: 0.3203 - acc: 0.8695


<keras.callbacks.History at 0x2ca98e59438>

In [60]:
score = model.evaluate(test_vecs_w2v, y_test_temp_temp, batch_size=128, verbose=2)

In [61]:
score

[0.7981301815194606, 0.8579598746801678]