In [1]:
import pandas as pd
import numpy as np
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
tqdm.pandas(desc='progress-bar')

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
def read():
    df = pd.read_csv('D:/Datasets/hackerearth/hm_train.csv')
    df.drop(['num_sentence'], axis=1, inplace=True)
    labels = df.predicted_category
    df.drop(['predicted_category'], axis=1, inplace=True)
    return df, labels

In [3]:
df, labels = read()
df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm
0,27673,24h,I went on a successful date with someone I fel...
1,27674,24h,I was happy when my son got 90% marks in his e...
2,27675,24h,I went to the gym this morning and did yoga.
3,27676,24h,We had a serious talk with some friends of our...
4,27677,24h,I went with grandchildren to butterfly display...


In [4]:
print(len(df), len(labels))

60321 60321


In [5]:
def tokenize(sentence):
    tok = nltk.word_tokenize(sentence.lower())
    tokens = [w for w in tok if w not in stopwords.words('english')]
#     for w in tok:
#         if w not in stopwords.words('english'):
#             tokens.append(w)
    return tokens

In [6]:
def postprocess(data):
    data['tokens'] = data['cleaned_hm'].progress_map(tokenize)
    data.reset_index(inplace=True)
    return data

In [7]:
df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm
0,27673,24h,I went on a successful date with someone I fel...
1,27674,24h,I was happy when my son got 90% marks in his e...
2,27675,24h,I went to the gym this morning and did yoga.
3,27676,24h,We had a serious talk with some friends of our...
4,27677,24h,I went with grandchildren to butterfly display...


In [8]:
df2 = postprocess(df)

progress-bar: 100%|█████████████████████████████████████████████████████████████| 60321/60321 [04:05<00:00, 245.77it/s]


In [9]:
df2.tail()

Unnamed: 0,index,hmid,reflection_period,cleaned_hm,tokens
60316,60316,88299,3m,I got together with my best friend and baked c...,"[got, together, best, friend, baked, cupcakes,..."
60317,60317,88300,3m,I went to a restaurant with friends,"[went, restaurant, friends]"
60318,60318,88301,3m,The other day on Mechanical Turk I made over f...,"[day, mechanical, turk, made, fifty, dollars, .]"
60319,60319,88302,3m,Finished the semester today and aced majority ...,"[finished, semester, today, aced, majority, te..."
60320,60320,88303,3m,An event that made me happy in the past 3 mont...,"[event, made, happy, past, 3, months, went, me..."


In [10]:
len(df2)

60321

In [11]:
df_to_train = df2.drop(['index', 'cleaned_hm'], axis=1)

In [12]:
df_to_train.head()

Unnamed: 0,hmid,reflection_period,tokens
0,27673,24h,"[went, successful, date, someone, felt, sympat..."
1,27674,24h,"[happy, son, got, 90, %, marks, examination]"
2,27675,24h,"[went, gym, morning, yoga, .]"
3,27676,24h,"[serious, talk, friends, flaky, lately, ., und..."
4,27677,24h,"[went, grandchildren, butterfly, display, croh..."


In [13]:
df_to_train['24h'] = pd.to_numeric(df['reflection_period'] == '24h')
df_to_train['3m'] = pd.to_numeric(df['reflection_period'] == '3m')

In [14]:
df_to_train.head()

Unnamed: 0,hmid,reflection_period,tokens,24h,3m
0,27673,24h,"[went, successful, date, someone, felt, sympat...",True,False
1,27674,24h,"[happy, son, got, 90, %, marks, examination]",True,False
2,27675,24h,"[went, gym, morning, yoga, .]",True,False
3,27676,24h,"[serious, talk, friends, flaky, lately, ., und...",True,False
4,27677,24h,"[went, grandchildren, butterfly, display, croh...",True,False


In [15]:
df_to_train.drop(['reflection_period'], axis=1, inplace=True)

In [16]:
df_to_train.head()

Unnamed: 0,hmid,tokens,24h,3m
0,27673,"[went, successful, date, someone, felt, sympat...",True,False
1,27674,"[happy, son, got, 90, %, marks, examination]",True,False
2,27675,"[went, gym, morning, yoga, .]",True,False
3,27676,"[serious, talk, friends, flaky, lately, ., und...",True,False
4,27677,"[went, grandchildren, butterfly, display, croh...",True,False


In [17]:
x_train, x_test, y_train, y_test = train_test_split(df_to_train, labels, test_size=0.2)
x_train.head()

Unnamed: 0,hmid,tokens,24h,3m
60088,88070,"[able, plan, vacation, summer]",False,True
39751,67644,"[happiest, day, past, three, months, wedding, ...",False,True
25149,52957,"[dog, walk, politely, next, walk, ,, without, ...",True,False
13826,41583,"[first, nice, ,, long, phone, conversation, hi...",True,False
37980,65865,"[decided, begin, new, business, .]",False,True


In [18]:
x_test.head()

Unnamed: 0,hmid,tokens,24h,3m
11836,39581,"[bought, ton, games, steam, got, cheap, .]",True,False
48495,76426,"[happy, wife, pleased, baby, shower, parents, ...",False,True
6532,34252,"[friend, 's, threw, surprise, birthday, dinner...",True,False
47650,75576,"[excited, passed, ged, science, test, ,, one, ...",False,True
57493,85464,"[nintendo, switch, finally, finished, repaired...",False,True


In [19]:
y_train.head()

60088    achievement
39751      affection
25149      affection
13826        bonding
37980    achievement
Name: predicted_category, dtype: object

In [20]:
y_test.head()

11836    achievement
48495      affection
6532         bonding
47650    achievement
57493    achievement
Name: predicted_category, dtype: object

In [21]:
label_to_cats = {'achievement':      (1, 0, 0, 0, 0, 0, 0),
              'affection':        (0, 1, 0, 0, 0, 0, 0),
              'enjoy_the_moment': (0, 0, 1, 0, 0, 0, 0),
              'nature':           (0, 0, 0, 1, 0, 0, 0),
              'exercise':         (0, 0, 0, 0, 1, 0, 0),
              'bonding':          (0, 0, 0, 0, 0, 1, 0),
              'leisure':          (0, 0, 0, 0, 0, 0, 1)}

In [22]:
cats_to_labels = dict()
for k, v in label_to_cats.items():
    cats_to_labels[v] = k

cats_to_labels

{(1, 0, 0, 0, 0, 0, 0): 'achievement',
 (0, 1, 0, 0, 0, 0, 0): 'affection',
 (0, 0, 1, 0, 0, 0, 0): 'enjoy_the_moment',
 (0, 0, 0, 1, 0, 0, 0): 'nature',
 (0, 0, 0, 0, 1, 0, 0): 'exercise',
 (0, 0, 0, 0, 0, 1, 0): 'bonding',
 (0, 0, 0, 0, 0, 0, 1): 'leisure'}

In [23]:
y_train_temp = []
for label in y_train:
    y_train_temp.append(label_to_cats[label])
y_train_temp = np.array(y_train_temp)

In [24]:
y_test_temp = []
for label in y_test:
    y_test_temp.append(label_to_cats[label])
y_test_temp = np.array(y_test_temp)

In [25]:
y_train_temp[0:5]

array([[1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0]])

In [26]:
def labelized_sentences(sentences, label_type):
    labelized = []
    for i, v in tqdm(enumerate(sentences)):
        label = '%s_%s'%(label_type, i)
        labelized.append(gensim.models.doc2vec.TaggedDocument(v, [label]))
    return labelized

In [27]:
x_train.head()

Unnamed: 0,hmid,tokens,24h,3m
60088,88070,"[able, plan, vacation, summer]",False,True
39751,67644,"[happiest, day, past, three, months, wedding, ...",False,True
25149,52957,"[dog, walk, politely, next, walk, ,, without, ...",True,False
13826,41583,"[first, nice, ,, long, phone, conversation, hi...",True,False
37980,65865,"[decided, begin, new, business, .]",False,True


In [28]:
w2v_train_corpus = labelized_sentences(x_train.tokens, 'TRAIN')
w2v_test_corpus = labelized_sentences(x_test.tokens, 'TEST')
w2v_train_corpus[0]

48256it [00:00, 402463.96it/s]
12065it [00:00, 597297.96it/s]


TaggedDocument(words=['able', 'plan', 'vacation', 'summer'], tags=['TRAIN_0'])

In [29]:
w2v_test_corpus[0]

TaggedDocument(words=['bought', 'ton', 'games', 'steam', 'got', 'cheap', '.'], tags=['TEST_0'])

In [30]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('D:/Datasets/embeddings/Word2Vec/GoogleNews-vectors-negative300.bin', binary=True)

In [31]:
w2v_model['working']

array([-9.08203125e-02,  1.07421875e-01,  5.55419922e-03,  3.39355469e-02,
       -1.81884766e-02,  2.17773438e-01, -1.44653320e-02, -1.60156250e-01,
       -1.43554688e-01, -1.03515625e-01, -1.12304688e-02,  1.43554688e-01,
       -3.47656250e-01, -1.02050781e-01, -1.85546875e-01, -1.98242188e-01,
        2.95410156e-02,  2.26562500e-01,  1.84570312e-01, -3.90625000e-02,
       -1.63085938e-01,  1.25976562e-01, -9.17968750e-02, -1.09863281e-01,
        1.31835938e-01, -2.91748047e-02, -6.73828125e-02,  1.57470703e-02,
       -1.43554688e-01, -8.30078125e-03,  8.69140625e-02, -8.20312500e-02,
        1.62109375e-01, -6.25000000e-02,  1.27929688e-01, -3.46679688e-02,
        1.19140625e-01, -1.46484375e-01,  9.42382812e-02, -9.81445312e-02,
        3.03955078e-02,  1.11328125e-01, -1.51367188e-01, -1.22558594e-01,
       -8.98437500e-02,  6.54296875e-02, -1.27929688e-01, -2.03857422e-02,
       -9.91210938e-02,  1.33056641e-02,  1.60156250e-01, -1.68945312e-01,
        1.56250000e-01, -

In [32]:
w2v_model['working'].shape

(300,)

In [33]:
w2v_model.most_similar('get')

  if np.issubdtype(vec.dtype, np.int):


[('getting', 0.7506691217422485),
 ('got', 0.724482536315918),
 ('gets', 0.6428857445716858),
 ('gotten', 0.6259447336196899),
 ('Getting', 0.620659351348877),
 ('go', 0.5898032188415527),
 ('come', 0.5817450284957886),
 ('give', 0.5797204971313477),
 ('bring', 0.5531654357910156),
 ('toget', 0.5505292415618896)]

In [None]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, tools='pan, wheel_zoom, box_zoom, reset, hover, previewsave', x_axis_type=None, y_axis_type=None, min_border=1)

word_vectors = [w2v_model[w] for w in list(w2v_model.wv.vocab.keys())[:5000]]

from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(w2v_model.wv.vocab.keys())[:5000]

plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={'word': '@words'}
show(plot_tfidf)

In [34]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in w2v_train_corpus])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

In [35]:
len(tfidf)

3882

In [36]:
def build_word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += w2v_model[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError:
            continue
        break
        
    if count != 0:
        vec /= count
        
    return vec

In [37]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([build_word_vector(z, 300) for z in tqdm(map(lambda x: x.words, w2v_train_corpus))])
train_vecs_w2v

48256it [00:00, 84097.51it/s]


array([[ 0.7494992 ,  0.27325493, -0.04489188, ...,  0.02207997,
        -0.08734398, -0.71827012],
       [-0.13685019, -1.26823545,  1.17067897, ..., -1.03518367,
         0.51488191,  0.45797393],
       [ 0.25559095, -0.11136463, -0.86170661, ...,  2.07393789,
        -1.77209723,  1.10999489],
       ...,
       [ 1.39628327,  0.93085551, -0.65583003, ..., -1.26934838,
        -0.2281805 ,  0.28258115],
       [ 0.28805965,  0.85929656,  0.0935787 , ..., -0.90486532,
         0.5761193 ,  0.14077491],
       [-0.40288058,  0.93445909, -0.15108021, ..., -1.5667578 ,
         1.15828168, -0.36930719]])

In [38]:
train_vecs_w2v.shape

(48256, 300)

In [39]:
train_vecs_w2v

array([[ 0.7494992 ,  0.27325493, -0.04489188, ...,  0.02207997,
        -0.08734398, -0.71827012],
       [-0.13685019, -1.26823545,  1.17067897, ..., -1.03518367,
         0.51488191,  0.45797393],
       [ 0.25559095, -0.11136463, -0.86170661, ...,  2.07393789,
        -1.77209723,  1.10999489],
       ...,
       [ 1.39628327,  0.93085551, -0.65583003, ..., -1.26934838,
        -0.2281805 ,  0.28258115],
       [ 0.28805965,  0.85929656,  0.0935787 , ..., -0.90486532,
         0.5761193 ,  0.14077491],
       [-0.40288058,  0.93445909, -0.15108021, ..., -1.5667578 ,
         1.15828168, -0.36930719]])

In [40]:
train_vecs_w2v = scale(train_vecs_w2v)

In [41]:
test_vecs_w2v = np.concatenate([build_word_vector(z, 300) for z in tqdm(map(lambda x: x.words, w2v_test_corpus))])
test_vecs_w2v

12065it [00:00, 78481.68it/s]


array([[ 7.60316968e-01, -2.46769533e-01, -3.97943676e-01, ...,
        -5.69126129e-01,  8.40350270e-01, -9.78185534e-01],
       [-1.41234673e-03,  4.35999751e-01,  4.38242871e-03, ...,
        -9.63718966e-02,  2.41926685e-01, -8.24145854e-01],
       [ 2.69515276e-01, -8.14122081e-01,  5.83639979e-01, ...,
        -8.25274408e-01, -7.52784088e-02,  4.05202299e-01],
       ...,
       [ 7.70528615e-01,  1.28036946e-01,  6.82863712e-01, ...,
         1.48568988e+00,  3.71422470e-01, -1.68408945e-01],
       [ 1.80797458e-01, -9.02864337e-01, -4.89612997e-01, ...,
        -3.48119348e-01,  2.14486435e-01, -3.86300176e-01],
       [ 4.61556911e-01,  9.85292494e-01,  4.51990992e-01, ...,
        -9.03981984e-01,  1.00920737e+00,  1.23759434e-01]])

In [42]:
test_vecs_w2v.shape

(12065, 300)

In [43]:
test_vecs_w2v = scale(test_vecs_w2v)

In [44]:
test_vecs_w2v

array([[ 0.83205233, -0.59921931, -0.51772348, ..., -0.19585362,
         0.74514248, -1.19515317],
       [-0.20689871,  0.2207956 ,  0.01627443, ...,  0.39311527,
        -0.07252069, -0.97228112],
       [ 0.16262958, -1.28061719,  0.78510925, ..., -0.51496947,
        -0.50593764,  0.80639916],
       ...,
       [ 0.84598037, -0.14907179,  0.91680656, ...,  2.36408687,
         0.10441741, -0.02352929],
       [ 0.04162404, -1.38719781, -0.639394  , ...,  0.07948208,
        -0.110014  , -0.33878485],
       [ 0.42456234,  0.88050342,  0.61037467, ..., -0.61302531,
         0.97586238,  0.39919403]])

In [45]:
y_train_temp.shape

(48256, 7)

In [46]:
y_test_temp.shape

(12065, 7)

In [47]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(256, activation='relu', input_dim=300))
model.add(Dense(64, activation='relu'))
model.add(Dense(7, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Using TensorFlow backend.


In [48]:
model.fit(train_vecs_w2v, y_train_temp, epochs=50, batch_size=32, verbose=2)

Epoch 1/50
 - 11s - loss: 1.1975 - acc: 0.5628
Epoch 2/50
 - 6s - loss: 1.1329 - acc: 0.5848
Epoch 3/50
 - 6s - loss: 1.1166 - acc: 0.5883
Epoch 4/50
 - 6s - loss: 1.1065 - acc: 0.5900
Epoch 5/50
 - 6s - loss: 1.0986 - acc: 0.5929
Epoch 6/50
 - 6s - loss: 1.0926 - acc: 0.5948
Epoch 7/50
 - 6s - loss: 1.0879 - acc: 0.5957
Epoch 8/50
 - 6s - loss: 1.0838 - acc: 0.5957
Epoch 9/50
 - 6s - loss: 1.0801 - acc: 0.5971
Epoch 10/50
 - 6s - loss: 1.0776 - acc: 0.5984
Epoch 11/50
 - 6s - loss: 1.0751 - acc: 0.5983
Epoch 12/50
 - 6s - loss: 1.0729 - acc: 0.5988
Epoch 13/50
 - 6s - loss: 1.0710 - acc: 0.5979
Epoch 14/50
 - 6s - loss: 1.0701 - acc: 0.5990
Epoch 15/50
 - 6s - loss: 1.0683 - acc: 0.5993
Epoch 16/50
 - 6s - loss: 1.0658 - acc: 0.6000
Epoch 17/50
 - 6s - loss: 1.0665 - acc: 0.6000
Epoch 18/50
 - 6s - loss: 1.0640 - acc: 0.5995
Epoch 19/50
 - 6s - loss: 1.0623 - acc: 0.6008
Epoch 20/50
 - 9s - loss: 1.0622 - acc: 0.6006
Epoch 21/50
 - 7s - loss: 1.0622 - acc: 0.6006
Epoch 22/50
 - 7s - l

<keras.callbacks.History at 0x1d83282a2e8>

In [49]:
score = model.evaluate(test_vecs_w2v, y_test_temp, batch_size=128, verbose=2)

In [50]:
score

[1.2503834371590485, 0.5794446747183454]

In [51]:
test_vecs_w2v[0]

array([ 0.83205233, -0.59921931, -0.51772348, -0.16589996, -0.14741352,
       -0.82477273,  0.21765645,  0.13409694, -0.14435001,  0.76337472,
        0.48967698,  0.09309331, -1.22230343, -0.78939625, -0.53343333,
        0.27814496, -0.97011009,  0.63620514, -1.2384566 ,  1.12556867,
       -0.28476472,  0.02098426,  0.53966281,  0.63569531,  0.7694971 ,
        1.41100995,  0.16671678,  0.31786638, -1.19357326, -0.26515389,
       -0.03073085,  1.05673118, -1.31727795,  0.90833595,  0.87915298,
       -0.07936188, -0.12579027, -0.75448046, -1.40570544,  0.1469871 ,
        0.25894568,  1.57641788, -0.4285263 , -0.98642763, -0.98029355,
       -0.82321378,  0.21288488,  0.92144931, -0.02775956, -0.07847759,
        0.86689448, -1.28859155,  0.74765666, -1.96643197, -0.11101571,
       -0.23948975, -0.03901057,  0.01047148,  0.0276817 , -0.40149277,
        0.44431782,  0.82129388,  1.09868271,  1.58525537, -0.06256438,
       -1.23332338, -0.56399116, -1.16781984,  0.2680573 ,  0.01

In [52]:
w2v_model['able']

array([ 0.1875    ,  0.06835938, -0.01123047,  0.17871094, -0.20703125,
       -0.10644531,  0.1953125 ,  0.03222656,  0.19335938, -0.12597656,
       -0.01269531, -0.02282715, -0.29492188, -0.20507812, -0.20605469,
       -0.11328125,  0.16992188, -0.03222656,  0.11474609, -0.02111816,
       -0.09375   ,  0.00179291,  0.13574219, -0.05029297, -0.08251953,
       -0.24707031, -0.05883789, -0.08203125, -0.13085938, -0.07617188,
        0.140625  , -0.00878906,  0.10351562, -0.28515625,  0.01239014,
        0.12890625,  0.08984375, -0.10839844, -0.00218201, -0.14550781,
        0.05200195, -0.02246094,  0.13183594, -0.09960938,  0.08203125,
       -0.10742188, -0.02844238,  0.06079102, -0.03857422, -0.21679688,
        0.00952148, -0.02941895, -0.10546875,  0.09716797, -0.01940918,
        0.0402832 , -0.01281738, -0.06103516,  0.30078125, -0.00497437,
        0.11181641, -0.13867188, -0.15527344, -0.09423828,  0.04736328,
        0.06982422, -0.06884766,  0.26953125, -0.0088501 , -0.02

In [53]:
tfidf['able']

3.997329171349713

In [54]:
model.save('D:/Datasets/hackerearth/models/word2vec-d256-d64-e50.h5')

In [None]:
temp_word_vector = build_word_vector(['able'], 300)

In [None]:
temp_word_vector = scale(temp_word_vector)

In [None]:
temp_word_vector

In [55]:
df_test = pd.read_csv('D:/Datasets/hackerearth/hm_test.csv')
df_test.drop(['num_sentence'], axis=1, inplace=True)

df_test.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm
0,88305,3m,I spent the weekend in Chicago with my friends.
1,88306,3m,We moved back into our house after a remodel. ...
2,88307,3m,My fiance proposed to me in front of my family...
3,88308,3m,I ate lobster at a fancy restaurant with some ...
4,88309,3m,I went out to a nice restaurant on a date with...


In [56]:
len(df_test)

40213

In [57]:
df_test2 = postprocess(df_test)

progress-bar: 100%|█████████████████████████████████████████████████████████████| 40213/40213 [02:29<00:00, 268.50it/s]


In [58]:
df_test2.tail()

Unnamed: 0,index,hmid,reflection_period,cleaned_hm,tokens
40208,40208,128762,24h,My husband announced he is getting a decent bo...,"[husband, announced, getting, decent, bonus, q..."
40209,40209,128763,24h,Had a can of Pepsi to drink.,"[pepsi, drink, .]"
40210,40210,128764,24h,Cuddling with my girlfriend last night.,"[cuddling, girlfriend, last, night, .]"
40211,40211,128765,24h,I had a great meeting yesterday at work with m...,"[great, meeting, yesterday, work, boss, collea..."
40212,40212,128766,24h,I had a great workout last night.,"[great, workout, last, night, .]"


In [59]:
len(df_test2)

40213

In [60]:
df_to_test = df_test2.drop(['index', 'cleaned_hm'], axis=1)

In [61]:
df_to_test.head()

Unnamed: 0,hmid,reflection_period,tokens
0,88305,3m,"[spent, weekend, chicago, friends, .]"
1,88306,3m,"[moved, back, house, remodel, ., lived, hotel,..."
2,88307,3m,"[fiance, proposed, front, family, beginning, m..."
3,88308,3m,"[ate, lobster, fancy, restaurant, friends, .]"
4,88309,3m,"[went, nice, restaurant, date, wife, ., popula..."


In [62]:
df_to_test['24h'] = pd.to_numeric(df_test['reflection_period'] == '24h')
df_to_test['3m'] = pd.to_numeric(df_test['reflection_period'] == '3m')
df_to_test.head()

Unnamed: 0,hmid,reflection_period,tokens,24h,3m
0,88305,3m,"[spent, weekend, chicago, friends, .]",False,True
1,88306,3m,"[moved, back, house, remodel, ., lived, hotel,...",False,True
2,88307,3m,"[fiance, proposed, front, family, beginning, m...",False,True
3,88308,3m,"[ate, lobster, fancy, restaurant, friends, .]",False,True
4,88309,3m,"[went, nice, restaurant, date, wife, ., popula...",False,True


In [63]:
df_to_test.tail()

Unnamed: 0,hmid,reflection_period,tokens,24h,3m
40208,128762,24h,"[husband, announced, getting, decent, bonus, q...",True,False
40209,128763,24h,"[pepsi, drink, .]",True,False
40210,128764,24h,"[cuddling, girlfriend, last, night, .]",True,False
40211,128765,24h,"[great, meeting, yesterday, work, boss, collea...",True,False
40212,128766,24h,"[great, workout, last, night, .]",True,False


In [64]:
w2v_finaltest_corpus = labelized_sentences(df_to_test.tokens, 'FTEST')

40213it [00:00, 665670.31it/s]


In [65]:
w2v_finaltest_corpus[0]

TaggedDocument(words=['spent', 'weekend', 'chicago', 'friends', '.'], tags=['FTEST_0'])

In [66]:
t_vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
t_matrix = t_vectorizer.fit_transform([x.words for x in w2v_finaltest_corpus])
t_tfidf = dict(zip(t_vectorizer.get_feature_names(), t_vectorizer.idf_))

In [67]:
len(t_tfidf)

3410

In [68]:
ftest_vecs_w2v = np.concatenate([build_word_vector(z, 300) for z in tqdm(map(lambda x: x.words, w2v_finaltest_corpus))])

40213it [00:00, 75851.58it/s]


In [69]:
ftest_vecs_w2v

array([[ 1.37737083,  0.87410074,  0.18011773, ..., -0.22117397,
         0.99064749, -0.02797785],
       [-0.09140459,  1.82964754, -0.75301826, ..., -0.77479976,
         0.87126076, -0.34383684],
       [-0.83390683, -2.70347214, -1.27776051, ..., -1.04910862,
         0.38332814, -0.42367849],
       ...,
       [ 0.46073484, -1.47598243, -1.52490997, ...,  0.20182632,
        -1.33735418,  2.72363615],
       [ 0.31513256,  0.91324127, -0.12487395, ..., -0.73745304,
        -0.09164569, -0.62597758],
       [ 0.31513256,  0.91324127, -0.12487395, ..., -0.73745304,
        -0.09164569, -0.62597758]])

In [70]:
ftest_vecs_w2v.shape

(40213, 300)

In [71]:
preds = model.predict(ftest_vecs_w2v, batch_size=128)

In [72]:
preds.shape

(40213, 7)

In [73]:
preds[0]

array([0.23947774, 0.9933524 , 0.18618575, 0.10845323, 0.09324333,
       0.29971734, 0.2727763 ], dtype=float32)

In [77]:
categories = []
for pred in preds:
    dummy = np.zeros((7,))
    dummy[np.argmax(pred)] = 1
    categories.append(cats_to_labels[tuple(dummy)])

In [78]:
categories = np.array(categories)

In [79]:
categories.shape

(40213,)

In [80]:
df_predictions = df_test.copy()

In [82]:
df_predictions.drop(['index', 'reflection_period', 'cleaned_hm', 'tokens'], axis=1, inplace=True)

In [83]:
df_predictions.head()

Unnamed: 0,hmid
0,88305
1,88306
2,88307
3,88308
4,88309


In [84]:
df_predictions['predicted_category'] = categories

In [89]:
df_predictions.to_csv('D:/Datasets/hackerearth/sumbission_word2vec-d256-d64-e50.csv', index=False)

In [87]:
df_predictions.set_index('hmid')

Unnamed: 0_level_0,predicted_category
hmid,Unnamed: 1_level_1
88305,affection
88306,achievement
88307,affection
88308,enjoy_the_moment
88309,affection
88310,leisure
88311,achievement
88312,affection
88313,affection
88314,affection
