In [1]:
# Dependecy imports
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.layers import SpatialDropout1D, Dropout

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale
from tqdm import tqdm # Progress bar
tqdm.pandas(desc="progress-bar")
import pandas as pd
import gensim
import numpy as np

# Local imports
from data.load_dataset import TRAIN_SET, TEST_SET

Using TensorFlow backend.


## Define the number of max features as 2000 and use Tokenizer to vectorize and convert text into Sequences so the Network can deal with it as input.

In [2]:
max_fatures = 2000 # Top 2000 words

tokenizer = Tokenizer(num_words=max_fatures, split=' ')

# The training phase is by means of the fit_on_texts method and you
# can see the word index using the word_index property:
tokenizer.fit_on_texts(TRAIN_SET['Phrase'].values)

print("\nExamples:")
print('Token index for [story]', tokenizer.word_index['story'])
print('Token index for [comedy]', tokenizer.word_index['comedy'])
print('Token index for [movie]', tokenizer.word_index['movie'])

# texts_to_sequences method turns input into numerical arrays
train_data = tokenizer.texts_to_sequences(TRAIN_SET['Phrase'].values)
test_data = tokenizer.texts_to_sequences(TEST_SET['Phrase'].values)

print("\nExamples:")
print(TRAIN_SET['Phrase'][100], '-->', train_data[100])
print(TRAIN_SET['Phrase'][200], '-->', train_data[200])
print(TRAIN_SET['Phrase'][300], '-->', train_data[300])

# All Phrase numerical values reshape to match size for all
train_data_pad = pad_sequences(train_data)
test_data_pad = pad_sequences(test_data)
print("\nExample")
print(train_data[100], '-->', train_data_pad[100])

print('\nInput train data shape:', train_data_pad.shape)
print('Input test data shape:', test_data_pad.shape)


Examples:
Token index for [story] 40
Token index for [comedy] 60
Token index for [movie] 17

Examples:
would have a hard time sitting through this one  --> [93, 35, 2, 198, 59, 1072, 96, 18, 28]
 trouble every day is a plodding mess  --> [942, 124, 329, 8, 2, 1917, 607]
a source --> [2, 1214]

Example
[93, 35, 2, 198, 59, 1072, 96, 18, 28] --> [   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0   93   35    2  198   59 1072   96   18   28]

Input train data shape: (156060, 45)
Input test data shape: (66292, 46)


## Split Train data into Train and Valid

In [3]:
# One Hot encoding
train_labels = pd.get_dummies(TRAIN_SET['Sentiment']).values
print('Sample labels:')
print(train_labels[0:2])

Sample labels:
[[0 1 0 0 0]
 [0 0 1 0 0]]


## Training Keras 1. attempt

With embeddings __without Word2vec__ so there are no semantic similarity here in embeddings.

### Define Hyperparameters

In [4]:
embed_dim = 128
lstm_out = 196 # Output Neurons
batch_size = 128
drop_out = 0.3

In [5]:
model = Sequential()
model.add(Embedding(max_fatures, embed_dim, input_length=train_data_pad.shape[1]))
model.add(SpatialDropout1D(drop_out))

# LSTMs
model.add(LSTM(lstm_out))
model.add(Dropout(drop_out))

model.add(Dense(5, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 45, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 45, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 196)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 985       
Total params: 511,785
Trainable params: 511,785
Non-trainable params: 0
_________________________________________________________________
None


## You don't have to be genius to spot here an overfitting

No playing with Hyperparameters. Moving along.

In [9]:
model.fit(train_data_pad, train_labels, epochs=5, batch_size=batch_size, verbose=1, validation_split=0.2)

Train on 124848 samples, validate on 31212 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa9f4aece48>

## Training Keras 2. attempt

With Word2vec embeddings. The idea is that instead of mapping sequences of integer numbers to sequences of floats happens in a way which preserves the semantic affinity. There are various pretrained word2vec datasets on the net, we will use Google Word2Vec

### Load Google Word2Vec pretrained embeddings

In [4]:
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True, limit=500000)

## Similarity check

In [5]:
word2vec_model.most_similar('facebook')

[('Facebook', 0.7563532590866089),
 ('FaceBook', 0.7076998949050903),
 ('twitter', 0.6988551616668701),
 ('myspace', 0.6941817402839661),
 ('Twitter', 0.6642444729804993),
 ('Facebook.com', 0.6529868245124817),
 ('FacebookFacebook', 0.6162722110748291),
 ('facebook.com', 0.6135972142219543),
 ('Twitter.com', 0.6102107763290405),
 ('TwitterTwitter', 0.6085205078125)]

In [6]:
word2vec_model.most_similar('apple')

[('apples', 0.7203598022460938),
 ('pear', 0.6450696587562561),
 ('fruit', 0.6410146355628967),
 ('berry', 0.6302294731140137),
 ('pears', 0.6133961081504822),
 ('strawberry', 0.6058261394500732),
 ('peach', 0.6025873422622681),
 ('potato', 0.596093475818634),
 ('grape', 0.5935864448547363),
 ('blueberry', 0.5866668224334717)]

In [7]:
word2vec_model.most_similar('Apple')

[('Apple_AAPL', 0.7456985712051392),
 ('Apple_Nasdaq_AAPL', 0.7300410270690918),
 ('Apple_NASDAQ_AAPL', 0.7175089716911316),
 ('Apple_Computer', 0.7145973443984985),
 ('iPhone', 0.6924266219139099),
 ('Apple_NSDQ_AAPL', 0.6868604421615601),
 ('Steve_Jobs', 0.6758422255516052),
 ('iPad', 0.6580768823623657),
 ('Apple_nasdaq_AAPL', 0.6444970965385437),
 ('Apple_iPad', 0.622774600982666)]

In [8]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([row['Phrase'].split(' ') for _, row in TRAIN_SET.iterrows()])

tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size :', len(tfidf))

vocab size : 11385


In [9]:
def build_word_vector(tokens, size):
    # Given a list of phrase tokens, creates an averaged phrase vector.
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += word2vec_model[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [10]:
n_dim = word2vec_model.vector_size

train_vecs_w2v = np.concatenate(
    [build_word_vector(z, n_dim) for z in tqdm(map(lambda x: x.split(' '), list(TRAIN_SET['Phrase'].values)))])
train_vecs_w2v = scale(train_vecs_w2v)

156060it [00:07, 19715.88it/s]


In [27]:
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=n_dim))
model.add(Dense(5, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                19264     
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 325       
Total params: 19,589
Trainable params: 19,589
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
model.fit(train_vecs_w2v, train_labels, epochs=20, batch_size=32, verbose=1, shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe46d0c0b38>

# Make submission

In [30]:
test_vecs_w2v = np.concatenate(
    [build_word_vector(z, n_dim) for z in tqdm(map(lambda x: x.split(' '), list(TEST_SET['Phrase'].values)))])
test_vecs_w2v = scale(test_vecs_w2v)

66292it [00:03, 20867.45it/s]


In [31]:
test_preds = np.argmax(model.predict(test_vecs_w2v), axis=1)

In [32]:
TEST_SET['Sentiment'] = test_preds

In [33]:
TEST_SET[['PhraseId', 'Sentiment']].to_csv('data/submission.csv', encoding='utf-8', index=False)

.... In Progress