In [3]:
import numpy as np
import pandas as pd
from numpy import array
from numpy import argmax
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, GRU, Bidirectional, MaxPooling1D, Conv1D, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform, he_uniform, zeros
from keras.optimizers import SGD, Adam, Adadelta
from keras import regularizers
from keras.layers.normalization import BatchNormalization
from keras import metrics
from sklearn.externals import joblib

In [4]:
def import_textFile(filePath):
    ds_text = []
    with open(filePath,'rb') as f:
        for idx,ln in enumerate(f):
            decoded=False
            line=''
            for cp in ('cp1252', 'cp850','utf-8','utf8'):
                try:
                    line = ln.decode(cp)
                    decoded=True
                    break
                except UnicodeDecodeError:
                    pass
            if decoded:
                ds_text.insert(idx, line.rstrip())
    return ds_text

In [5]:
emo=5

In [6]:
if emo == 2: 
    dataset_path = '../Data/Datasets/Binary Classification/'
else:
    dataset_path = '../Data/Datasets/Multiclass Classification/'
    
train_ds = pd.read_csv(dataset_path+'train.csv', sep=",", header=None,index_col = False)
test_ds = pd.read_csv(dataset_path+'test.csv', sep=",", header=None,index_col = False)

x_train = train_ds[0]
y_train = train_ds[1]

x_test = test_ds[0]
y_test = test_ds[1]

In [7]:
maxLen = len(max(x_train, key=len).split())
print(maxLen)

28


In [8]:
def get_one_hot(y):
    one_hot_temp_array = array(y)
    one_hot_array = to_categorical(one_hot_temp_array)
    return one_hot_array
# invert encoding
#inverted = argmax(encoded[0])

In [9]:
#load glove diembedding_dict = dict()
embedding_dict = joblib.load('../Data/rnn_embedding_dict.pkl')
glove_words = joblib.load('../Data/rnn_glove_words.pkl')
number_to_word = glove_words
word_to_number = dict((word,idx) for idx,word in enumerate(glove_words))

In [10]:
def get_indices(input_x, word_to_number, max_len):
    m = input_x.shape[0]
    x_indices = np.zeros((m,max_len))

    for i in range(m):
        word_list =input_x.iloc[i][0].lower().split()
        for idx,word in enumerate(word_list):
            if (idx ==32):
                continue
            x_indices[i, idx] = word_to_number[word]
    return x_indices

In [11]:
def get_embedding_layer(embedding_dict, word_to_number):
    
    emb_shape = embedding_dict["cucumber"].shape[0]
    total_words = len(word_to_number) + 1
    
    emb_matrix = np.zeros((total_words,emb_shape))
    
    for word,idx in word_to_number.items():
        emb_matrix[idx, :] = embedding_dict[word]

    embedding_layer = Embedding(total_words, emb_shape, trainable = True)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [12]:
def senti_model(input_shape,embedding_dict, word_to_number):
    #np.random.seed(1)
    sentence_indices = Input(shape = input_shape, dtype = np.int32)
    embedding_layer = get_embedding_layer(embedding_dict, word_to_number)
    
    embeddings = embedding_layer(sentence_indices)   
    print(embeddings)
    X = Bidirectional(GRU(64,return_sequences=True))(embeddings)
    X = Dense(16, activation='elu')(X)
    X = Bidirectional(GRU(64,return_sequences=False))(X)
    X = Dense(64, activation='elu')(X)
    X = Dense(emo, activation='softmax', name='fc')(X)



    #X = Activation('softmax')(X)
    
    model = Model(inputs = sentence_indices, outputs = X, name='sentiment')
    
    return model

In [13]:
model = senti_model((maxLen,), embedding_dict, word_to_number)
model.summary()

Tensor("embedding_1/embedding_lookup/Identity:0", shape=(?, 28, 50), dtype=float32)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 28)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 28, 50)            20000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 28, 128)           44160     
_________________________________________________________________
dense_1 (Dense)              (None, 28, 16)            2064      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               31104     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
__________________________________________________________

In [14]:
sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
adam = Adam(lr=0.0001)
adadelta = Adadelta(lr=1.00, rho=0.95, epsilon=None, decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer=adadelta, metrics=[metrics.categorical_accuracy])

In [15]:
x_train_indices = get_indices(x_train, word_to_number, maxLen)
y_train_oh = get_one_hot(y_train)
print(y_train_oh.shape)

(2809, 5)


In [16]:
model.fit(x_train_indices, y_train_oh, epochs = 200, batch_size = 64, shuffle=True)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200


Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200


Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x1eb83615128>

In [17]:
x_test_indices = get_indices(x_test, word_to_number, max_len = maxLen)
y_test_oh = get_one_hot(y_test)
loss, acc = model.evaluate(x_test_indices, y_test_oh)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.3094983991780174


In [None]:
def get_emotion(value):
    unique_labels = ["Happy","Surprise","Anger","Fear","Sadness"]
    return unique_labels[value]

In [None]:
C = 4
y_test_array = y_test.values
y_test_oh = np.eye(C)[y_test_array.reshape(-1)]
X_test_indices = get_indices(x_test, word_to_number, maxLen)
pred = model.predict(x_test_indices)
for i in range(len(x_test)):
    x = x_test_indices
    num = np.argmax(pred[i])
    if(num != y_test_array[i]):
        print('Expected '+ get_emotion(y_test_array[i]) + ' prediction: '+ get_emotion(num) + ' ' + x_test[i] )