In [1]:
import numpy as np
import pandas as pd
import emoji

from keras.layers import LSTM, Dense, SimpleRNN, Dropout
from keras.models import Sequential
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
# Read the train and test splits. The CSV files have no header row, so the
# resulting columns are addressed by integer position (0, 1, ...).
train, test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('train_emoji.csv', 'test_emoji.csv')
)

In [3]:
# Class index -> emoji alias string; the position in the list is the class id.
emoji_dict = dict(enumerate([
    ":heart:",
    ":baseball:",
    ":smile:",
    ":disappointed:",
    ":fork_and_knife:",
]))

In [4]:
# Print every class index next to the emoji its alias renders to.
for class_id, alias in emoji_dict.items():
    print(class_id, emoji.emojize(alias, use_aliases=True))

0 ❤
1 ⚾
2 😄
3 😞
4 🍴


In [5]:
# Column 0 is used as the model input text and column 1 as the target label,
# for both the train and the test frame.
X_train, Y_train = train[0], train[1]
X_test, Y_test = test[0], test[1]

In [6]:
# Sanity check: inputs and labels of the training split should have the same length.
tuple(column.shape for column in (X_train, Y_train))

((132,), (132,))

In [7]:
# Sanity check: inputs and labels of the test split should have the same length.
tuple(column.shape for column in (X_test, Y_test))

((56,), (56,))

In [8]:
# Tokenise every sentence on whitespace, giving one list of word strings per row.
#
# The tokens are derived from the raw frames (`train[0]` / `test[0]`) rather
# than from `X_train` / `X_test` themselves, so this cell is idempotent:
# re-running it no longer fails with "'list' object has no attribute 'split'"
# because the names it assigns are not also its inputs.
X_train = [sentence.split() for sentence in train[0]]
X_test = [sentence.split() for sentence in test[0]]

In [9]:
# One-hot encode the training labels for the softmax / categorical
# cross-entropy setup used by the models below.
#
# * Reads the raw label column (`train[1]`) instead of `Y_train`, so running
#   the cell twice does not one-hot encode an already one-hot matrix.
# * `num_classes` is pinned to the number of emoji classes so the encoding
#   width always matches the 5-unit output layer, even if the highest class
#   id happens to be absent from the training file.
Y_train = np_utils.to_categorical(train[1], num_classes=len(emoji_dict))

In [10]:
# Parse the pre-trained GloVe file into {word: float32 vector}.
# Every non-empty line has the form "<word> <c1> <c2> ... <cN>".
embeddings_index = {}

# * `with` closes the file even if a line fails to parse.
# * The encoding is given explicitly: the vocabulary contains non-ASCII
#   tokens, so relying on the platform default encoding can raise
#   UnicodeDecodeError (e.g. on Windows).
with open('glove.6B.50d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')

In [11]:
# Length (in tokens) of the longest sentence across both splits;
# stays 0 when both splits are empty.
maxLen = max(
    (len(tokens) for corpus in (X_train, X_test) for tokens in corpus),
    default=0,
)

In [12]:
# Display the longest sentence length; it is the time-step dimension of the
# embedding tensors built in the next cell.
maxLen

10

In [13]:
def embed_sentences(sentences, embeddings, max_len, dim):
    """Turn tokenised sentences into a zero-padded tensor of word vectors.

    Parameters
    ----------
    sentences : list of list of str
        One token list per sentence; no sentence may exceed `max_len` tokens.
    embeddings : dict
        Maps a lower-cased word to its vector of length `dim`.
    max_len : int
        Number of time steps (second axis) in the output.
    dim : int
        Size of each word vector (third axis) in the output.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(sentences), max_len, dim). Positions past the end
        of a sentence, and tokens missing from `embeddings`, are all-zero.
    """
    tensor = np.zeros((len(sentences), max_len, dim))
    for row, tokens in enumerate(sentences):
        for col, token in enumerate(tokens):
            vector = embeddings.get(token.lower())
            # Out-of-vocabulary words keep their zero row instead of raising
            # KeyError, so unseen words do not abort the whole cell.
            if vector is not None:
                tensor[row, col] = vector
    return tensor


# Vector size is taken from the loaded embeddings rather than hard-coded.
embedding_dim = len(next(iter(embeddings_index.values())))

# Use the measured `maxLen` as the padded sequence length for both splits.
embedding_matrix_train = embed_sentences(X_train, embeddings_index, maxLen, embedding_dim)
embedding_matrix_test = embed_sentences(X_test, embeddings_index, maxLen, embedding_dim)

In [14]:
# Show the (samples, time steps, vector size) shape of both embedded splits.
print(*(tensor.shape for tensor in (embedding_matrix_train, embedding_matrix_test)))

(132, 10, 50) (56, 10, 50)


In [15]:
# Baseline classifier: two stacked SimpleRNN layers with dropout, followed by
# a 5-way softmax (one unit per emoji class).
#
# Only the first layer declares `input_shape`; Keras infers the input of every
# later layer from the previous layer's output, so repeating it on the second
# recurrent layer had no effect. The commented-out third recurrent layer was
# dead code and has been dropped.
model = Sequential([
    # return_sequences=True hands the full sequence of hidden states to the
    # next recurrent layer.
    SimpleRNN(128, input_shape=(10, 50), return_sequences=True),
    Dropout(0.5),
    # return_sequences=False keeps only the final hidden state for the classifier.
    SimpleRNN(128, return_sequences=False),
    Dropout(0.5),
    Dense(5, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 10, 128)           22912     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 128)               32896     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 645       
Total params: 56,453
Trainable params: 56,453
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Multi-class setup: Adam optimiser, categorical cross-entropy on the one-hot
# labels, with accuracy reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Train the SimpleRNN model for 50 epochs on shuffled mini-batches of 32.
model.fit(
    x=embedding_matrix_train,
    y=Y_train,
    batch_size=32,
    epochs=50,
    shuffle=True,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x126a83400>

In [18]:
# Predicted class id per test sentence.
# `Sequential.predict_classes` is deprecated and absent from newer Keras
# releases; for a softmax output it is exactly the argmax over the predicted
# class probabilities, which works on old and new versions alike.
y_preds = np.argmax(model.predict(embedding_matrix_test), axis=-1)

In [19]:
# Test accuracy of the SimpleRNN model: fraction of predictions equal to the label.
is_correct = y_preds == Y_test
print(np.mean(is_correct))

0.5535714285714286


In [20]:
# Second classifier: same layout as the SimpleRNN model above, but with LSTM
# cells in place of the plain recurrent units.
#
# Only the first layer declares `input_shape`; Keras infers the input of every
# later layer from the previous layer's output, so repeating it on the second
# LSTM had no effect.
model = Sequential([
    # return_sequences=True hands the full sequence of hidden states to the
    # next recurrent layer.
    LSTM(128, input_shape=(10, 50), return_sequences=True),
    Dropout(0.5),
    # return_sequences=False keeps only the final hidden state for the classifier.
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(5, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_3 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 645       
Total params: 223,877
Trainable params: 223,877
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Same training configuration as the SimpleRNN run, so the two test
# accuracies are comparable.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [22]:
# Train the LSTM model for 50 epochs on shuffled mini-batches of 32.
model.fit(
    x=embedding_matrix_train,
    y=Y_train,
    batch_size=32,
    epochs=50,
    shuffle=True,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x126ad16a0>

In [23]:
# Predicted class id per test sentence for the LSTM model.
# `Sequential.predict_classes` is deprecated and absent from newer Keras
# releases; for a softmax output it is exactly the argmax over the predicted
# class probabilities, which works on old and new versions alike.
y_preds = np.argmax(model.predict(embedding_matrix_test), axis=-1)

In [24]:
# Test accuracy of the LSTM model: fraction of predictions equal to the label.
is_correct = y_preds == Y_test
print(np.mean(is_correct))

0.6607142857142857
