In [1]:
import numpy as np
import pandas as pd
import emoji

from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Embedding

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# Load the dataset; header=None means the first row is treated as data and
# the columns are referred to by integer position (0, 1) below.
data = pd.read_csv('emoji_data.csv', header = None)
data.head()

Unnamed: 0,0,1
0,French macaroon is so tasty,4
1,work is horrible,3
2,I am upset,3
3,throw the ball,1
4,Good joke,2


In [3]:
# Integer class label -> emoji shortcode understood by emoji.emojize.
emoji_dict = {
    0: ":red_heart:",
    1: ":baseball:",
    2: ":grinning_face_with_big_eyes:",
    3: ":disappointed_face:",
    4: ":fork_and_knife_with_plate:",
    5: ":cold_face:",
}


def label_to_emoji(label):
    """Return the rendered emoji for a class label.

    The label is looked up in ``emoji_dict`` (a ``KeyError`` propagates for
    labels that are not listed) and the resulting shortcode is passed to
    ``emoji.emojize``.
    """
    shortcode = emoji_dict[label]
    return emoji.emojize(shortcode)

In [4]:
# Column 0 is later fed to the tokenizer (the sentences); column 1 is later
# one-hot encoded (the integer labels).
X = data[0].values
Y = data[1].values
# Number of samples.
len(X)

183

# Embeddings

In [5]:
# Read every line of the pretrained vectors file into memory. The context
# manager closes the handle even if reading raises part-way through.
with open('glove/glove.6B.100d.txt', 'r', encoding='utf8') as file:
    content = file.readlines()

# content

In [6]:
# Word -> vector lookup. On each line the first whitespace-separated field is
# the word and the remaining fields are parsed as floats.
embeddings = {}

for raw_line in content:
    fields = raw_line.split()
    word = fields[0]
    embeddings[word] = np.asarray(fields[1:], dtype=float)

In [7]:
# Number of words that have a vector. len() on the dict gives the same count
# without first building a throwaway list of every (word, vector) pair.
len(embeddings)

400001

In [8]:
# Fit a tokenizer on the training sentences to build the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
# word_index maps each word to the integer index used in the token sequences.
word2index = tokenizer.word_index
# Vocabulary size.
print(len(word2index))

312


In [9]:
# Convert every sentence into its list of word indices and preview the first ten.
Xtokens = tokenizer.texts_to_sequences(X)
print(Xtokens[:10])

[[103, 104, 3, 6, 105], [106, 3, 107], [1, 7, 108], [109, 4, 35], [36, 30], [37, 3, 19, 110, 26, 49], [1, 111, 112], [31, 67, 113], [1, 20, 114, 27], [115, 68, 38, 69, 26]]


In [10]:
def get_maxlen(data):
    """Return the length of the longest item in ``data``, or 0 if it is empty."""
    return max((len(sent) for sent in data), default=0)

# Length of the longest tokenised sentence; used below as the padded length
# and as the Embedding layer's input_length.
maxlen = get_maxlen(Xtokens)
print(maxlen)

10


In [11]:
# Pad (or truncate) each sequence at its end so every row has length maxlen,
# then preview the first ten rows.
Xtrain = pad_sequences(
    Xtokens,
    maxlen=maxlen,
    padding='post',
    truncating='post',
)
Xtrain[:10]

array([[103, 104,   3,   6, 105,   0,   0,   0,   0,   0],
       [106,   3, 107,   0,   0,   0,   0,   0,   0,   0],
       [  1,   7, 108,   0,   0,   0,   0,   0,   0,   0],
       [109,   4,  35,   0,   0,   0,   0,   0,   0,   0],
       [ 36,  30,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 37,   3,  19, 110,  26,  49,   0,   0,   0,   0],
       [  1, 111, 112,   0,   0,   0,   0,   0,   0,   0],
       [ 31,  67, 113,   0,   0,   0,   0,   0,   0,   0],
       [  1,  20, 114,  27,   0,   0,   0,   0,   0,   0],
       [115,  68,  38,  69,  26,   0,   0,   0,   0,   0]])

In [12]:
# One-hot encode the integer labels, as required by the
# categorical_crossentropy loss the model is compiled with.
Ytrain = to_categorical(Y)
len(Ytrain)

183

# Model

In [13]:
# Build the Embedding layer's weight matrix: row i holds the pretrained vector
# of the word whose tokenizer index is i. The extra row (+1) leaves index 0,
# the value pad_sequences pads with, as an all-zero vector.
embed_size = 100
embedding_matrix = np.zeros((len(word2index)+1, embed_size))
# Sanity check: rows start out zeroed before being filled in.
print(embedding_matrix[1])
for word, i in word2index.items():
    # A word missing from the pretrained vocabulary keeps its all-zero row
    # instead of aborting the whole cell with a KeyError.
    embed_vector = embeddings.get(word)
    if embed_vector is not None:
        embedding_matrix[i] = embed_vector

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [14]:
# Number of rows in the embedding matrix (vocabulary size + 1).
len(embedding_matrix)

313

In [15]:
# Sentence classifier: frozen pretrained embeddings -> two stacked LSTMs ->
# softmax over the label classes.
model = Sequential([
    Embedding(input_dim = len(word2index) + 1,
              output_dim = embed_size,
              input_length = maxlen,
              weights = [embedding_matrix],
              trainable = False  # keep the pretrained vectors fixed during training
             ),

    # return_sequences=True so the second LSTM receives the whole sequence.
    LSTM(units = 16, return_sequences = True),
    LSTM(units = 4),
    # One output unit per one-hot label column, so the layer always matches
    # the width produced by to_categorical rather than a hard-coded 5.
    Dense(Ytrain.shape[1], activation = 'softmax')
])

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [16]:
# Train on the padded sequences and one-hot labels for 75 epochs.
model.fit(Xtrain, Ytrain, epochs = 75)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


<keras.callbacks.History at 0x21f8bfd17b0>

In [17]:
# Try the trained model on a few hand-written sentences.
test = ["bad", "I feel very bad", "lets eat dinner"]

# Encode and pad the sentences exactly as the training data was prepared.
test_seq = tokenizer.texts_to_sequences(test)
print(test_seq[:10])
Xtest = pad_sequences(
    test_seq,
    maxlen=maxlen,
    padding='post',
    truncating='post',
)
print(Xtest[:10])

# Class probabilities first, then the index of the most likely class per row.
y_pred = model.predict(Xtest)
print(y_pred[:10])
y_pred = y_pred.argmax(axis=1)
print(y_pred[:10])

# Show each sentence next to its predicted emoji.
for sentence, label in zip(test, y_pred):
    print(sentence, label_to_emoji(label))

[[41], [1, 280, 41], [66, 94, 44]]
[[ 41   0   0   0   0   0   0   0   0   0]
 [  1 280  41   0   0   0   0   0   0   0]
 [ 66  94  44   0   0   0   0   0   0   0]]
[[0.08417216 0.03936651 0.00999647 0.8623348  0.00413017]
 [0.08103221 0.03294845 0.0079678  0.87486017 0.0031913 ]
 [0.02435349 0.06356906 0.24619932 0.00263697 0.66324115]]
[3 3 4]
bad 😞
I feel very bad 😞
lets eat dinner 🍽️


In [18]:
# Persist the trained model to disk as network1.h5.
model.save("network1.h5")
