In [1]:
# here we will implement a simple pretrained model to predict a emoji 
# given a sentence. This is a starting level implementation.
# importing required libraries
import numpy as np
import pandas as pd
import emoji as em
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SimpleRNN, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# reading the data via pandas
data = pd.read_csv('./emoji_data.csv', header=None) # as there is no header in data
data[1].value_counts()

2      55
3      43
0      28
4      22
1      18
3       9
0       4
1       1
4       1
0v2     1
2       1
Name: 1, dtype: int64

In [3]:
# as we can see the data have text and numbers. The numbers represents the 
# category of the emoji. TO use them, we need emoji librari to create mapping
# dictionary
# the emoji library has function emojize to print any emoji, such as:
em.emojize(':thumbs_up:')

'👍'

In [4]:
# lets map the integers values in our data to emojies
# lets create a dictionary 
emoji_dict = {
    0:':red_heart:',
    1: ':baseball:',
    2: ':grinning_face_with_big_eyes:',
    3: ':disappointed_face:',
    4: ':fork_and_knife_with_plate:'
}
emoji_dict

{0: ':red_heart:',
 1: ':baseball:',
 2: ':grinning_face_with_big_eyes:',
 3: ':disappointed_face:',
 4: ':fork_and_knife_with_plate:'}

In [5]:
# defing a function to where we will pass the label (key) and it will
# return us the emoji
def label_to_emoji(label):
    return em.emojize(emoji_dict[label])

label_to_emoji(0), label_to_emoji(1), label_to_emoji(3)

('❤️', '⚾', '😞')

In [6]:
# now lets separate the data from labels - here we will use 
# x -  as data and y - as labels
x = data[0].values
y = data[1].values
x, y

(array(['French macaroon is so tasty', 'work is horrible', 'I am upset',
        'throw the ball', 'Good joke',
        'what is your favorite baseball game', 'I cooked meat',
        'stop messing around', 'I want chinese food',
        'Let us go play baseball', 'you are failing this exercise',
        'yesterday we lost again', 'Good job', 'ha ha ha it was so funny',
        'I will have a cheese cake', 'Why are you feeling bad',
        'I want to joke', 'I never said yes for this',
        'the party is cancelled', 'where is the ball', 'I am frustrated',
        'ha ha ha lol', 'she said yes', 'he got a raise',
        'family is all I have', 'he can pitch really well',
        'I love to the stars and back', 'do you like pizza ',
        'You totally deserve this prize', 'I miss you so much',
        'I like your jacket ', 'she got me a present',
        'will you be my valentine', 'you failed the midterm',
        'Who is down for a restaurant', 'valentine day is near',
        

In [7]:
# it is time to use word embedding - I will be using pre-trained word embedding
# glove, introduced by stanford researchers
# please follow this link to download: http://nlp.stanford.edu/data/glove.6B.zip
# once it is downloaded: extract it and it will have four files
# each file represents the embedding with respect to their dimensions
# here, 100d file will be used - lets open it using open()
file = open('./glove.6B/glove.6B.100d.txt', encoding='utf8')
content = file.readlines()
file.close()

In [8]:
content[:50]
# here we can see that against each word there is word vector of 100 dimension

['the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062\n',
 ', -0.10767 0.11053 0.59812 -0.54361 0.67396 0.10663 0.038867 0.35481 0.06351 -0.094189 0.15786 -0.81665 0.14172 0.21939 0.58505 -

In [9]:
# we are going to create a dictionary that will map the word to their 
# corresponding vector
embedding = {}

# lets iterate through each line;
for line in content:
    line = line.split()
    
    # we are going to map the first word against the remaining vector
    embedding[line[0]] = np.array(line[1:], dtype=float)


In [10]:
len(embedding.keys()) # here we can see that we have 400000 words

400000

In [11]:
# next thing we have to preprocess the data - converting the data into input 
# tokens and convert y into one-hot vector
# for tokening the input text, a tokenizer function from keras will be used
# it will convert each specific word from text into a number and number is 
# assigned to a order in which that word is appearing in the dictionary
# lets initiate the tokenizer
tokenizer = Tokenizer()
tokenizer

<keras.preprocessing.text.Tokenizer at 0x1a1a8bd5220>

In [12]:
# lets fit this onto our dataset
tokenizer.fit_on_texts(x)

In [13]:
# lets see the output of above line
word_to_index = tokenizer.word_index
len(word_to_index)

312

In [14]:
# next all words will be converted into a list of tokens
x_tokens = tokenizer.texts_to_sequences(x)
x_tokens

[[103, 104, 3, 6, 105],
 [106, 3, 107],
 [1, 7, 108],
 [109, 4, 35],
 [36, 30],
 [37, 3, 19, 110, 26, 49],
 [1, 111, 112],
 [31, 67, 113],
 [1, 20, 114, 27],
 [115, 68, 38, 69, 26],
 [2, 11, 116, 10, 70],
 [117, 50, 71, 51],
 [36, 39],
 [12, 12, 12, 22, 28, 6, 40],
 [1, 32, 21, 5, 118, 119],
 [120, 11, 2, 121, 41],
 [1, 20, 9, 30],
 [1, 72, 52, 53, 13, 10],
 [4, 122, 3, 123],
 [73, 3, 4, 35],
 [1, 7, 124],
 [12, 12, 12, 54],
 [14, 52, 53],
 [15, 23, 5, 125],
 [126, 3, 127, 1, 21],
 [15, 74, 128, 129, 75],
 [1, 18, 9, 4, 130, 55, 131],
 [29, 2, 24, 132],
 [2, 133, 134, 10, 135],
 [1, 33, 2, 6, 76],
 [1, 24, 19, 136],
 [14, 23, 16, 5, 137],
 [32, 2, 138, 8, 77],
 [2, 139, 4, 140],
 [141, 3, 56, 13, 5, 78],
 [77, 42, 3, 142],
 [43, 6, 79],
 [29, 2, 21, 5, 35],
 [15, 74, 17, 29, 143],
 [15, 80, 26],
 [50, 57, 81, 5, 144, 44, 145],
 [146, 11, 147],
 [15, 3, 5, 36, 148],
 [72, 149, 9, 16, 51],
 [1, 33, 58],
 [27, 3, 34],
 [1, 7, 150, 59],
 [6, 41, 45, 2, 151, 152, 46, 68],
 [29, 2, 20, 9, 15

In [15]:
# here we cann see that all tokens have arbitrary length -  this cannot be used 
# directly in the model. we need a fixed length input
# we will be using padding to transfer it into a fixed length vector - the fixed
# length will be the largest vector of this dataset
maxlen = 0
for data in x_tokens:
    maxlen = max(maxlen, len(data))


In [16]:
maxlen # the max length of our sentence here is:

10

In [17]:
# lets padd the x_tokens and called it training data
x_train = pad_sequences(x_tokens, maxlen=maxlen, padding='post', truncating='post')
x_train[:40]

array([[103, 104,   3,   6, 105,   0,   0,   0,   0,   0],
       [106,   3, 107,   0,   0,   0,   0,   0,   0,   0],
       [  1,   7, 108,   0,   0,   0,   0,   0,   0,   0],
       [109,   4,  35,   0,   0,   0,   0,   0,   0,   0],
       [ 36,  30,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 37,   3,  19, 110,  26,  49,   0,   0,   0,   0],
       [  1, 111, 112,   0,   0,   0,   0,   0,   0,   0],
       [ 31,  67, 113,   0,   0,   0,   0,   0,   0,   0],
       [  1,  20, 114,  27,   0,   0,   0,   0,   0,   0],
       [115,  68,  38,  69,  26,   0,   0,   0,   0,   0],
       [  2,  11, 116,  10,  70,   0,   0,   0,   0,   0],
       [117,  50,  71,  51,   0,   0,   0,   0,   0,   0],
       [ 36,  39,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 12,  12,  12,  22,  28,   6,  40,   0,   0,   0],
       [  1,  32,  21,   5, 118, 119,   0,   0,   0,   0],
       [120,  11,   2, 121,  41,   0,   0,   0,   0,   0],
       [  1,  20,   9,  30,   0,   0,   0,   0,   0,   0

In [18]:
# the x_data is ready to be used for training
# next convert the y into one-hot encoding
# as our data contain some values which we  don't need, we need to clean the
# data
y_ = [(lambda x: x.split()[0][0]) (x) for x in y.tolist()]
y = np.array(y_, dtype=object)
y

array(['4', '3', '3', '1', '2', '1', '4', '3', '4', '1', '3', '3', '2',
       '2', '4', '3', '2', '3', '3', '1', '3', '2', '2', '2', '0', '1',
       '0', '4', '2', '0', '2', '0', '0', '3', '4', '0', '2', '1', '3',
       '1', '0', '4', '0', '3', '0', '4', '2', '3', '4', '2', '2', '3',
       '0', '2', '2', '3', '2', '3', '2', '2', '3', '3', '0', '2', '3',
       '0', '2', '0', '0', '2', '3', '2', '4', '1', '3', '3', '0', '0',
       '3', '2', '0', '3', '0', '2', '2', '4', '2', '2', '0', '0', '2',
       '3', '0', '4', '2', '1', '2', '3', '3', '2', '3', '0', '3', '0',
       '2', '0', '2', '3', '4', '3', '1', '3', '4', '3', '2', '3', '3',
       '3', '1', '4', '4', '2', '2', '1', '1', '2', '3', '2', '3', '4',
       '2', '3', '0', '2', '0', '0', '4', '3', '4', '2', '3', '2', '3',
       '4', '2', '1', '2', '4', '3', '1', '3', '2', '3', '2', '2', '3',
       '3', '2', '4', '0', '0', '0', '3', '0', '0', '1', '1', '2', '2',
       '2', '0', '3', '2', '3', '3', '1', '2', '2', '4', '2', '3

In [19]:
# lets convert it into one-hot, for that, we will be using a builtin 
# function to_categorical
y_train = to_categorical(y)
y_train[:10]

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.]], dtype=float32)

# Model implementation

In [20]:
# here we are going to use pretrained word embeddings - for that we will pass 
# matrix to weights parameters in the models. Lets create that matrix
# here that matrix will have all the words embedding vectors of all the words in
# our dataset
# here the rows will be the words and the columns will be the vectors
embed_size = 100 # as we are using 100 dimension vector

# lets initialize the matrix will zero values
embedding_matrix = np.zeros((len(word_to_index)+1, embed_size))

# lets iterate over all the words in our embedding dict
for word, index in word_to_index.items():
    embed_vector = embedding[word]
    embedding_matrix[index] = embed_vector


In [21]:
embedding_matrix

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [-0.046539,  0.61966 ,  0.56647 , ..., -0.37616 , -0.032502,
         0.8062  ],
       [-0.49886 ,  0.76602 ,  0.89751 , ..., -0.41179 ,  0.40539 ,
         0.78504 ],
       ...,
       [-0.46263 ,  0.069864,  0.69095 , ..., -0.29174 ,  0.32041 ,
         0.21202 ],
       [ 0.073242,  0.11134 ,  0.62281 , ...,  0.53417 , -0.1646  ,
        -0.27516 ],
       [ 0.29019 ,  0.80497 ,  0.31187 , ..., -0.33603 ,  0.45998 ,
        -0.11278 ]])

In [22]:
# lets create the model now
model = Sequential([
    Embedding(input_dim=len(word_to_index)+1,
             output_dim=embed_size,
             input_length=maxlen,
              weights=[embedding_matrix],
              trainable=False
             ),
    # next we can specify simple LSTM or RNN layer - first LSTM
    LSTM(units=16, return_sequences=True), # return_sequence will make sure to 
                                            # return the value at each sequence
     LSTM(units=10, return_sequences=True),
    LSTM(units=4), # we do not have to specify the return_sequence at last layer
    Dense(units=5, activation='softmax'),
    
])

model.compile(optimizer='adam', loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 100)           31300     
                                                                 
 lstm (LSTM)                 (None, 10, 16)            7488      
                                                                 
 lstm_1 (LSTM)               (None, 10, 10)            1080      
                                                                 
 lstm_2 (LSTM)               (None, 4)                 240       
                                                                 
 dense (Dense)               (None, 5)                 25        
                                                                 
Total params: 40,133
Trainable params: 8,833
Non-trainable params: 31,300
_________________________________________________________________


In [24]:
model.fit(x_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1a1a8d6c400>

In [25]:
# lets test the model - I am going to write the sentences to test
test = ['i feel good', 'i feel very bad', 
        'lets eat dinner']
# lets convert the words into padded tokenize sequences
test_sequ = tokenizer.texts_to_sequences(test)
x_test = pad_sequences(test_sequ, maxlen=maxlen, padding='post', 
                      truncating='post')

In [26]:
# lets make the predictions
y_predict = model.predict(x_test)
y_predict



array([[0.23779343, 0.02844701, 0.44332972, 0.27672336, 0.01370649],
       [0.10609159, 0.03752771, 0.41416153, 0.42035443, 0.02186473],
       [0.07844795, 0.11742983, 0.05582644, 0.10224558, 0.6460502 ]],
      dtype=float32)

In [27]:
# # lets get the index where max value is present
y_pred = np.argmax(y_predict, axis=1)
y_pred

array([2, 3, 4], dtype=int64)

In [28]:
# we have our tokens, lets see the emojies now
for i in range(len(test)):
    print(test[i], label_to_emoji(y_pred[i]))

i feel good 😃
i feel very bad 😞
lets eat dinner 🍽️


In [29]:
# this was just an initial level implementation - model is making mistakes
# we can use large dataset to improve the results