In [1]:
pip install emoji

Collecting emoji
  Downloading emoji-0.5.4.tar.gz (43 kB)
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py): started
  Building wheel for emoji (setup.py): finished with status 'done'
  Created wheel for emoji: filename=emoji-0.5.4-py3-none-any.whl size=42179 sha256=b1f23f8affd452c81b4f51ccbb155bbf7dd079eca4895492de31c1170f792964
  Stored in directory: c:\users\jites\appdata\local\pip\cache\wheels\f6\65\82\d742fe456cd8aa21ffe6c4c1eaeedf3c2d430689811bf328e1
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.5.4
Note: you may need to restart the kernel to use updated packages.


In [3]:
import emoji as emoji

In [5]:
# All emoji's along with their unicode
#emoji.EMOJI_ALIAS_UNICODE

In [7]:
# our emoji dictionary of emojis on which we are going to work
# mapping numeric labels to each emoji unicode

emoji_dictionary = {"0": "\u2764\uFE0F",    # :heart: prints a black instead of red heart depending on the font
                    "1": ":baseball:",
                    "2": ":beaming_face_with_smiling_eyes:",
                    "3": ":downcast_face_with_sweat:",
                    "4": ":fork_and_knife:",
                   }

In [8]:
# emoji.emojize() used to convert an emoji enicode to emoji
emoji.emojize(":fire:")

'🔥'

In [9]:
# iterating over values in our dictionary
for e in emoji_dictionary.values():
    print(emoji.emojize(e))

❤️
⚾
😁
😓
🍴


## Step - 2 Processing the Custom Dataset

In [10]:
import numpy as np
import pandas as pd

In [11]:
train = pd.read_csv('dataset/train_emoji.csv',header=None)
test = pd.read_csv('dataset/test_emoji.csv',header=None)

In [12]:
train.head()

Unnamed: 0,0,1,2,3
0,never talk to me again,3,,
1,I am proud of your achievements,2,,
2,It is the worst day in my life,3,,
3,Miss you so much,0,,[0]
4,food is life,4,,


In [16]:
data = train.values

for i in range(10):
    cur_emoji_unicode = emoji_dictionary[str(data[i][1])]
    print(data[i][0],emoji.emojize(cur_emoji_unicode))

never talk to me again 😓
I am proud of your achievements 😁
It is the worst day in my life 😓
Miss you so much ❤️
food is life 🍴
I love you mum ❤️
Stop saying bullshit 😓
congratulations on your acceptance 😁
The assignment is too long  😓
I want to go play ⚾


In [17]:
print(data.shape)
# we want only 1st 2 columns where 1st column is Sentence and second column is emoji label

(132, 4)


In [18]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [20]:
XT = train[0]
Xt = test[0]

# Converting labels into one hot vector
YT = to_categorical(train[1])
Yt = to_categorical(test[1])

# as for current data we will be using only 5 labels or emojis
print(XT.shape)
print(Xt.shape)
print(YT.shape)
print(Yt.shape)

(132,)
(56,)
(132, 5)
(56, 5)


## Step - 3 Using Pre-trained glove vectors

In [23]:
## MAking our own embeddings vector for each word
embeddings = {}

with open('glove.6B.50d.txt',encoding='utf-8') as f:
    for line in f:
        # to split along the spaces in between
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:],dtype='float32')
        
        #print(word)
        #print(coeffs)
        ## coeffs is vector of float values of length 50 for each word
        embeddings[word] = coeffs

## Step - 4 Embedding layer Output

In [24]:
def getOutputEmbeddings(X):
    
    # 3D matrix of zeros where X.shape[0] gives the batch_size, 10 is maxLen of sentence
    # and 50 is glove_vecor len for each word 
    embedding_matrix_output = np.zeros((X.shape[0],10,50))
    
    # for each sentence
    for ix in range(X.shape[0]):
        
        # to remove spaces in each sentence
        X[ix] = X[ix].split()
        
        # for each word
        for jx in range(len(X[ix])):
            # RHS is a vector of length 50
            embedding_matrix_output[ix][jx] = embeddings[X[ix][jx].lower()]
            
    return embedding_matrix_output

In [25]:
# getting embedding layer output for Training and testing data
emb_XT = getOutputEmbeddings(XT)
emb_Xt = getOutputEmbeddings(Xt)

print(emb_XT.shape)
print(emb_Xt.shape)

(132, 10, 50)
(56, 10, 50)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


## Step - 5 Creating Our Stacked LSTM model

In [26]:
from keras.layers import *
from keras.models import Sequential

In [27]:
model = Sequential()

# adding first LSTM layer which will accept output of embedding
# return_seq is true as it is going to return its outputs to the LSTM layer above it
model.add(LSTM(64,input_shape=(10,50),return_sequences=True))
model.add(Dropout(0.4))

# adding second LSTM layer, return_seq = false for this
model.add(LSTM(64,input_shape=(10,50)))
model.add(Dropout(0.3))

# adding a dense layer along with activation to predixt the labels
model.add(Dense(5))
model.add(Activation('softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 10, 64)            29440     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 64)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 325       
_________________________________________________________________
activation_1 (Activation)    (None, 5)                 0         
Total params: 62,789
Trainable params: 62,789
Non-trainable params: 0
__________________________________________________

In [28]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['acc'])
model.fit(emb_XT,YT,batch_size=32,epochs=40,shuffle=True,validation_split=0.1)

Train on 118 samples, validate on 14 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.callbacks.History at 0x263f2330688>

In [29]:
# Accuracy is 99% and validation acc = 64%

In [30]:
# How the model performs on testing data
model.evaluate(emb_Xt,Yt)



[1.7839371306555611, 0.5892857313156128]

## Step - 6 Making Predictions

In [31]:
pred = model.predict_classes(emb_Xt)

In [32]:
for i in range(30):
    print(' '.join(Xt[i]))
    
    result = emoji.emojize(emoji_dictionary[str(np.argmax(Yt[i]))]) ## argmax as Yt is one hot vector
    predicted = emoji.emojize(emoji_dictionary[str(pred[i])])
    
    print(predicted,result)


I want to eat
🍴 🍴
he did not answer
😓 😓
he got a raise
😁 😁
she got me a present
😁 ❤️
ha ha ha it was so funny
😁 😁
he is a good friend
😁 ❤️
I am upset
😓 ❤️
We had such a lovely dinner tonight
😁 ❤️
where is the food
🍴 🍴
Stop making this joke ha ha ha
😁 😁
where is the ball
⚾ ⚾
work is hard
😁 😓
This girl is messing with me
❤️ 😓
are you serious ha ha
😓 😁
Let us go play baseball
⚾ ⚾
This stupid grader is not working
😓 😓
work is horrible
😁 😓
Congratulation for having a baby
😁 😁
stop messing around
😓 😓
any suggestions for dinner
🍴 🍴
I love taking breaks
😓 ❤️
you brighten my day
❤️ 😁
I boiled rice
🍴 🍴
she is a bully
😁 😓
Why are you feeling bad
😓 😓
I am upset
😓 😓
I worked during my birthday
😁 😓
My grandmother is the love of my life
❤️ ❤️
enjoy your break
⚾ 😁
valentine day is near
😁 ❤️
