In [1]:
import numpy as np
import pandas as pd
import keras
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.callbacks import ReduceLROnPlateau
import matplotlib.pyplot as plt
import emoji #to show emoji 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_dir = "DATA\\train.csv" 
test_dir = "DATA\\test.csv"
emb_path = "EMBEDDINGS\\glove.6B.50d.txt" # glove file path (embeddings)

In [3]:
train_data = pd.read_csv(train_dir)
test_data = pd.read_csv(test_dir)
print("Columns : ", train_data.columns, "\n")
print(train_data.head(), '\n')
print("Number of classes : ",len(set(train_data["emoji_id"])), "\n")
print("Class Distribution : \n" + str(train_data["emoji_id"].value_counts(normalize = True, sort = False)))
print()

Columns :  Index(['sentence', 'emoji_id'], dtype='object') 

                          sentence  emoji_id
0           never talk to me again         3
1  I am proud of your achievements         2
2   It is the worst day in my life         3
3                 Miss you so much         0
4                     food is life         4 

Number of classes :  5 

Class Distribution : 
0    0.166667
1    0.143939
2    0.287879
3    0.272727
4    0.128788
Name: emoji_id, dtype: float64



## **Defining a dictionary for emojis.**

In [4]:
emoji_dict ={0:":blue_heart:",
             1:":baseball:",
             2:":grinning_face_with_big_eyes:",
             3:":tired_face:",
             4:":fork_and_knife:"}

for i in range(len(emoji_dict)):
    print(emoji.emojize(emoji_dict[i]), end =" ")

💙 ⚾ 😃 😫 🍴 

## **Loading the GloVe word vector embedding file : **

 The function loads the GloVe (word vector) file as a dictionary which contains 400000 words.


In [5]:
# function to load the glove file

def load_embeddings(gloveFile):
    print("Loading Embeddings..........",end="")
    f = open(gloveFile,  "r", encoding="utf8")
    emb = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        emb[word] = embedding
    print("**Done**.")
    print("Total words loaded : ", len(emb))
    return emb

In [6]:
emb = load_embeddings(emb_path)

Loading Embeddings..........**Done**.
Total words loaded :  400000


**DEFINING DICTIONARIES TO STORE WORDS AND INDEX**

In [7]:
word_to_index ,index_to_word = {}, {} 
index = 0
for i in emb:
    word_to_index[i] = index
    index_to_word[index] = i
    index = index + 1

## **LOADING DATA** 

Defining a pipeline to load data, preprocess and convert it to keraas compatible form 

In [8]:
def get_splitted_data(data):
    """Converts sentences into list of words. returns list of list of words."""
    splitted = []
    for line in data:
        words = line.lower().split()
        splitted.append(words)
    return splitted

In [9]:
def get_equal_len_sentences(data, max_len):
    """Converts sentences into equal lengths by padding with 0."""
    n = len(data)
    X_indices = np.zeros((n, max_len))
    for i in range(n):
        wordlist = data[i]
        j = 0 
        for word in wordlist:
            X_indices[i,j] = word_to_index[word]
            j += 1
    return X_indices

In [10]:
def load_data(traindata, testdata, trainlabels, testlabels):
    """Converts data into keras compatible form, returns xtrain, xtest, ytrain, ytest, max_len (length of longest sentence)."""
    
    split_train = get_splitted_data(traindata) #splitting sentences into list of words. 
    split_test = get_splitted_data(testdata)
    
    max_len = len(max(split_train, key=len)) # finding the length of the longest sentence.
    
    indiced_train = get_equal_len_sentences(split_train, max_len)
    indiced_test = get_equal_len_sentences(split_test, max_len)
    
    xtrain, ytrain = np.array(indiced_train), keras.utils.to_categorical(trainlabels)
    
    xtest, ytest = np.array(indiced_test), keras.utils.to_categorical(testlabels)
    
    return xtrain, xtest, ytrain, ytest, max_len

In [11]:
# Converting data from pandas dataframe to lists of data and labels.

sentence_train = train_data["sentence"].values.tolist()
id_train = train_data["emoji_id"].values.tolist()
sentence_test = test_data["sentence"].values.tolist()
id_test = test_data[" emoji_id"].values.tolist()

In [12]:
# using above defined pipeline to load data.

X_train, X_test, Y_train, Y_test, max_len = load_data(sentence_train, sentence_test, id_train, id_test)

##  Creating the model 

In [13]:
# fitting the embeddings into the keras Embedding() layer and setting trainable to False.

def pretrained_embedding_layer():
    
    vocab_len = len(word_to_index) + 1 
    emb_dim = emb["cucumber"].shape[0] 
    
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    for word, index in word_to_index.items():
        emb_matrix[index, :] = emb[word]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    embedding_layer.build((None,))
    
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [14]:
embedding_layer = pretrained_embedding_layer()

Creating the model using the pre trained embedding layer and LSTM

In [15]:
def create_model():
    
    input_shape = (max_len,)
    
    inputs = Input(input_shape, dtype= "int32")
    
    embedding_layer = pretrained_embedding_layer()
    
    embeddings = embedding_layer(inputs)
    
    x = LSTM(128, return_sequences=True)(embeddings)
    x = Dropout(0.5)(x)
    
    x = LSTM(128, return_sequences=True)(x)
    x = Dropout(0.5)(x)
    
    x = LSTM(256, return_sequences=False)(x)
    x = Dropout(0.5)(x)
    
    x = Dense(5)(x)
    
    x = Activation('softmax')(x)
    
    model = Model(inputs=inputs, outputs=x) # model instance
    
    return model

In [17]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 10, 50)            20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 10, 128)           131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               394240    
__________

In [16]:
model = create_model()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Using callbacks - reduce learning rate on plateau.

reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=3, min_lr=0.00001, verbose=1)
model_history = model.fit(X_train, Y_train, epochs = 50, batch_size = 32, shuffle=True, verbose = 1, callbacks=[reduce_lr])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50

Epoch 00042: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 43/50
Epoch 44/50
Epoch 45/50

Epoch 00045: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 46/50
Epoch 47/50
Epoch 48/50

Epoch 00048: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 49/50
Epoch 50/50


## Evaluating on test data

In [18]:
loss, acc = model.evaluate(X_test, Y_test)



In [19]:
print()
print("Test accuracy = ", acc)


Test accuracy =  0.8214285714285714


Making predictions and creating sentences using the trained model.

In [20]:
pred = model.predict(X_test)

In [21]:
# showing sentences with correct predictions
for i in range(len(pred)):
    if np.argmax(pred[i]) == id_test[i]:
        print(sentence_test[i], emoji.emojize(emoji_dict[np.argmax(pred[i])]))        
print()

I want to eat 🍴
he did not answer 😫
he got a very nice raise 😃
she got me a nice present 😃
ha ha ha it was so funny  😃
he is a good friend 😃
I am upset 😫
We had such a lovely dinner tonight 😃
where is the food 🍴
Stop making this joke ha ha ha 😃
where is the ball ⚾
are you serious 😫
Let us go play baseball	 ⚾
This stupid grader is not working 😫
Congratulation for having a baby 😃
stop pissing me off  😫
I boiled rice 🍴
Why are you feeling bad 😫
I am upset 😫
give me the ball  ⚾
My grandmother is the love of my life 💙
enjoy your game  ⚾
valentine day is near  😃
I miss you so much 💙
throw the ball ⚾
My life is so boring 😫
she said yes	  😃
will you be my valentine 😃
he can pitch really well ⚾
dance with me 😃
I am hungry  🍴
See you at the restaurant 🍴
I like to laugh 😃
I will  run  ⚾
I like your jacket  💙
i miss her 💙
what is your favorite baseball game ⚾
Good job 😃
I love you to the stars and back 💙
What you did was awesome 😃
ha ha ha lol 😃
I do not want to joke 😫
you are failing this exercis

**Some sentences that were not correctly predicted**

In [23]:
for i in range(len(pred)):
    if np.argmax(pred[i]) != id_test[i]:
        print("Prediction:",sentence_test[i], emoji.emojize(emoji_dict[np.argmax(pred[i])]), "._____Correct Prediction : ",sentence_test[i], emoji.emojize(emoji_dict[id_test[i]]))

Prediction: work is hard 😃 ._____Correct Prediction :  work is hard 😫
Prediction: This girl is messing with me 💙 ._____Correct Prediction :  This girl is messing with me 😫
Prediction: work is horrible 😃 ._____Correct Prediction :  work is horrible 😫
Prediction: any suggestions for dinner 😃 ._____Correct Prediction :  any suggestions for dinner 🍴
Prediction: I love taking breaks 😫 ._____Correct Prediction :  I love taking breaks 💙
Prediction: you brighten my day 💙 ._____Correct Prediction :  you brighten my day 😃
Prediction: she is a bully 💙 ._____Correct Prediction :  she is a bully 😫
Prediction: go away	 ⚾ ._____Correct Prediction :  go away	 😫
Prediction: yesterday we lost again ⚾ ._____Correct Prediction :  yesterday we lost again 😫
Prediction: family is all I have 😃 ._____Correct Prediction :  family is all I have 💙
