In [None]:
# Mount Google Drive so files stored there are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
import os
# Switch to the data folder so the relative file names used later resolve against it
# (presumably where Emotions.txt and the GloVe file live -- confirm).
os.chdir("/content/drive/MyDrive/DATA")
# List the folder contents as a quick check of what is available.
!ls

In [1]:
import numpy as np
import pandas as pd

# Loading and Cleaning Data

In [2]:
# Parse the ';'-separated file into two explicitly named columns: the raw text and its emotion category.
df = pd.read_csv('Emotions.txt' ,sep=';', names=['text', 'category'], index_col=False)


In [3]:
# Preview the first rows to check that both columns were parsed as expected.
df.head()

Unnamed: 0,text,category
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
# Class distribution: number of samples per emotion category.
# In the recorded run the classes are imbalanced (joy 5362 vs. surprise 572).
df.category.value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: category, dtype: int64

In [5]:
# Distinct category names, in the order pandas first encounters them in the column.
possible_labels = df['category'].unique()
print(possible_labels)

['sadness' 'anger' 'love' 'surprise' 'fear' 'joy']


In [6]:
# Assign every category name an integer id equal to its position in possible_labels.
label_dict = {name: position for position, name in enumerate(possible_labels)}

print(label_dict)

{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}


In [7]:
# Encode each category name as its integer id.
# Series.map does a direct dictionary lookup and yields an integer column.
# Series.replace with a str -> int mapping only becomes numeric through implicit
# downcasting, which recent pandas releases deprecate.
df['label'] = df['category'].map(label_dict)
df.head(100)

Unnamed: 0,text,category,label
0,i didnt feel humiliated,sadness,0
1,i can go from feeling so hopeless to so damned...,sadness,0
2,im grabbing a minute to post i feel greedy wrong,anger,1
3,i am ever feeling nostalgic about the fireplac...,love,2
4,i am feeling grouchy,anger,1
...,...,...,...
95,i feel like throwing away the shitty piece of ...,sadness,0
96,im starting to feel wryly amused at the banal ...,joy,5
97,i find every body beautiful and only want peop...,joy,5
98,i hear are owners who feel victimized by their...,sadness,0


In [8]:
# The text category is now redundant with the integer label, so drop that column.
df = df.drop(columns=['category'])


In [9]:
# Check that only the text and its integer label remain.
df.head(20)

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
5,ive been feeling a little burdened lately wasn...,0
6,ive been taking or milligrams or times recomme...,3
7,i feel as confused about life as a teenager or...,4
8,i have been with petronas for years i feel tha...,5
9,i feel romantic too,2


# Training/Validation Split

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions the same in both splits, and the fixed random_state makes
# the split repeatable across runs.
x_train, x_val, y_train, y_val = train_test_split(df['text'], 
                                                  df['label'], 
                                                  test_size=0.20, 
                                                  random_state=17, 
                                                  stratify = df['label'])

In [12]:
# Convert the four pandas Series produced by the split into plain NumPy arrays.
x_train, x_val, y_train, y_val = (
    split.to_numpy() for split in (x_train, x_val, y_train, y_val)
)

In [13]:
# Spot-check one raw training sentence.
x_train[2]

'i feel sarcastic more often than not'

In [14]:
# Size of the training split.
x_train.shape

(12800,)

In [15]:
# Peek at the integer-encoded training labels.
print(y_train)

[0 5 1 ... 0 0 0]


# Text Preprocessing 

In [16]:
import string
import re
import os
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hemant./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/hemant./nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# English stopword list; process_text filters tokens against it.
stopwords_english = stopwords.words('english')

from nltk.stem import PorterStemmer
# NOTE: a stemmer is created here, but process_text does not currently apply it.
stemmer = PorterStemmer()

In [18]:
def process_text(text):
    '''
    Clean a raw text string and split it into a list of tokens.

    The cleaning steps run in this order: stock tickers, a leading "RT"
    marker, hyperlinks, hash signs, dates, any remaining digits and
    double slashes are removed; the text is then tokenized and stopwords
    and punctuation tokens are dropped. Stemming is not applied.

    Input:
        text: a string containing a text
    Output:
        text_clean: a list of words containing the processed text

    '''
    # remove stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)
    # remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove hyperlinks; only the URL itself is dropped, so any words that
    # follow it on the same line are kept
    text = re.sub(r'https?://\S+', '', text)
    # remove hashtags
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', text)
    # remove the dates like Mar 30 2013; this has to run BEFORE the digits
    # are stripped, otherwise the pattern can never match
    text = re.sub(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2}\s\d{4}', ' ', text)
    # remove the remaining digits
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'//', '', text)
    # tokenize texts
    text_tokens = word_tokenize(text)

    # keep every token that is neither a stopword nor punctuation
    text_clean = [
        word for word in text_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return text_clean

# LSTMs in Keras

In [None]:
!pip install keras
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

Collecting keras
  Using cached Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Collecting scipy>=0.14
  Using cached scipy-1.7.1.tar.gz (36.1 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25lerror
[31m    ERROR: Command errored out with exit status 1:
     command: /Users/hemant./miniforge3/bin/python3.9 /Users/hemant./miniforge3/lib/python3.9/site-packages/pip/_vendor/pep517/in_process/_in_process.py prepare_metadata_for_build_wheel /var/folders/1_/5jmrsdkn5zjfcj8_nmbnx2t80000gn/T/tmp__zyailg
         cwd: /private/var/folders/1_/5jmrsdkn5zjfcj8_nmbnx2t80000gn/T/pip-install-_e40ps5k/scipy_73ae5b76b2a9489ea8eb6be348e00978
    Complete output (183 lines):
    Running from SciPy source directory.
    Running scipy/linalg/_generate_pyx.py
    Running scipy/special/_generate_pyx.py
    Running scipy/stats/_generate_pyx.py
    Processing scipy/cluster/_vq.pyx
    Processing scipy/cluster/_op

[?25h  Using cached scipy-1.7.0.tar.gz (36.1 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25l|

In [None]:
## Upload the word embedding file to Google Drive before running the cells below
def read_glove_vecs(glove_file):
    """
    Load word vectors from a whitespace-separated text file.

    Each non-empty line is expected to hold a word followed by the
    components of its vector.

    Arguments:
        glove_file: path of the embedding text file

    Returns:
        words: set of all words found in the file
        words_to_index: dict mapping each word to its 1-based rank in the
            sorted vocabulary (index 0 is never assigned)
        index_to_words: inverse mapping of words_to_index
        word_to_vec_map: dict mapping each word to its float64 NumPy vector
    """
    word_to_vec_map = {}
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding (embedding vocabularies commonly contain
    # non-ASCII tokens).
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing on fields[0].
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words = set(word_to_vec_map)

    # Indices start at 1, which leaves 0 unassigned.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(words), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words, words_to_index, index_to_words, word_to_vec_map

In [None]:
# Load the pretrained GloVe vectors together with the word <-> index lookup tables.
words, word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')

In [None]:
def text_to_indices(X, word_to_index, max_len):
    """
    Encode an array of raw texts as fixed-length rows of vocabulary indices.

    Each text is cleaned and tokenized with process_text. Token number j
    (for j < max_len) is written to column j of the text's row when the
    token is a key of word_to_index. Columns of out-of-vocabulary tokens
    and columns past the end of the text stay 0.

    Arguments:
        X: array of strings, indexed along its first axis
        word_to_index: dict mapping a word to its vocabulary index
        max_len: number of columns in the output; later tokens are ignored

    Returns:
        X_indices: NumPy array of shape (X.shape[0], max_len)
    """
    m = X.shape[0]

    X_indices = np.zeros((m, max_len))

    for i in range(m):
        processed_text = process_text(X[i])
        # Only the first max_len tokens fit, so stop there instead of
        # walking the whole token list.
        for j, w in enumerate(processed_text[:max_len]):
            # Look the word up in the mapping passed as an argument rather
            # than in a module-level vocabulary.
            index = word_to_index.get(w)
            if index is not None:
                X_indices[i, j] = index

    return X_indices

In [None]:
# Encode the training sentences as rows of 20 vocabulary indices each.
x_train_indices = text_to_indices(x_train, word_to_index, max_len = 20 )

In [None]:
# Expect one row per training sentence and max_len columns.
x_train_indices.shape

(12800, 20)

In [None]:
# Inspect one encoded sentence; zeros mark unused (or out-of-vocabulary) positions.
x_train_indices[45]

array([385595., 219372.,  90668.,      0.,      0.,      0.,      0.,
            0.,      0.,      0.,      0.,      0.,      0.,      0.,
            0.,      0.,      0.,      0.,      0.,      0.])

In [None]:
def convert_to_one_hot(Y, C):
    """
    One-hot encode integer class labels.

    Arguments:
        Y: NumPy array of integer labels; it is flattened before encoding
        C: number of classes, i.e. the width of every one-hot row

    Returns:
        array with one row per label and C columns, holding 1.0 in the
        column given by the label and 0.0 elsewhere
    """
    flat_labels = Y.reshape(-1)
    # Row k of the identity matrix is the one-hot vector for class k.
    identity = np.eye(C)
    return identity[flat_labels]


In [None]:
# One-hot encode the training labels over the 6 emotion classes.
y_train_oh = convert_to_one_hot(y_train, 6)

In [None]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Build a frozen Keras Embedding layer initialised with pretrained vectors.

    Arguments:
        word_to_vec_map: dict mapping a word to its vector (NumPy array)
        word_to_index: dict mapping a word to its row index in the
            embedding matrix

    Returns:
        embedding_layer: a non-trainable Embedding layer whose weights are
            the pretrained vectors

    Raises:
        ValueError: if word_to_vec_map is empty
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    # One extra row so that index len(word_to_index) is addressable; row 0
    # stays all zeros unless some word maps to index 0.
    vocab_len = len(word_to_index) + 1
    # Read the dimensionality from any stored vector instead of relying on
    # one specific word being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]
    emb_matrix = np.zeros((vocab_len, emb_dim))

    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)
    # The layer has to be built before its weights can be set.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [None]:
def lstm_model(input_shape, word_to_vec_map, word_to_index):
    """
    Assemble the emotion classifier: embedding -> two stacked LSTMs -> softmax.

    Arguments:
        input_shape: shape of one input sample, passed to Input
        word_to_vec_map: dict mapping a word to its pretrained vector
        word_to_index: dict mapping a word to its vocabulary index

    Returns:
        model: an uncompiled Keras Model with a 6-way softmax output
    """
    # Integer word indices enter the graph here.
    sentence_indices = Input(input_shape, dtype = 'int32')

    # Look the indices up in the pretrained embedding matrix.
    embeddings = pretrained_embedding_layer(word_to_vec_map, word_to_index)(sentence_indices)

    # First LSTM returns the full sequence so the second LSTM can consume it.
    hidden = LSTM(128, return_sequences=True)(embeddings)
    hidden = Dropout(0.5)(hidden)

    # Second LSTM keeps only its final output.
    hidden = LSTM(128, return_sequences=False)(hidden)
    hidden = Dropout(0.5)(hidden)

    # Project to the 6 classes and normalise with softmax.
    scores = Dense(6)(hidden)
    probabilities = Activation('softmax')(scores)

    return Model(inputs = sentence_indices, outputs = probabilities)

In [None]:
# Build the model for inputs of max_len word indices and print its layer summary.
max_len= 20
model = lstm_model((max_len,), word_to_vec_map, word_to_index)
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 20)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 20, 50)            20000050  
_________________________________________________________________
lstm_4 (LSTM)                (None, 20, 128)           91648     
_________________________________________________________________
dropout_4 (Dropout)          (None, 20, 128)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 774 

In [None]:
# Categorical cross-entropy matches the one-hot labels and the softmax output layer.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 50 epochs on the training split; no validation data is passed to fit,
# so the validation split is only evaluated after training.
model.fit(x_train_indices, y_train_oh, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f562f7ec610>

In [None]:
# Encode the validation split the same way as the training data
# (same max_len, same number of classes), then measure loss and accuracy on it.
X_val_indices = text_to_indices(x_val, word_to_index, max_len = 20)
Y_val_oh = convert_to_one_hot(y_val, C = 6)
loss, acc = model.evaluate(X_val_indices, Y_val_oh)

print("validation accuracy = ", acc)

validation accuracy =  0.8565624952316284


# Test your own sentence 

In [None]:
## Predict emotion
input_string = np.array(['i am getting butterflies in my stomach'])
# Encode with the same max_len the model was built with.
X = text_to_indices(input_string, word_to_index, max_len = max_len)
# Index of the highest-probability class for the single input sentence.
prediction = int(np.argmax(model.predict(X)))
# Reuse the mapping created during data cleaning instead of a hard-coded copy,
# so the decoded name always agrees with the ids the model was trained on.
labels = dict(label_dict)
index_to_label = {value: key for key, value in labels.items()}
print("predicted emotion is: " + str(index_to_label[prediction]))

predicted emotion is: fear
