In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.models import Sequential, load_model
from sklearn.metrics import accuracy_score, f1_score
import pickle

In [None]:
dataframe_train = pd.read_csv('/content/data_train.csv')
dataframe_test = pd.read_csv('/content/data_test.csv')

In [None]:
text_test = dataframe_test.Text
text_train = dataframe_train.Text

In [None]:
emotion_test = dataframe_test.Emotion
emotion_train = dataframe_train.Emotion

In [None]:
data = pd.concat([dataframe_test,dataframe_train])
data.Emotion.value_counts()

Emotion
joy        2326
sadness    2317
anger      2259
neutral    2254
fear       2171
Name: count, dtype: int64

In [None]:
htmltags = r"(<.*?>)"
urls = r"^https?:\/\/.*[\r\n]*"
punctuation = r"(\\W|\\d)"
hashtags = r"(#[\d\w\.]+)"

def cleaning_tokenization(text):
    text = re.sub(htmltags, ' ', str(text))
    text = re.sub(urls, ' ', str(text))
    text = re.sub(punctuation, ' ', str(text))
    text = re.sub(hashtags, ' ', str(text))
    text = text.strip()

    text = nltk.word_tokenize(str(text))

    return text


In [None]:
!pip install nltk
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Tokenization**

In [None]:
texts = [' '.join(cleaning_tokenization(text)) for text in data.Text]
train_texts = [' '.join(cleaning_tokenization(text)) for text in text_train]
test_texts = [' '.join(cleaning_tokenization(text)) for text in text_test]

tokenizer=Tokenizer()
tokenizer.fit_on_texts(texts)

sequence_test = tokenizer.texts_to_sequences(test_texts)
sequence_train = tokenizer.texts_to_sequences(train_texts)

index_of_words = tokenizer.word_index

#Number of unique words + reserved 0 index for padding
vocabulary_size = len(index_of_words) + 1

print("Number of unique words:",len(index_of_words))

Number of unique words: 12237


Padding

In [None]:
maximum_sequence_length = 500 #Maximum input length
text_test_pad = pad_sequences(sequence_test, maxlen = maximum_sequence_length)
text_train_pad = pad_sequences(sequence_train, maxlen = maximum_sequence_length)

text_train_pad

array([[    0,     0,     0, ...,   120,    51,   350],
       [    0,     0,     0, ...,    37,   277,   156],
       [    0,     0,     0, ...,    16,     2,  1223],
       ...,
       [    0,     0,     0, ...,   873,     4,   905],
       [    0,     0,     0, ...,     1,     6,   117],
       [    0,     0,     0, ..., 12237,   173,    13]], dtype=int32)

In [None]:
encoding = {
    'joy': 0,
    'fear': 1,
    'anger': 2,
    'sadness': 3,
    'neutral': 4
}

#Integer labels
emotion_test = [encoding[i] for i in dataframe_test.Emotion]
emotion_train = [encoding[i] for i in dataframe_train.Emotion]

#to_categorical function --- Converts a class vector (integers) to binary class matrix.
emotion_test = to_categorical(emotion_test)
emotion_train = to_categorical(emotion_train)

emotion_train

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [None]:
embedding_number_dimensions = 300 #number of dimensions for word embedding

def create_embedding_matrix(filepath, word_index, embedding_dimensions):
    '''function for importing pretrained word vectors'''
    print("Please wait...! Embedding in Process...!")
    vocabulary_size = len(word_index) + 1  #Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocabulary_size, embedding_dimensions))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dimensions]
    return embedding_matrix

filename = r"/content/drive/MyDrive/wiki-news-300d-1M.vec"
embedd_matrix = create_embedding_matrix(filename, index_of_words, embedding_number_dimensions)
embedd_matrix.shape

Please wait...! Embedding in Process...!


(12238, 300)

In [None]:
new_words = 0
for word in index_of_words:
    entry = embedd_matrix[index_of_words[word]]
    if all(v == 0 for v in entry):
        new_words = new_words + 1

print("Words found in pretrained wiki word vector: ", str(len(index_of_words) - new_words))
print("New words found: ", str(new_words))

Words found in pretrained wiki word vector:  11449
New words found:  788


In [None]:
embedding_layer = Embedding(vocabulary_size,
                         embedding_number_dimensions,
                         input_length = maximum_sequence_length,
                         weights = [embedd_matrix],
                         trainable=False)

In [None]:
# number_of_categories = 5 #Total number of emotions.
# kernel_size = 3 #Specifying the length of the convolution window.
# filters = 256 #Total number of windows you will have.

# model = Sequential()  #Initializing the neural network
# model.add(embedding_layer)
# #Used relu function to ensure that we don't have negative pixel values during computation.
# model.add(Conv1D(filters, kernel_size, activation='relu')) #Convulation layer
# model.add(GlobalMaxPooling1D()) #reduce the size of the feature map without losing important image information.
# model.add(Dense(256, activation='relu')) #dense is a fully connected layer (receives input from all neurons).
# #activation function(softmax) used in the output layer that predict a multinomial probability distribution.
# model.add(Dense(number_of_categories, activation='softmax'))

# #loss = 'categorical_crossentropy' --- loss function (measure of how good your prediction model does).
# #optimizer = 'adam' --- optimization algorithm.
# #metrics = ['accuracy'] --- specifies the evaluation criteria for the model.
# model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 300)          3671400   
                                                                 
 conv1d (Conv1D)             (None, 498, 256)          230656    
                                                                 
 global_max_pooling1d (Glob  (None, 256)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 256)               65792     
                                                                 
 dense_1 (Dense)             (None, 5)                 1285      
                                                                 
Total params: 3969133 (15.14 MB)
Trainable params: 297733 (1.14 MB)
Non-trainable params: 3671400 (14.01 MB)
_____________

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

number_of_categories = 5  # Total number of emotions.
embedding_dim = 100  # Example embedding dimension, modify as necessary.
input_length = 500  # Adjust based on the actual length of your input sequences.
vocab_size = 20000  # Adjust based on the actual vocabulary size, should cover all indices in the input data.

model = Sequential()  # Initializing the neural network
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length))  # Example embedding layer, modify input_dim as necessary
model.add(LSTM(256, return_sequences=True))  # First LSTM layer with return_sequences=True to return the full sequence
model.add(LSTM(256))  # Second LSTM layer
model.add(Dense(256, activation='relu'))  # Fully connected layer
model.add(Dense(number_of_categories, activation='softmax'))  # Output layer

# Compile the model with the same parameters
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 500, 100)          2000000   
                                                                 
 lstm_8 (LSTM)               (None, 500, 256)          365568    
                                                                 
 lstm_9 (LSTM)               (None, 256)               525312    
                                                                 
 dense_10 (Dense)            (None, 256)               65792     
                                                                 
 dense_11 (Dense)            (None, 5)                 1285      
                                                                 
Total params: 2957957 (11.28 MB)
Trainable params: 2957957 (11.28 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
batch_size = 256 #The number of samples that are passed to the network at once.
epochs = 6 #The number of times a learning algorithm sees the complete dataset.

hist = model.fit(text_train_pad, emotion_train,
                 batch_size=batch_size,
                 epochs=epochs,
                 verbose=1,
                 validation_data=(text_test_pad,emotion_test))

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
emotion_categories = ['joy', 'fear', 'anger', 'sadness', 'neutral']

predictions = model.predict(text_test_pad)
predictions = np.argmax(predictions, axis=1)
predictions = [emotion_categories[pred] for pred in predictions]

#Accuracy is used when the True Positives and True negatives are more important.
print("Accuracy: ", (accuracy_score(dataframe_test.Emotion, predictions) * 100),"%")
#F1-score is used when the False Negatives and False Positives are crucial.
print("\nF1 Score: ", (f1_score(dataframe_test.Emotion, predictions, average='micro') * 100))

Accuracy:  75.92101385204833 %

F1 Score:  75.92101385204833


In [None]:
predictions = model.predict(text_test_pad)



In [None]:
predictions

array([[1.9878355e-01, 1.1830651e-02, 4.0818024e-02, 7.4797338e-01,
        5.9431372e-04],
       [8.3494699e-03, 3.5978928e-01, 6.1070365e-01, 1.8762818e-02,
        2.3946695e-03],
       [1.0219124e-03, 9.2522567e-04, 9.8145562e-01, 1.0315321e-02,
        6.2819566e-03],
       ...,
       [2.0102124e-01, 2.9974026e-03, 1.3070534e-02, 4.2721578e-03,
        7.7863872e-01],
       [1.9106877e-04, 7.2168012e-05, 1.7083503e-03, 9.9712932e-01,
        8.9910434e-04],
       [1.9735522e-03, 9.1056675e-02, 8.8958418e-01, 1.3086226e-02,
        4.2993873e-03]], dtype=float32)

In [None]:
predictions = model.predict(text_test_pad)
predictions = np.argmax(predictions, axis=1)



In [None]:
predictions

array([3, 2, 2, ..., 4, 3, 2])

In [None]:
# predictions = [emotion_categories[pred] for pred in predictions]
predictions

['sadness',
 'anger',
 'anger',
 'fear',
 'sadness',
 'neutral',
 'anger',
 'neutral',
 'neutral',
 'sadness',
 'neutral',
 'joy',
 'fear',
 'anger',
 'fear',
 'joy',
 'sadness',
 'neutral',
 'joy',
 'anger',
 'joy',
 'fear',
 'joy',
 'anger',
 'neutral',
 'fear',
 'neutral',
 'joy',
 'anger',
 'anger',
 'fear',
 'neutral',
 'fear',
 'joy',
 'anger',
 'fear',
 'fear',
 'sadness',
 'sadness',
 'neutral',
 'anger',
 'sadness',
 'anger',
 'anger',
 'neutral',
 'joy',
 'anger',
 'sadness',
 'neutral',
 'anger',
 'anger',
 'fear',
 'joy',
 'joy',
 'anger',
 'sadness',
 'fear',
 'joy',
 'anger',
 'neutral',
 'neutral',
 'fear',
 'joy',
 'anger',
 'anger',
 'fear',
 'neutral',
 'joy',
 'joy',
 'joy',
 'fear',
 'sadness',
 'sadness',
 'joy',
 'neutral',
 'neutral',
 'joy',
 'fear',
 'anger',
 'sadness',
 'joy',
 'sadness',
 'fear',
 'sadness',
 'joy',
 'anger',
 'joy',
 'sadness',
 'joy',
 'anger',
 'anger',
 'anger',
 'anger',
 'joy',
 'joy',
 'anger',
 'anger',
 'fear',
 'fear',
 'neutral',


In [None]:
message = ["I was crying"]

sequence = tokenizer.texts_to_sequences(message)
padded = pad_sequences(sequence, maxlen=maximum_sequence_length)

predict = model.predict(padded)

print("Message:", message)
print("Predicted:", emotion_categories[np.argmax(predict)])

Message: ['I was crying']
Predicted: sadness


In [None]:
model.save("cnn.tflite")

In [None]:
with open("tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
predictor = load_model("cnn.tflite")

In [None]:
with open("tokenizer.pickle", "rb") as handle:
    tokenizer = pickle.load(handle)

In [None]:
emotion_categories = ['joy', 'fear', 'anger', 'sadness', 'neutral']
maximum_sequence_length = 500

message = ["that day i was so scared"]

sequence = tokenizer.texts_to_sequences(message)
padded = pad_sequences(sequence, maximum_sequence_length)
predict = predictor.predict(padded)

print("Message:", message)
print("Predicted:", emotion_categories[np.argmax(predict)])

Message: ['that day i was so scared']
Predicted: fear


In [None]:
tokenizer.word_index

{'i': 1,
 'the': 2,
 'a': 3,
 'to': 4,
 'and': 5,
 'was': 6,
 'my': 7,
 'of': 8,
 'in': 9,
 'when': 10,
 'that': 11,
 'it': 12,
 'me': 13,
 'had': 14,
 'you': 15,
 'for': 16,
 'at': 17,
 'with': 18,
 'not': 19,
 'he': 20,
 'on': 21,
 "'s": 22,
 'is': 23,
 "n't": 24,
 'we': 25,
 'very': 26,
 'she': 27,
 'but': 28,
 'do': 29,
 'her': 30,
 'have': 31,
 'this': 32,
 'about': 33,
 '’': 34,
 'so': 35,
 'as': 36,
 'be': 37,
 'his': 38,
 'did': 39,
 'an': 40,
 'friend': 41,
 'from': 42,
 'what': 43,
 'time': 44,
 'one': 45,
 'by': 46,
 'were': 47,
 'they': 48,
 'out': 49,
 'felt': 50,
 'are': 51,
 'all': 52,
 "'m": 53,
 'up': 54,
 'after': 55,
 'been': 56,
 'there': 57,
 'would': 58,
 'him': 59,
 'no': 60,
 'got': 61,
 'who': 62,
 'could': 63,
 'just': 64,
 'like': 65,
 'because': 66,
 'home': 67,
 'go': 68,
 'some': 69,
 'see': 70,
 'know': 71,
 'our': 72,
 'can': 73,
 'good': 74,
 'day': 75,
 'get': 76,
 'first': 77,
 'how': 78,
 'your': 79,
 'which': 80,
 'am': 81,
 'night': 82,
 'really': 