In [2]:
import pandas as pd 
import numpy as np
import json 
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding, Activation
from tensorflow import keras
import tensorflow as tf 
from keras.callbacks import EarlyStopping
import tensorflow_hub as hub 
import tensorflow_text as text
from tensorflow.keras import optimizers

In [3]:
df = pd.read_excel(r'C:\Users\Mrulay\OneDrive - University of Windsor\uWindsor\COMP 8700 - Intro to AI\Project\data\emotionDetection\train.xlsx', index_col=False)

In [4]:
stop = stopwords.words('english')

def preProcessText(text):
    text = text.lower()
    text = text.replace(r'[^\w\s]+', '') 
    text = text.replace('@', '')
    text = ' '.join(word.lower() for word in text.split() if word not in stop)
    return text

In [6]:
df.Sentences = df.Sentences.apply(preProcessText)

In [7]:
df.Sentences = df.Sentences.apply(gensim.utils.simple_preprocess)

In [9]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=6 ,
)

In [11]:
model.build_vocab(df.Sentences, progress_per=1000)

In [12]:
token = Tokenizer(7229)
token.fit_on_texts(df['Sentences'])
text = token.texts_to_sequences(df['Sentences'])
text = pad_sequences(text, 200)
print(text[:2])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [15]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['Emotion'])
y = keras.utils.to_categorical(y)
y[:2]

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)

In [40]:
x_train, x_test, y_train, y_test = train_test_split(text, y, test_size=0.2, stratify=y)

In [17]:
def gensim_to_keras_embedding(model, train_embeddings=False):
    """Get a Keras 'Embedding' layer with weights set from Word2Vec model's learned word embeddings.

    Parameters
    ----------
    train_embeddings : bool
        If False, the returned weights are frozen and stopped from being updated.
        If True, the weights can / will be further updated in Keras.

    Returns
    -------
    `keras.layers.Embedding`
        Embedding layer, to be used as input to deeper network layers.

    """
    keyed_vectors = model.wv  # structure holding the result of training
    weights = keyed_vectors.vectors  # vectors themselves, a 2D numpy array    
    index_to_key = keyed_vectors.index_to_key  # which row in `weights` corresponds to which word?

    layer = Embedding(
        input_dim=weights.shape[0],
        output_dim=weights.shape[1],
        weights=[weights],
        trainable=train_embeddings,
    )
    return layer

In [18]:
es = EarlyStopping(
    monitor='accuracy',
    patience=5,
    min_delta = 0.1,
    mode='max'
)

In [20]:
keras_model = tf.keras.Sequential()
keras_model.add(gensim_to_keras_embedding(model, train_embeddings=True))
keras_model.add(tf.keras.layers.SpatialDropout1D(0.2))
keras_model.add(tf.keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
keras_model.add(tf.keras.layers.Dense(6, activation='softmax'))

METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

learning_rate = 0.0001
optimizer = optimizers.Adam(learning_rate)
    
keras_model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=METRICS)

keras_model.fit(x_train, y_train, batch_size=64, epochs=30, verbose=1, callbacks=[es])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30


<keras.callbacks.History at 0x21bea634fa0>

In [22]:
keras_model.evaluate(x_test, y_test)



[0.3148401975631714,
 0.8925523161888123,
 0.7869762778282166,
 0.4871875047683716]

In [44]:
P = 0.7869762778282166
R = 0.4872
f1_score = 2*P*R/(P+R)

In [45]:
f1_score

0.6018238594292818

In [48]:
keras_model.save(r'C:\Users\Mrulay\OneDrive - University of Windsor\uWindsor\COMP 8700 - Intro to AI\Project\models\emotionDetection\RNN-W2V')

INFO:tensorflow:Assets written to: C:\Users\Mrulay\OneDrive - University of Windsor\uWindsor\COMP 8700 - Intro to AI\Project\models\emotionDetection\RNN-W2V\assets


INFO:tensorflow:Assets written to: C:\Users\Mrulay\OneDrive - University of Windsor\uWindsor\COMP 8700 - Intro to AI\Project\models\emotionDetection\RNN-W2V\assets
