In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding


Using TensorFlow backend.
paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [2]:
# Load the labelled dataset from the Excel workbook; later cells read its
# "Sentence" and "Emotion" columns.
text=pd.read_excel('New_dataset.xlsx')
# Preprocessing the data

In [3]:
from DataClean import DataCleaner

# Run the project's DataCleaner over the raw sentences; the emotion labels are
# taken from the dataset unchanged.
cleanData = DataCleaner()
sentences = cleanData.cleanData(text["Sentence"])
emotions = text["Emotion"]

In [4]:
# Preview the first cleaned sentences.
sentences.head()
# Next step: split the data 80:20 into a training part and a held-out part,
# then split the held-out part 50:50 into validation and test sets.

0    i m already feel somewhat strange give that i ...
1    i pm hehehe anyasimbi pm take a nap sweetie pm...
2    a boy phone me at night and want to talk to me...
3    i a feeling of curious satisfaction to be on t...
4                 a breakup with someone i really like
Name: Sentence, dtype: object

In [5]:
from sklearn.model_selection import train_test_split

# One fixed seed is shared by both splits so they are reproducible.
SEED = 2000

# First split: 80% for training, 20% held out.
(x_train, x_validation_and_test,
 y_train, y_validation_and_test) = train_test_split(
    sentences, emotions, test_size=0.2, random_state=SEED)

# Second split: the held-out 20% is halved into validation and test sets.
(x_validation, x_test,
 y_validation, y_test) = train_test_split(
    x_validation_and_test, y_validation_and_test,
    test_size=0.5, random_state=SEED)

In [6]:
def labelize_sentences_ug(sentences, label):
    """Tokenise every sentence and tag it for Word2Vec-style training.

    Parameters
    ----------
    sentences : pandas.Series of str
        Sentences to process; the Series index supplies the per-sentence id.
    label : str
        Prefix used to build each tag.

    Returns
    -------
    list of TaggedDocument
        One document per sentence, in the original order. Its words are the
        whitespace-split tokens and its single tag is '<label>_<index>'.
    """
    return [
        TaggedDocument(text.split(), [label + '_%s' % idx])
        for idx, text in zip(sentences.index, sentences)
    ]

In [7]:
# The word vectors are trained on every split, so stack train, validation and
# test back into one Series and tag each tokenised sentence as 'all_<index>'.
all_x = pd.concat([x_train, x_validation, x_test])
all_x_w2v = labelize_sentences_ug(sentences=all_x, label='all')


In [8]:
# Word2Vec with CBOW (sg=0): the target word is predicted from its
# surrounding context words.
#   size      - dimensionality of the word vectors; here the value returned by
#               wordLength.getMaxWordLength(all_x)
#   negative  - number of noise words drawn for negative sampling
#   window    - maximum distance between the current and the predicted word
#   min_count - words with a frequency lower than 2 are ignored
#   alpha     - learning rate; min_alpha is set to the same value

from DataClean import WordLength

wordLength = WordLength()
cores = multiprocessing.cpu_count()
model_ug_cbow = Word2Vec(
    sg=0,
    size=wordLength.getMaxWordLength(all_x),
    negative=5,
    window=2,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
# Build the vocabulary from the token lists of all tagged sentences.
model_ug_cbow.build_vocab([doc.words for doc in tqdm(all_x_w2v)])


  "C extension not loaded, training will be slow. "
100%|██████████| 30000/30000 [00:00<00:00, 1794380.24it/s]


In [9]:
# Train the CBOW vectors for 15 passes. Every pass reshuffles the sentences,
# runs a single epoch, and then lowers the learning rate by 0.002 while
# keeping min_alpha equal to alpha.
sentencesCount = len(all_x_w2v)
for epoch in range(15):
    shuffled_corpus = utils.shuffle([doc.words for doc in tqdm(all_x_w2v)])
    model_ug_cbow.train(shuffled_corpus, total_examples=sentencesCount, epochs=1)
    model_ug_cbow.alpha = model_ug_cbow.alpha - 0.002
    model_ug_cbow.min_alpha = model_ug_cbow.alpha


100%|██████████| 30000/30000 [00:00<00:00, 2066159.61it/s]


KeyboardInterrupt: 

In [0]:
# Build a second Word2Vec model with the skip-gram architecture (sg=1); every
# other hyper-parameter matches the CBOW model. Then build its vocabulary from
# the same tagged sentences.
model_ug_sg = Word2Vec(
    sg=1,
    size=wordLength.getMaxWordLength(all_x),
    negative=5,
    window=2,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
model_ug_sg.build_vocab([doc.words for doc in tqdm(all_x_w2v)])


In [0]:
# Train the skip-gram vectors for 15 passes: reshuffle, run one epoch, then
# lower the learning rate by 0.002 and pin min_alpha to the new alpha.
for epoch in range(15):
    epoch_sentences = [doc.words for doc in tqdm(all_x_w2v)]
    model_ug_sg.train(utils.shuffle(epoch_sentences), total_examples=len(all_x_w2v), epochs=1)
    model_ug_sg.alpha = model_ug_sg.alpha - 0.002
    model_ug_sg.min_alpha = model_ug_sg.alpha

# Saving the trained models

In [0]:
# Persist both trained Word2Vec models (relative paths, i.e. next to the
# notebook's working directory) so they can be reloaded without retraining.
model_ug_cbow.save('w2v_model_ug_cbow.word2vec')
model_ug_sg.save('w2v_model_ug_sg.word2vec')


In [0]:
from gensim.models import KeyedVectors

# Reload the models saved above. The files were written with Word2Vec.save(),
# so they are read back with the matching Word2Vec.load() (Word2Vec is
# imported in the first cell). The following cells access the vectors through
# the `.wv` attribute of these full model objects.
model_ug_cbow = Word2Vec.load('w2v_model_ug_cbow.word2vec')
model_ug_sg = Word2Vec.load('w2v_model_ug_sg.word2vec')


In [0]:
# Build a word -> vector dictionary from which we can extract word vectors.
# Each entry is the CBOW vector followed by the skip-gram vector of that word.
embeddings_index = {
    word: np.append(model_ug_cbow.wv[word], model_ug_sg.wv[word])
    for word in model_ug_cbow.wv.vocab.keys()
}
print('Found %s word vectors.' % len(embeddings_index))


In [0]:
# Number of whitespace-separated tokens in every cleaned sentence.
numWords = [len(sentence.split()) for sentence in sentences]
# Padded sequence length: the longest sentence plus 5 extra positions.
vectorLenth = max(numWords) + 5
# Total number of tokens over all sentences (shown as the cell output).
# NOTE(review): this is a token total, not a count of distinct words, yet later
# cells use it as the vocabulary bound - confirm that is intended.
wordCount = sum(numWords)
wordCount

In [0]:
# Text to sequence gives a sequential (integer-index) representation of each
# sentence. The tokenizer is fitted on the training sentences only and is then
# reused for the validation and test sentences.
tokenizer = Tokenizer(num_words=wordCount)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)


In [0]:
# Pad every training sequence to the same length, vectorLenth (the longest
# sentence length in words plus 5).
x_train_seq = pad_sequences(sequences, maxlen=vectorLenth)
print('Shape of data tensor:', x_train_seq.shape)


In [0]:
# Encode and pad the validation sentences with the tokenizer fitted on the
# training data, using the same sequence length as the training tensor.
sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=vectorLenth)


In [0]:
# Build a matrix of word vectors indexed by the tokenizer's word index, so a
# model fed integer sequences can look up the corresponding vector.
# Rows for words without a Word2Vec vector stay all-zero.

# Width of one combined vector; read from a single entry instead of
# materialising the length of every vector in the dictionary.
matrixSize = len(next(iter(embeddings_index.values())))
embedding_matrix = np.zeros((wordCount, matrixSize))
for word, i in tokenizer.word_index.items():
    # Indices at or beyond wordCount do not fit into the matrix.
    if i >= wordCount:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [0]:
#np.array_equal(embedding_matrix[1326] ,embeddings_index.get('vain'))


In [0]:
#seed = 7

In [0]:
# Here we create a neural network whose embedding layer is trained together
# with the rest of the model, so it can learn the word embeddings itself.
# The intent is to give the embedding layer an initial set of weights
# so that it can learn the task-specific vectors more efficiently.
# However, we are not using this pre-trained model for the CNN.

In [0]:
from DataClean import OneHotEncoding
onehotEncoding = OneHotEncoding()

# One-hot targets for the 5-way softmax output.
# NOTE(review): assumes GetOneHotEncodedMatrix returns one row per sample with
# 5 one-hot columns - confirm against DataClean.
y_train_onehot = onehotEncoding.GetOneHotEncodedMatrix(y_train)
y_validation_onehot = onehotEncoding.GetOneHotEncodedMatrix(y_validation)

model_ptw2v = Sequential()
# Initialise the embedding layer with the Word2Vec matrix built earlier and
# keep it trainable so the vectors can be refined during training. Previously
# embedding_matrix was computed but never handed to the layer.
e = Embedding(wordCount, matrixSize, weights=[embedding_matrix], input_length=vectorLenth, trainable=True)
model_ptw2v.add(e)
model_ptw2v.add(Flatten())
model_ptw2v.add(Dense(512, activation='relu'))
model_ptw2v.add(Dense(5, activation='softmax'))
# A softmax over mutually exclusive classes needs categorical cross-entropy;
# with binary_crossentropy Keras also reports element-wise binary accuracy,
# which overstates the real classification accuracy.
model_ptw2v.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_ptw2v.fit(x_train_seq, y_train_onehot, validation_data=(x_val_seq, y_validation_onehot), epochs=10, batch_size=32, verbose=2)


In [0]:
# Evaluate the dense model on the data it was trained on, so the printed
# figure is the training-set accuracy (in percent), not a held-out score.
loss, accuracy = model_ptw2v.evaluate(x_train_seq, onehotEncoding.GetOneHotEncodedMatrix(y_train), verbose=0)
print('Accuracy: %f' % (accuracy*100))

In [0]:
from keras.layers import Conv1D, GlobalMaxPooling1D
emotionCNNModel = Sequential()
# The embedding layer is learned from scratch here (no pre-trained weights).
e = Embedding(wordCount, matrixSize,input_length=vectorLenth)
emotionCNNModel.add(e)
# 100 convolution filters, each spanning 2 consecutive words, with stride 1.
emotionCNNModel.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
# Global max pooling keeps the maximum value of each filter, producing a
# one-dimensional vector per sentence.
emotionCNNModel.add(GlobalMaxPooling1D())
emotionCNNModel.add(Dense(256, activation='relu'))
emotionCNNModel.add(Dense(5, activation='softmax'))
# A softmax over mutually exclusive classes needs categorical cross-entropy;
# with binary_crossentropy Keras also reports element-wise binary accuracy,
# which overstates the real classification accuracy.
# NOTE(review): assumes GetOneHotEncodedMatrix returns 5 one-hot columns per
# sample - confirm against DataClean.
emotionCNNModel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
emotionCNNModel.fit(x_train_seq, onehotEncoding.GetOneHotEncodedMatrix(y_train), validation_data=(x_val_seq, onehotEncoding.GetOneHotEncodedMatrix(y_validation)), epochs=10, batch_size=32, verbose=2)

In [0]:
# Evaluate the CNN on the data it was trained on, so the printed figure is the
# training-set accuracy (in percent), not a held-out score.
loss, accuracy = emotionCNNModel.evaluate(x_train_seq, onehotEncoding.GetOneHotEncodedMatrix(y_train), verbose=0)
print('Accuracy: %f' % (accuracy*100))

In [0]:
# Prepare the held-out test set: encode the test sentences with the tokenizer
# fitted on the training data and pad them to the training sequence length.
x_test_tok_val = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(x_test_tok_val, maxlen=vectorLenth)


In [0]:
# Run the CNN on the padded test sequences; one row of class scores per
# sentence.
predictedModel = emotionCNNModel.predict(x_test_seq)
# The predicted class is the column with the highest score in each row.
labelledPredictedModel = predictedModel.argmax(axis=-1)


In [0]:
# Convert the predicted class indices back to the actual emotion labels.
# The encoder is fitted on all emotion labels and then used in reverse.
# NOTE(review): this presumes the model's output column order matches
# LabelEncoder's class order, which depends on how OneHotEncoding in DataClean
# orders its columns - confirm.
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
labelEncoded = labelEncoder.fit_transform(emotions)
classPredictedModel = labelEncoder.inverse_transform(labelledPredictedModel)

In [0]:
# Analyse performance on the test set with the project's metric helpers.
# NOTE(review): predictions are passed first and the true labels second;
# confirm this matches the parameter order PerformanceMetrices expects.
from DataClean import PerformanceMetrices
modelMetrices = PerformanceMetrices()
print(modelMetrices.Accuracy(classPredictedModel,y_test))
print(modelMetrices.Confusion_matrix(classPredictedModel,y_test))


In [0]:
# Macro-averaged precision, recall and F-score on the test set (true labels
# first, predicted labels second).
from sklearn.metrics import precision_recall_fscore_support
precision , recall , fbetascore, support = precision_recall_fscore_support(y_test, classPredictedModel, average='macro')


In [0]:
# Report the macro-averaged metrics computed in the previous cell.
print("Precision : " + str(precision) +"\nRecall : " + str(recall) + '\nFScore : ' + str(fbetascore) )


In [0]:
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

# Binarise the true labels against the same sorted class list that was used to
# decode the predictions, so the indicator columns line up with the model's
# output columns.
lb = preprocessing.LabelBinarizer()
lb.fit(labelEncoder.classes_)
# y_test is replaced by its one-hot indicator matrix because the ROC cells
# below index it by column. The guard keeps the cell safe to re-run: once
# y_test is already 2-D it is not binarised a second time.
# NOTE: cells that need the original test labels must run before this one.
if getattr(y_test, 'ndim', 1) == 1:
    y_test = lb.transform(y_test)
# ROC analysis needs continuous scores. Use the predicted class probabilities
# instead of hard 0/1 predictions, which reduce every curve to a single point.
# NOTE(review): assumes the columns of predictedModel follow the sorted class
# order, as the label decoding above already does - confirm.
y_pred = predictedModel
print("ROC score : " + str(roc_auc_score(y_test, y_pred)))


In [0]:
#Reference : https://stackoverflow.com/questions/45332410/sklearn-roc-for-multiclass-classification
from sklearn.metrics import roc_curve, auc

# One-vs-rest ROC curve and the area under it for each of the 5 classes; the
# dictionaries are keyed by the class column index.
fpr, tpr, roc_auc = {}, {}, {}

for class_idx in range(5):
    curve = roc_curve(y_test[:, class_idx], y_pred[:, class_idx])
    fpr[class_idx] = curve[0]
    tpr[class_idx] = curve[1]
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])


In [0]:
import matplotlib.pyplot as plt

# Draw one figure per class: its ROC curve (with the area in the legend)
# against the diagonal chance line.
for class_idx in range(5):
    fig, ax = plt.subplots()
    ax.plot(fpr[class_idx], tpr[class_idx], label='ROC curve (area = %0.2f)' % roc_auc[class_idx])
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()
