In [1]:
import numpy as np
import json

In [3]:
#Load data from json.
#Data has string and target has category.
train_X_original = []
train_Y = []
test_X_original = []
test_Y = []

###########################################################
#                                                         #
# CHANGE path_dir to EmotionLines directory               #
# It should end with '/'                                  #
###########################################################
path_dir = 'drive/My Drive/ColabNotebooks/English/EmotionLines/'







dict_emotion_value = { 'joy':0, 'neutral' : 1,'anger':2, 'disgust':3, 'sadness':4, 'surprise':5, 'fear':6, 'non-neutral':7}
with open(path_dir+'Friends/friends_train.json') as train_file:
    train_data = json.load(train_file)
    for episode in train_data:
        for phrase in episode:
            train_X_original.append(phrase['utterance'])
            train_Y.append(dict_emotion_value[phrase['emotion']])
with open(path_dir + 'Friends/friends_test.json') as test_file:
    test_data = json.load(test_file)
    for episode in test_data:
        for phrase in episode:
            test_X_original.append(phrase['utterance'])
            test_Y.append(dict_emotion_value[phrase['emotion']])

In [4]:
from nltk.corpus import stopwords 
import nltk
#from nltk.tokenize import word_tokenize 
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english')) 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
from nltk.stem import WordNetLemmatizer
import re
def preprocess(words):
    documents = []
    wnl = WordNetLemmatizer()
    for sen in range(0, len(words)):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(words[sen]))
        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)
        # Converting to Lowercase
        document = document.lower()
        # Lemmatization
        document = document.split()
        document = [wnl.lemmatize(word) for word in document]
        result = []
        for word in document: 
            if word not in stop_words:
                result.append(word)
        document = result
        document = ' '.join(document)
        documents.append(document)
    return documents


train_X = preprocess(train_X_original)
test_X = preprocess(test_X_original)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import TransformerMixin
from sklearn.calibration import CalibratedClassifierCV

class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

clfs = []


#Naive bayes with pipeline(It is in practice video)
clfs.append( Pipeline([('vect',TfidfVectorizer()),('clf',MultinomialNB())]) )

#Support vector classifier(support vector machine)
clfs.append( Pipeline([('vect',TfidfVectorizer()),('clf',svm.SVC(kernel='linear',probability=True))]) )
clfs.append( Pipeline([('vect',TfidfVectorizer()),('clf',svm.SVC(kernel='poly',probability=True))]) )
clfs.append( Pipeline([('vect',TfidfVectorizer()),('clf',svm.SVC(kernel='rbf',probability=True))]) )
clfs.append( Pipeline([('vect',TfidfVectorizer()),('clf',svm.SVC(kernel='sigmoid',probability=True))]) )

#Stochastic gradient descent
clfs.append( Pipeline([('vect',TfidfVectorizer()),('clf',CalibratedClassifierCV(base_estimator=SGDClassifier(loss='modified_huber'), cv=5, method='isotonic'))]) )

#Random Forest Classifier
clfs.append( Pipeline([('tfidf',TfidfVectorizer()),('to_dense', DenseTransformer()), ('rfc',RandomForestClassifier(n_estimators=500))]) )

#K-Neighbor classifier)
clfs.append( Pipeline([('tfidf',TfidfVectorizer()),('to_dense', DenseTransformer()),('clf',KNeighborsClassifier(n_neighbors=16,algorithm='brute'))]) )

#Decision Tree Classifier
clfs.append( Pipeline([('tfidf',TfidfVectorizer()),('clf', DecisionTreeClassifier(max_depth=16))]) )

#AdaBoost Classifier
clfs.append( Pipeline([('tfidf',TfidfVectorizer()),('clf', AdaBoostClassifier() )]) )

print(len(clfs))

10


In [7]:
from sklearn.metrics import classification_report
import sklearn.metrics as metrics
from timeit import default_timer as timer



def print_prediction(clf,words,target):
    start = timer()
    prediction = clf.predict(words)
    end = timer()
    print(end - start)
    result = np.mean(prediction==target)
    print(result)
    print('accuracy', metrics.accuracy_score(prediction,target) )
    print('precision', metrics.precision_score(prediction,target,average='micro') )
    print('recall', metrics.recall_score(prediction,target,average='micro') )
    print('f1', metrics.f1_score(prediction,target,average='micro') )
    print(metrics.classification_report(prediction,target,zero_division=0))

for clf in clfs:
  clf.fit(train_X,train_Y)
  print_prediction(clf,test_X,test_Y)
  predict = clf.predict_proba(test_X)
  if predict.shape[1] != 8:
      print('NOT 8!')
      break

0.016010851000032744
0.4829956584659913
accuracy 0.4829956584659913
precision 0.4829956584659913
recall 0.4829956584659913
f1 0.4829956584659913
              precision    recall  f1-score   support

           0       0.03      0.60      0.06        15
           1       0.99      0.48      0.65      2622
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.12      0.72      0.20        46
           6       0.00      0.00      0.00         0
           7       0.04      0.27      0.07        81

    accuracy                           0.48      2764
   macro avg       0.15      0.26      0.12      2764
weighted avg       0.94      0.48      0.62      2764

1.2756925370000545
0.5
accuracy 0.5
precision 0.5
recall 0.5
f1 0.5
              precision    recall  f1-score   support

           0       0.20      0.51      0.29       122
           1       0.92    

In [8]:
predicts = []
for clf in clfs:
  predict = clf.predict_proba(train_X)
  print('One prediction ended')
  if predict.shape[1] != 8:
      print('NOT 8!')
      break
  predicts.append(predict)

One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended


In [9]:
np_predicts = np.swapaxes(np.array(predicts),0,1)
print(np_predicts.shape)


(10561, 10, 8)


In [10]:
test_predicts = []
for clf in clfs:
  test_predict = clf.predict_proba(test_X)
  print('One prediction ended')
  if test_predict.shape[1] != 8:
      print('NOT 8!')
      break
  test_predicts.append(test_predict)
test_np_predicts = np.swapaxes(np.array(test_predicts),0,1)

One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended
One prediction ended


In [11]:
from keras.layers import Reshape, Conv2D,Conv1D, GlobalMaxPooling2D,LSTM,MaxPooling1D,Flatten
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import optimizers
from keras.optimizers import Adagrad,RMSprop
def imdb_cnn_2():
    model = Sequential()
    model.add(Conv1D(2048, len(clfs), activation='relu',input_shape=(len(clfs),8)))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(512,activation='relu'))
    model.add(Dense(256,activation='tanh'))
    model.add(Dense(32,activation='softmax'))
    
    opt = Adagrad(lr = 0.008)
    model.compile(optimizer=opt,loss='sparse_categorical_crossentropy',metrics=['sparse_categorical_accuracy'])
    print(model.summary())

    return model

model = imdb_cnn_2()

Using TensorFlow backend.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 1, 2048)           165888    
_________________________________________________________________
flatten_1 (Flatten)          (None, 2048)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               1049088   
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_3 (Dense)              (None, 32)                8224      
Total params: 1,354,528
Trainable params: 1,354,528
Non-trainable params: 0
____________________________________________

In [12]:
model.fit(np_predicts, train_Y, epochs=100,validation_split=0.2,verbose=2)#, verbose=0)

Train on 8448 samples, validate on 2113 samples
Epoch 1/100
 - 5s - loss: 0.5847 - sparse_categorical_accuracy: 0.8357 - val_loss: 0.3249 - val_sparse_categorical_accuracy: 0.8893
Epoch 2/100
 - 5s - loss: 0.3651 - sparse_categorical_accuracy: 0.8832 - val_loss: 0.3111 - val_sparse_categorical_accuracy: 0.8949
Epoch 3/100
 - 5s - loss: 0.3487 - sparse_categorical_accuracy: 0.8836 - val_loss: 0.3153 - val_sparse_categorical_accuracy: 0.8935
Epoch 4/100
 - 5s - loss: 0.3413 - sparse_categorical_accuracy: 0.8832 - val_loss: 0.3132 - val_sparse_categorical_accuracy: 0.8911
Epoch 5/100
 - 5s - loss: 0.3305 - sparse_categorical_accuracy: 0.8834 - val_loss: 0.3038 - val_sparse_categorical_accuracy: 0.8959
Epoch 6/100
 - 5s - loss: 0.3289 - sparse_categorical_accuracy: 0.8819 - val_loss: 0.2963 - val_sparse_categorical_accuracy: 0.8945
Epoch 7/100
 - 5s - loss: 0.3236 - sparse_categorical_accuracy: 0.8864 - val_loss: 0.2978 - val_sparse_categorical_accuracy: 0.8935
Epoch 8/100
 - 5s - loss: 0.

<keras.callbacks.callbacks.History at 0x7fc53a192748>

In [13]:
from sklearn.metrics import classification_report
# evaluate the model
loss, accuracy = model.evaluate(test_np_predicts, test_Y, verbose=0)
print('Accuracy: %f' % (accuracy*100))
prediction = model.predict(test_np_predicts)
predicted_classes = np.argmax(prediction, axis=1)
target_names = ['joy', 'neutral','anger', 'disgust', 'sadness', 'surprise', 'fear', 'non-neutral']
print(prediction)
print(classification_report(predicted_classes,test_Y,zero_division=0,target_names=target_names))

Accuracy: 47.793055
[[1.5285605e-02 7.2882867e-01 5.8721830e-03 ... 7.1102448e-07
  7.3885087e-07 6.9889910e-07]
 [3.9769458e-03 9.2161888e-01 1.0011938e-03 ... 2.2660730e-07
  1.8715198e-07 1.8382326e-07]
 [3.2354362e-02 7.5489539e-01 8.1709167e-03 ... 5.2129241e-07
  7.4034511e-07 6.8227337e-07]
 ...
 [7.8376824e-01 9.5622800e-03 1.3346227e-01 ... 1.7042907e-05
  3.0447040e-05 3.8440718e-05]
 [4.6413641e-02 4.6601933e-01 2.2804072e-02 ... 3.2028929e-06
  4.3665109e-06 3.9157430e-06]
 [3.3127025e-03 5.5105609e-01 7.4579538e-04 ... 5.1499267e-08
  4.2174783e-08 3.8607606e-08]]
              precision    recall  f1-score   support

         joy       0.25      0.41      0.31       188
     neutral       0.84      0.52      0.64      2066
       anger       0.07      0.31      0.12        39
     disgust       0.07      0.17      0.10        29
     sadness       0.18      0.32      0.23        47
    surprise       0.26      0.45      0.33       167
        fear       0.03      0.11    