Importing Packages

In [1]:
import pandas as pd
import numpy as np

In [2]:
# text preprocessing
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

# plots and metrics
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# feature extraction / vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# save and load a file
import pickle

Importing Dataset

In [3]:
# importing training data
df_train = pd.read_csv('./dailydialog_emotion_stimulus_isear/data_train.csv')

# importing testing data
df_test = pd.read_csv('./dailydialog_emotion_stimulus_isear/data_test.csv')

Exploratory Data Analysis (EDA)

In [4]:
# concatinating training and testing dataset
total_data = pd.concat([df_train, df_test])

# training dataset
print('Size of training dataset:', len(df_train))
print(df_train.Emotion.value_counts())
print()

# testing dataset
print('Size of testing dataset:', len(df_test))
print(df_test.Emotion.value_counts())
print()

# total dataset
print('Total size of dataset:', len(total_data))
print(total_data.Emotion.value_counts())

Size of training dataset: 7934
sadness    1641
joy        1619
neutral    1616
anger      1566
fear       1492
Name: Emotion, dtype: int64

Size of testing dataset: 3393
joy        707
anger      693
fear       679
sadness    676
neutral    638
Name: Emotion, dtype: int64

Total size of dataset: 11327
joy        2326
sadness    2317
anger      2259
neutral    2254
fear       2171
Name: Emotion, dtype: int64


In [5]:
# total Dataset
total_data.head(10)

Unnamed: 0,Emotion,Text
0,neutral,There are tons of other paintings that I thin...
1,sadness,"Yet the dog had grown old and less capable , a..."
2,fear,When I get into the tube or the train without ...
3,fear,This last may be a source of considerable disq...
4,anger,She disliked the intimacy he showed towards so...
5,sadness,When my family heard that my Mother's cousin w...
6,joy,Finding out I am chosen to collect norms for C...
7,anger,A spokesperson said : ` Glen is furious that t...
8,neutral,Yes .
9,sadness,"When I see people with burns I feel sad, actua..."


In [6]:
def preprocess_and_tokenize(data):    

    #remove html markup
    data = re.sub("(<.*?>)", "", data)

    #remove urls
    data = re.sub(r'http\S+', '', data)
    
    #remove hashtags and @names
    data= re.sub(r"(#[\d\w\.]+)", '', data)
    data= re.sub(r"(@[\d\w\.]+)", '', data)

    #remove punctuation and non-ascii digits
    data = re.sub("(\\W|\\d)", " ", data)
    
    #remove whitespace
    data = data.strip()
    
    # tokenization with nltk
    data = word_tokenize(data)
    
    # stemming with nltk
    porter = PorterStemmer()
    stem_data = [porter.stem(word) for word in data]
        
    return stem_data

In [7]:
x_train = df_train.Text
x_test = df_test.Text

y_train = df_train.Emotion
y_test = df_test.Emotion

In [8]:
# TFIDF, unigrams and bigrams
vect = TfidfVectorizer(tokenizer = preprocess_and_tokenize, sublinear_tf = True, norm = 'l2', ngram_range = (1, 3))

# fit on our complete corpus
vect.fit_transform(total_data.Text)

# transform testing and training datasets to vectors
X_train_vect = vect.transform(x_train)
X_test_vect = vect.transform(x_test)

In [9]:
vect.vocabulary_

{'there': 161383,
 'are': 17016,
 'ton': 169798,
 'of': 111540,
 'other': 118463,
 'paint': 120612,
 'that': 152048,
 'i': 75663,
 'think': 163547,
 'better': 25693,
 'there are': 161405,
 'are ton': 17482,
 'ton of': 169799,
 'of other': 113324,
 'other paint': 118643,
 'paint that': 120629,
 'that i': 152508,
 'i think': 78639,
 'think are': 163568,
 'are better': 17088,
 'there are ton': 161423,
 'are ton of': 17483,
 'ton of other': 169800,
 'of other paint': 113330,
 'other paint that': 118644,
 'paint that i': 120630,
 'that i think': 152588,
 'i think are': 78642,
 'think are better': 163569,
 'yet': 190983,
 'the': 153603,
 'dog': 44144,
 'had': 64845,
 'grown': 64206,
 'old': 115435,
 'and': 11275,
 'less': 91361,
 'capabl': 31621,
 'one': 116795,
 'day': 39803,
 'gilli': 60881,
 'come': 35421,
 'explain': 50052,
 'with': 187133,
 'great': 63632,
 'sorrow': 143990,
 'suffer': 147875,
 'a': 0,
 'stroke': 146839,
 'must': 103541,
 'be': 22186,
 'put': 126751,
 'down': 44736,
 'y

In [10]:
print(X_train_vect)

  (0, 169800)	0.22466800904005915
  (0, 169799)	0.215220164290722
  (0, 169798)	0.215220164290722
  (0, 163569)	0.22466800904005915
  (0, 163568)	0.22466800904005915
  (0, 163547)	0.11089260991482865
  (0, 161423)	0.22466800904005915
  (0, 161405)	0.1640220802361927
  (0, 161383)	0.1000182946039042
  (0, 152588)	0.215220164290722
  (0, 152508)	0.09550398557644205
  (0, 152048)	0.06335026209123826
  (0, 120630)	0.22466800904005915
  (0, 120629)	0.215220164290722
  (0, 120612)	0.18291776973486704
  (0, 118644)	0.22466800904005915
  (0, 118643)	0.22466800904005915
  (0, 118463)	0.11360402725107643
  (0, 113330)	0.22466800904005915
  (0, 113324)	0.1793258658647461
  (0, 111540)	0.05866812655415145
  (0, 78642)	0.22466800904005915
  (0, 78639)	0.13237610238972086
  (0, 75663)	0.04115458758046488
  (0, 25693)	0.14503069757543816
  :	:
  (7933, 151767)	0.11643272376990398
  (7933, 127899)	0.20951256945582394
  (7933, 127898)	0.1768298420334078
  (7933, 127897)	0.11173432132619529
  (7933, 101

In [11]:
print(vect.idf_)

[2.11489886 8.94873845 9.64188564 ... 9.64188564 9.64188564 9.64188564]


In [12]:
print(vect.fit_transform(total_data.Text))

  (0, 163569)	0.22466800904005915
  (0, 78642)	0.22466800904005915
  (0, 152588)	0.215220164290722
  (0, 120630)	0.22466800904005915
  (0, 118644)	0.22466800904005915
  (0, 113330)	0.22466800904005915
  (0, 169800)	0.22466800904005915
  (0, 17483)	0.22466800904005915
  (0, 161423)	0.22466800904005915
  (0, 17088)	0.215220164290722
  (0, 163568)	0.22466800904005915
  (0, 78639)	0.13237610238972086
  (0, 152508)	0.09550398557644205
  (0, 120629)	0.215220164290722
  (0, 118643)	0.22466800904005915
  (0, 113324)	0.1793258658647461
  (0, 169799)	0.215220164290722
  (0, 17482)	0.22466800904005915
  (0, 161405)	0.1640220802361927
  (0, 25693)	0.14503069757543816
  (0, 163547)	0.11089260991482865
  (0, 75663)	0.04115458758046488
  (0, 152048)	0.06335026209123826
  (0, 120612)	0.18291776973486704
  (0, 118463)	0.11360402725107643
  :	:
  (11326, 147323)	0.22659420984478004
  (11326, 184558)	0.22659420984478004
  (11326, 132532)	0.22659420984478004
  (11326, 98092)	0.22659420984478004
  (11326, 

In [13]:
rf = RandomForestClassifier(n_estimators=50)
rf.fit(X_train_vect, y_train)

yrf_pred = rf.predict(X_test_vect)
class_names = ['joy', 'sadness', 'anger', 'neutral', 'fear']

print("Accuracy: {:.2f}%".format(accuracy_score(y_test, yrf_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, yrf_pred, average='micro') * 100))
cf =  confusion_matrix(y_test, yrf_pred)
print("\nConfusion Matrix:\n", cf)

Accuracy: 63.01%

F1 Score: 63.01

Confusion Matrix:
 [[396  63  55 105  74]
 [ 65 424  36  97  57]
 [ 75  63 406 122  41]
 [ 40  11  40 526  21]
 [ 83  54  70  83 386]]


In [15]:
#Create pipeline with our tf-idf vectorizer and RandomForest model
rf_model = Pipeline([
    ('tfidf', vect),
    ('clf', rf),
])

In [16]:
# save the model
filename = r'./Models/tfidf_rf.sav'
pickle.dump(rf_model, open(filename, 'wb'))

In [17]:
model = pickle.load(open(filename, 'rb'))

message = "If you don't shut up, I'll kill you"

print(model.predict([message]))

['anger']


In [18]:
# import seaborn as sns

# plt.figure(figsize=(20, 8)) 

# cf_plot = sns.heatmap(cf, annot=True, cmap = 'Blues')
# cf_plot.set_title('\n confusion matrix \n')
# cf_plot.set_xlabel('\n Predicted label \n')
# cf_plot.set_xticklabels(['joy', 'sadness', 'anger', 'neutral', 'fear'])
# cf_plot.set_ylabel('\nTrue label\n')
# cf_plot.set_yticklabels(['joy', 'sadness', 'anger', 'neutral', 'fear'])
# plt.show()