# Imports

In [30]:
import gzip
import json
import os
from random import randint

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# 1. Dataset Preparation & Analysis

## 1.2 - Loading the dataset

In [31]:
### goemotions.json.gz must be placed in the parent of the current directory

path = os.getcwd()
dataset_path = os.path.join(os.path.abspath(os.path.join(path, os.pardir)), 'goemotions.json.gz')

# Read through a context manager so the gzip handle is always closed,
# even if reading the archive raises.
with gzip.open(dataset_path, 'rb') as f:
    file_content = f.read()

# json.loads accepts the raw bytes directly; the result is iterated entry by entry later on.
data_list = json.loads(file_content)

## 1.3.1 - Extracting the posts and 2 sets of labels (emotion and sentiment)

In [32]:
# Every dataset entry is read positionally: [0] -> post, [1] -> emotion, [2] -> sentiment
posts = [entry[0] for entry in data_list]
emotions = [entry[1] for entry in data_list]
sentiments = [entry[2] for entry in data_list]

## 1.3.2 - Plotting the distribution of the posts in each category and saving the graphics

In [33]:
# Parallel lists: labels[i] is a distinct category, counts[i] the number of posts carrying it
emotion_labels, emotion_counts = [], []
sentiment_labels, sentiment_counts = [], []

# Get n different colors
def getColors(n):
    """Return a list of ``n`` distinct random colours as '#RRGGBB' hex strings.

    Colours are drawn uniformly from the 24-bit RGB range; a draw that repeats
    an already chosen colour is discarded and redrawn.
    """
    palette = []
    for _ in range(n):
        # Keep drawing until we hit a colour that has not been picked yet
        while True:
            candidate = '#%06X' % randint(0, 0xFFFFFF)
            if candidate not in palette:
                break
        palette.append(candidate)
    return palette

# Tally posts per emotion. Dicts preserve insertion order, so the labels come out
# in order of first appearance in `emotions`.
emotion_tally = {}
for emotion in emotions:
    emotion_tally[emotion] = emotion_tally.get(emotion, 0) + 1
emotion_labels = list(emotion_tally.keys())
emotion_counts = list(emotion_tally.values())

# Same tally for the sentiment labels
sentiment_tally = {}
for sentiment in sentiments:
    sentiment_tally[sentiment] = sentiment_tally.get(sentiment, 0) + 1
sentiment_labels = list(sentiment_tally.keys())
sentiment_counts = list(sentiment_tally.values())

# Relative frequency of each class, rounded to three decimals
emotion_total = sum(emotion_counts)
sentiment_total = sum(sentiment_counts)
emotion_freq = [round(count / emotion_total, 3) for count in emotion_counts]
sentiment_freq = [round(count / sentiment_total, 3) for count in sentiment_counts]

# Both pie charts are written as consecutive pages of one PDF file
with PdfPages('frequency_charts.pdf') as pdf:
    # --- Page 1: emotion frequencies ---
    emotion_colors = getColors(len(emotion_labels))
    emotion_explode = [0.1] * len(emotion_labels)
    plt.pie(
        emotion_counts,
        labels=emotion_labels,
        colors=emotion_colors,
        explode=emotion_explode,
        startangle=90,
        counterclock=False,
        rotatelabels=True,
        shadow=True,
    )
    plt.title('Emotion Frequencies', y=1.25)
    # The slices are too small to annotate, so the percentages go in the legend instead
    emotion_legend = [
        f'{label}: {share * 100:0.1f}%'
        for label, share in zip(emotion_labels, emotion_freq)
    ]
    plt.legend(
        title='Emotions:',
        labels=emotion_legend,
        bbox_to_anchor=(1.2, 0.5),
        loc='center right',
        bbox_transform=plt.gcf().transFigure,
    )
    pdf.savefig(bbox_inches='tight')
    plt.close()

    # --- Page 2: sentiment frequencies (percentages drawn on the slices) ---
    sentiment_colors = getColors(len(sentiment_labels))
    sentiment_explode = [0.1] * len(sentiment_labels)
    plt.pie(
        sentiment_counts,
        labels=sentiment_labels,
        colors=sentiment_colors,
        explode=sentiment_explode,
        startangle=90,
        counterclock=False,
        autopct='%1.1f%%',
        shadow=True,
    )
    plt.title('Sentiment Frequencies')
    plt.legend(
        title='Sentiments:',
        labels=sentiment_labels,
        bbox_to_anchor=(1, 0.5),
        loc='center right',
        bbox_transform=plt.gcf().transFigure,
    )
    pdf.savefig(bbox_inches='tight')
    plt.close()

# 2. Words as Features

## 2.1 - Processing the dataset: Extracting tokens/words and their frequencies

In [36]:
# Token-count features for every post
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(posts)

# Encode each label as an integer id, numbered in order of first appearance
# (dict.fromkeys drops repeats while keeping that order)
emotion_to_index = {label: idx for idx, label in enumerate(dict.fromkeys(emotions))}
sentiment_to_index = {label: idx for idx, label in enumerate(dict.fromkeys(sentiments))}

# Integer-encoded targets, aligned row by row with X
y_emotions = [emotion_to_index[emotion] for emotion in emotions]
y_sentiments = [sentiment_to_index[sentiment] for sentiment in sentiments]

print('Size of vocabulary: ', len(vectorizer.vocabulary_), 'tokens')

Size of vocabulary:  30449 tokens


## 2.2 - Splitting dataset

In [37]:
# Fixed seed so the 80/20 split, and every score derived from it, is reproducible
# after a kernel restart.
SPLIT_SEED = 42

# Both calls use the same seed on inputs of the same length, so the emotion and
# sentiment tasks are trained and evaluated on the same rows.
X_emotions_train, X_emotions_test, y_emotions_train, y_emotions_test = train_test_split(
    X, y_emotions, test_size=0.2, random_state=SPLIT_SEED)
X_sentiments_train, X_sentiments_test, y_sentiments_train, y_sentiments_test = train_test_split(
    X, y_sentiments, test_size=0.2, random_state=SPLIT_SEED)

## 2.3 - Training and testing classifiers for both classifications, using word frequency as features

### 2.3.1 - Base-MNB

In [None]:
# Unfitted template with default hyper-parameters; each task fits its own copy.
base_MNB = MultinomialNB()

# fit() returns the estimator it was called on. Fitting `base_MNB` itself for both
# tasks would make the two *_model names refer to one object trained on the
# sentiment labels, so each task gets an independent clone instead.
base_MNB_emotions_model = clone(base_MNB).fit(X_emotions_train, y_emotions_train)
y_base_MNB_emotions_pred = base_MNB_emotions_model.predict(X_emotions_test)

base_MNB_sentiments_model = clone(base_MNB).fit(X_sentiments_train, y_sentiments_train)
y_base_MNB_sentiments_pred = base_MNB_sentiments_model.predict(X_sentiments_test)

# print(classification_report(y_emotions_test, y_base_MNB_emotions_pred, target_names=emotion_labels))
# print(classification_report(y_sentiments_test, y_base_MNB_sentiments_pred, target_names=sentiment_labels))

### 2.3.2 - Base-DT

In [None]:
# Unfitted template with default hyper-parameters; each task fits its own copy.
base_DT = DecisionTreeClassifier()

# fit() returns the estimator it was called on, so reusing `base_DT` for both tasks
# would leave both *_model names bound to the tree trained on the sentiment labels.
# Cloning gives each task an independent tree.
base_DT_emotions_model = clone(base_DT).fit(X_emotions_train, y_emotions_train)
y_base_DT_emotions_pred = base_DT_emotions_model.predict(X_emotions_test)

base_DT_sentiments_model = clone(base_DT).fit(X_sentiments_train, y_sentiments_train)
y_base_DT_sentiments_pred = base_DT_sentiments_model.predict(X_sentiments_test)

### 2.3.3 - Base-MLP

In [None]:
# Unfitted template; clone() copies its parameters (including max_iter) to each task's network.
base_MLP = MLPClassifier(max_iter=5) ### talk about low epochs in analysis

# fit() returns the estimator it was called on, so reusing `base_MLP` for both tasks
# would leave both *_model names bound to the network trained on the sentiment labels.
# Cloning gives each task an independent network.
base_MLP_emotions_model = clone(base_MLP).fit(X_emotions_train, y_emotions_train)
y_base_MLP_emotions_pred = base_MLP_emotions_model.predict(X_emotions_test)

base_MLP_sentiments_model = clone(base_MLP).fit(X_sentiments_train, y_sentiments_train)
y_base_MLP_sentiments_pred = base_MLP_sentiments_model.predict(X_sentiments_test)

### 2.3.4 - Top-MNB

In [10]:
top_MNB_hyper_params = {
    # Because an alpha too small will result in numeric errors, 0 is set as 1.0e-10
    'alpha': [1.0e-10, 0.5, 1.5, 3.0]
}

# No explicit class_prior: with the default fit_prior=True the priors are estimated
# from whatever labels the model is fitted on. Passing frequencies computed on the
# whole dataset would leak test-set label information, and values rounded to three
# decimals are used as-is by MultinomialNB (they are not renormalised).
top_MNB_emotions = MultinomialNB()
top_MNB_sentiments = MultinomialNB()

# One grid search per task so the two fitted searches stay independent objects
top_MNB_emotions_grid_search = GridSearchCV(estimator=top_MNB_emotions, param_grid=top_MNB_hyper_params)
top_MNB_sentiments_grid_search = GridSearchCV(estimator=top_MNB_sentiments, param_grid=top_MNB_hyper_params)

top_MNB_emotions_model = top_MNB_emotions_grid_search.fit(X_emotions_train, y_emotions_train)
y_top_MNB_emotions_pred = top_MNB_emotions_model.predict(X_emotions_test)
print('best params for emotions MNB: ', top_MNB_emotions_model.best_params_)

top_MNB_sentiments_model = top_MNB_sentiments_grid_search.fit(X_sentiments_train, y_sentiments_train)
y_top_MNB_sentiments_pred = top_MNB_sentiments_model.predict(X_sentiments_test)
print('best params for sentiments MNB: ', top_MNB_sentiments_model.best_params_)

best params for emotions MNB:  {'alpha': 3.0}
best params for sentiments MNB:  {'alpha': 3.0}


### 2.3.5 - Top-DT

In [11]:
# Search space explored for the decision tree
top_DT_hyper_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10],
    'min_samples_split': [2, 3, 4]
}

# Unfitted search template; each task fits its own clone.
top_DT = DecisionTreeClassifier()
top_DT_grid_search = GridSearchCV(estimator=top_DT, param_grid=top_DT_hyper_params)

# fit() returns the search object it was called on, so fitting `top_DT_grid_search`
# for both tasks would leave both *_model names (and their best_params_) bound to
# the sentiment search. Cloning keeps the two fitted searches separate.
top_DT_emotions_model = clone(top_DT_grid_search).fit(X_emotions_train, y_emotions_train)
y_top_DT_emotions_pred = top_DT_emotions_model.predict(X_emotions_test)

top_DT_sentiments_model = clone(top_DT_grid_search).fit(X_sentiments_train, y_sentiments_train)
y_top_DT_sentiments_pred = top_DT_sentiments_model.predict(X_sentiments_test)

best params for emotions DT:  {'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 2}
best params for sentiments DT:  {'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 2}


### 2.3.6 - Top-MLP

In [None]:
# Search space explored for the multi-layer perceptron
top_MLP_hyper_params = {
    'activation': ['logistic', 'tanh', 'relu', 'identity'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd']
}

# Unfitted search template; each task fits its own clone.
top_MLP = MLPClassifier(max_iter=5) ### talk about low epochs in analysis
top_MLP_grid_search = GridSearchCV(estimator=top_MLP, param_grid=top_MLP_hyper_params)

# fit() returns the search object it was called on, so fitting `top_MLP_grid_search`
# for both tasks would leave both *_model names (and their best_params_) bound to
# the sentiment search. Cloning keeps the two fitted searches separate.
top_MLP_emotions_model = clone(top_MLP_grid_search).fit(X_emotions_train, y_emotions_train)
y_top_MLP_emotions_pred = top_MLP_emotions_model.predict(X_emotions_test)

top_MLP_sentiments_model = clone(top_MLP_grid_search).fit(X_sentiments_train, y_sentiments_train)
y_top_MLP_sentiments_pred = top_MLP_sentiments_model.predict(X_sentiments_test)