**Imports**

In [38]:
import os
import gzip
import json
import numpy as np
import matplotlib.pyplot as plt

from random import randint
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from matplotlib.backends.backend_pdf import PdfPages

# 1. Dataset Preparation & Analysis

## 1.2 - Loading the dataset

In [19]:
# goemotions.json.gz must be placed in the parent of the current directory.
path = os.getcwd()
dataset_path = os.path.join(path, os.pardir, 'goemotions.json.gz')

# Open inside a context manager so the gzip handle is always closed, even if
# decoding fails. The stream is kept binary ('rb'), as before, so the json
# module performs its own encoding detection on the raw bytes.
with gzip.open(dataset_path, 'rb') as f:
    data_list = json.load(f)

## 1.3.1 - Extracting the posts and 2 sets of labels (emotion and sentiment)

In [20]:
# Each dataset entry is indexed positionally: 0 -> post, 1 -> emotion label,
# 2 -> sentiment label. The three lists stay aligned by position.
posts = [record[0] for record in data_list]
emotions = [record[1] for record in data_list]
sentiments = [record[2] for record in data_list]

## 1.3.2 - Plotting the distribution of the posts in each category and saving the graphics

In [21]:
# Parallel lists: *_labels[i] is a distinct label and *_freq[i] its count.
emotion_freq, emotion_labels = [], []
sentiment_freq, sentiment_labels = [], []

def getColors(n):
    """Return a list of ``n`` distinct random colors as '#RRGGBB' hex strings.

    Each color is drawn with ``randint`` over the full 24-bit RGB range; a draw
    that duplicates an already chosen color is discarded and redrawn, so the
    returned list never contains the same color twice.
    """
    palette = []
    for _ in range(n):
        # Keep drawing until we hit a color that is not in the palette yet.
        while True:
            candidate = '#%06X' % randint(0, 0xFFFFFF)
            if candidate not in palette:
                break
        palette.append(candidate)
    return palette

def _count_labels(values):
    """Tally ``values`` and return ``(labels, freqs)`` as two parallel lists.

    ``labels`` holds each distinct value in the order it was first seen and
    ``freqs[i]`` is the number of occurrences of ``labels[i]``. A dict is used
    for the tally so every element costs one hash lookup instead of a scan of
    the label list; dicts keep insertion order, which gives the first-seen
    ordering. Values must be hashable.
    """
    counts = {}
    for value in values:
        counts[value] = counts.get(value, 0) + 1
    return list(counts.keys()), list(counts.values())


# Breakdown emotions list into frequencies and labels
emotion_labels, emotion_freq = _count_labels(emotions)

# Breakdown sentiments list into frequencies and labels
sentiment_labels, sentiment_freq = _count_labels(sentiments)

# Both pie charts are written as separate pages of a single PDF file.
with PdfPages('frequency_charts.pdf') as pdf:
    # Page 1: emotions frequencies
    emotion_count = len(emotion_labels)
    plt.pie(
        emotion_freq,
        labels=emotion_labels,
        startangle=90,
        colors=getColors(emotion_count),
        rotatelabels=True,
        counterclock=False,
        explode=[0.1] * emotion_count,
        shadow=True,
    )
    plt.title('Emotion Frequencies', y=1.25)
    # There are many labels with small frequencies, so the percentages are
    # computed by hand and shown in the legend instead of on the chart.
    emotion_total = sum(emotion_freq)
    emotion_distribution = [(count / emotion_total) * 100 for count in emotion_freq]
    emotion_legend = [f'{l}: {s:0.1f}%' for l, s in zip(emotion_labels, emotion_distribution)]
    plt.legend(
        title='Emotions:',
        labels=emotion_legend,
        bbox_to_anchor=(1.2, 0.5),
        loc='center right',
        bbox_transform=plt.gcf().transFigure,
    )
    pdf.savefig(bbox_inches='tight')
    plt.close()

    # Page 2: sentiments frequencies (few slices, so percentages go on the chart)
    sentiment_count = len(sentiment_labels)
    plt.pie(
        sentiment_freq,
        labels=sentiment_labels,
        startangle=90,
        colors=getColors(sentiment_count),
        counterclock=False,
        autopct='%1.1f%%',
        explode=[0.1] * sentiment_count,
        shadow=True,
    )
    plt.title('Sentiment Frequencies')
    plt.legend(
        title='Sentiments:',
        labels=sentiment_labels,
        bbox_to_anchor=(1, 0.5),
        loc='center right',
        bbox_transform=plt.gcf().transFigure,
    )
    pdf.savefig(bbox_inches='tight')
    plt.close()

# 2. Words as Features

## 2.1 - Processing the dataset: Extracting tokens/words and their frequencies

In [28]:
# Bag-of-words features: one row per post, one column per vocabulary token.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(posts)

# Transforming emotions to integer values: every distinct label gets the next
# integer code in order of first appearance (dict.fromkeys de-duplicates while
# keeping that order).
emotion_to_index = {label: code for code, label in enumerate(dict.fromkeys(emotions))}
y_emotions = [emotion_to_index[label] for label in emotions]

# Transforming sentiments to integer values, same scheme as for emotions
sentiment_to_index = {label: code for code, label in enumerate(dict.fromkeys(sentiments))}
y_sentiments = [sentiment_to_index[label] for label in sentiments]

print('Size of vocabulary: ', len(vectorizer.vocabulary_), 'tokens')

Size of vocabulary:  30449 tokens


## 2.2 - Splitting dataset

In [31]:
# Split the features and BOTH label sets in a single call so that every
# train/test row is guaranteed to line up across X, emotions and sentiments.
# (Two separate calls only stayed aligned because they happened to share the
# same random_state, and the second call silently overwrote X_train/X_test.)
(X_train, X_test,
 y_emotions_train, y_emotions_test,
 y_sentiments_train, y_sentiments_test) = train_test_split(
    X, y_emotions, y_sentiments, test_size=0.2, random_state=0)

## 2.3 - Training and testing classifiers for both classifications, using word frequency as features

### 2.3.1 - Base-MNB

In [34]:
# `fit` returns the estimator it was called on, so fitting one shared object
# twice would leave both *_model names pointing at the same classifier, which
# only remembers the last fit (sentiments). Each task gets its own instance.
base_MNB_emotions_model = MultinomialNB().fit(X_train, y_emotions_train)
y_base_MNB_emotions_pred = base_MNB_emotions_model.predict(X_test)

# `base_MNB` stays bound to the sentiment-fitted estimator, as it was before.
base_MNB = MultinomialNB()
base_MNB_sentiments_model = base_MNB.fit(X_train, y_sentiments_train)
y_base_MNB_sentiments_pred = base_MNB_sentiments_model.predict(X_test)

### 2.3.2 - Base-DT

In [37]:
# `fit` returns the estimator it was called on, so fitting one shared object
# twice would leave both *_model names pointing at the same tree, which only
# remembers the last fit (sentiments). Each task gets its own instance.
# NOTE(review): no random_state is set, so the fitted trees may differ from
# run to run - confirm whether reproducible results are required here.
base_DT_emotions_model = DecisionTreeClassifier().fit(X_train, y_emotions_train)
y_base_DT_emotions_pred = base_DT_emotions_model.predict(X_test)

# `base_DT` stays bound to the sentiment-fitted estimator, as it was before.
base_DT = DecisionTreeClassifier()
base_DT_sentiments_model = base_DT.fit(X_train, y_sentiments_train)
y_base_DT_sentiments_pred = base_DT_sentiments_model.predict(X_test)

### 2.3.3 - Base-MLP

In [40]:
# NOTE(review): the section and variable names say "MLP", but `Perceptron` is
# a single-layer linear classifier, not a multi-layer perceptron - confirm
# whether sklearn.neural_network.MLPClassifier was intended here.
#
# `fit` returns the estimator it was called on, so fitting one shared object
# twice would leave both *_model names pointing at the same classifier, which
# only remembers the last fit (sentiments). Each task gets its own instance.
base_MLP_emotions_model = Perceptron().fit(X_train, y_emotions_train)
y_base_MLP_emotions_pred = base_MLP_emotions_model.predict(X_test)

# `base_MLP` stays bound to the sentiment-fitted estimator, as it was before.
base_MLP = Perceptron()
base_MLP_sentiments_model = base_MLP.fit(X_train, y_sentiments_train)
y_base_MLP_sentiments_pred = base_MLP_sentiments_model.predict(X_test)

[3 1 0 ... 1 0 3]
