**Imports**

In [96]:
import os
import numpy as np
import matplotlib.pyplot as plt
import gzip
import json
from matplotlib.backends.backend_pdf import PdfPages
from random import randint

1. Dataset Preparation & Analysis

1.2 - Loading the dataset

In [97]:
# goemotions.json.gz must be placed in the PARENT of the current working directory.
path = os.getcwd()
# Build the location with os.path.join so no path separator is hard-coded.
dataset_path = os.path.join(os.path.abspath(os.path.join(path, os.pardir)), 'goemotions.json.gz')

# The context manager guarantees the gzip handle is closed, even if read() raises.
with gzip.open(dataset_path, 'rb') as f:
    file_content = f.read()

# file_content holds the decompressed bytes; json.loads parses them directly.
data_list = json.loads(file_content)

1.3.1 - Extracting the posts and 2 sets of labels (emotion and sentiment)

In [98]:
# Split every dataset entry into three parallel lists, one per position:
# element 0 -> posts, element 1 -> emotions, element 2 -> sentiments.
posts = [entry[0] for entry in data_list]
emotions = [entry[1] for entry in data_list]
sentiments = [entry[2] for entry in data_list]

1.3.2 - Plotting the distribution of the posts in each category and saving the graphics

In [99]:
# Parallel accumulators: *_labels[i] is a distinct label and *_freq[i] is the
# number of times it was seen. Each name gets its own fresh, empty list.
emotion_freq, emotion_labels = [], []
sentiment_freq, sentiment_labels = [], []

# Get n different colors
def getColors(n):
    """Return a list of ``n`` distinct random colors.

    Each color is a string of the form ``'#RRGGBB'`` (six uppercase hex
    digits) drawn uniformly with ``randint``. A draw that duplicates a color
    already in the list is discarded and redrawn, so all entries are unique.

    Parameters
    ----------
    n : int
        Number of colors to generate (passed to ``range``).

    Returns
    -------
    list of str
        ``n`` pairwise-different hex color strings, in the order drawn.
    """
    palette = []
    for _ in range(n):
        # Keep drawing until we hit a color that is not in the palette yet.
        while True:
            candidate = '#%06X' % randint(0, 0xFFFFFF)
            if candidate not in palette:
                break
        palette.append(candidate)
    return palette

# Break the emotions list, then the sentiments list, into labels and frequencies.
# Each (source, labels, counts) triple is tallied in place: labels are stored in
# first-seen order and counts[i] is the number of occurrences of labels[i].
for observed, seen_labels, counts in (
    (emotions, emotion_labels, emotion_freq),
    (sentiments, sentiment_labels, sentiment_freq),
):
    for label in observed:
        try:
            position = seen_labels.index(label)
        except ValueError:
            # First time this label appears: register it with a count of one.
            seen_labels.append(label)
            counts.append(1)
        else:
            counts[position] = counts[position] + 1

# Write both pie charts into one PDF ('frequency_charts.pdf'), one chart per page.
# Every chart is drawn on its own explicit Figure/Axes, so the result does not
# depend on pyplot's implicit "current figure" (e.g. a figure left open elsewhere).
with PdfPages('frequency_charts.pdf') as pdf:
    # --- Page 1: emotion frequencies ---
    fig, ax = plt.subplots()
    ax.pie(emotion_freq, labels=emotion_labels,
           startangle=90, colors=getColors(len(emotion_labels)),
           rotatelabels=True, counterclock=False,
           explode=[0.1] * len(emotion_labels), shadow=True)
    # y=1.25 places the title above the axes (presumably to clear the rotated labels).
    ax.set_title('Emotion Frequencies', y=1.25)
    # Because there are many labels with small frequencies, percentages are
    # computed manually and shown in the legend instead of on the chart.
    emotion_total = sum(emotion_freq)  # computed once, not once per label
    emotion_distribution = [(count / emotion_total) * 100 for count in emotion_freq]
    ax.legend(title="Emotions:",
              labels=[f'{l}: {s:0.1f}%' for l, s in zip(emotion_labels, emotion_distribution)],
              bbox_to_anchor=(1.2, 0.5), loc="center right",
              bbox_transform=fig.transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close(fig)  # release the figure once its page has been written

    # --- Page 2: sentiment frequencies ---
    fig, ax = plt.subplots()
    # autopct prints each wedge's percentage directly on the chart.
    ax.pie(sentiment_freq, labels=sentiment_labels,
           startangle=90, colors=getColors(len(sentiment_labels)),
           counterclock=False, autopct='%1.1f%%',
           explode=[0.1] * len(sentiment_labels), shadow=True)
    ax.set_title('Sentiment Frequencies')
    ax.legend(title="Sentiments:", labels=sentiment_labels,
              bbox_to_anchor=(1, 0.5), loc="center right",
              bbox_transform=fig.transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close(fig)

2. Words as Features

2.1 - Processing the dataset