# Imports

In [125]:
import os
import csv
import gzip
import json
import pickle
import numpy as np
import matplotlib.pyplot as plt
import gensim.downloader as api
import nltk
nltk.download('punkt')

from random import randint
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from matplotlib.backends.backend_pdf import PdfPages
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/juansalas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Dataset Preparation & Analysis

## 1.2 - Loading the dataset

In [126]:
### goemotions.json.gz must be placed in the parent of current directory

path = os.getcwd()
# f = gzip.open(os.path.abspath(os.path.join(path, os.pardir)) + '/goemotions.json.gz', 'rb')
# file_content = f.read()

# data_list = json.loads(file_content)

# Uncomment code below for sample load
f = open(os.path.abspath(os.path.join(path, os.pardir)) + '/goemotions_sample.json')
data_list = json.load(f)

## 1.3.1 - Extracting the posts and 2 sets of labels (emotion and sentiment)

In [127]:
posts = []
emotions = []
sentiments = []

for entry in data_list:
    posts.append(entry[0])
    emotions.append(entry[1])
    sentiments.append(entry[2])

## 1.3.2 - Plotting the distribution of the posts in each category and saving the graphics

In [128]:
emotion_counts = []
emotion_labels = []
sentiment_counts = []
sentiment_labels = []

# Get n different colors
def getColors(n):
    colors = []
    for i in range(n):
        color = '#%06X' % randint(0, 0xFFFFFF)
        while color in colors:
            color = '#%06X' % randint(0, 0xFFFFFF)
        colors.append(color)
    return colors

# Breakdown emotions list into counts and labels
for emotion in emotions:
    if emotion in emotion_labels:
        index = emotion_labels.index(emotion)
        emotion_counts[index] = emotion_counts[index] + 1
    else:
        emotion_labels.append(emotion)
        emotion_counts.append(1)

# Breakdown sentiments list into counts and labels
for sentiment in sentiments:
    if sentiment in sentiment_labels:
        index = sentiment_labels.index(sentiment)
        sentiment_counts[index] = sentiment_counts[index] + 1
    else:
        sentiment_labels.append(sentiment)
        sentiment_counts.append(1)

# Calculating frequencies for each classification
emotion_freq = [round((count/sum(emotion_counts)), 3) for count in emotion_counts]
sentiment_freq = [round((count/sum(sentiment_counts)), 3) for count in sentiment_counts]

with PdfPages('frequency_charts.pdf') as pdf:
    # Plotting emotion frequencies
    plt.pie(emotion_counts, labels=emotion_labels, 
            startangle=90, colors=getColors(len(emotion_labels)),
            rotatelabels=True, counterclock=False, 
            explode=[0.1 for emotion in emotion_labels], shadow=True)
    plt.title('Emotion Frequencies', y=1.25)
    # Displaying distribution in legend because hard to see on the chart    
    plt.legend(title='Emotions:', labels=[f'{l}: {s:0.1f}%' for l, s in zip(emotion_labels, [freq * 100 for freq in emotion_freq])],
            bbox_to_anchor=(1.2,0.5), loc='center right', 
            bbox_transform=plt.gcf().transFigure)
    pdf.savefig(bbox_inches='tight')
    plt.close()

    # Plotting sentiment frequencies
    plt.pie(sentiment_counts, labels=sentiment_labels, 
            startangle=90, colors=getColors(len(sentiment_labels)),
            counterclock=False, autopct='%1.1f%%',
            explode=[0.1 for sentiment in sentiment_labels], shadow=True)
    plt.title('Sentiment Frequencies')
    plt.legend(title='Sentiments:', labels=sentiment_labels, bbox_to_anchor=(1,0.5), loc='center right', bbox_transform=plt.gcf().transFigure)
    pdf.savefig(bbox_inches='tight')
    plt.close()

# 2. Words as Features

## 2.1 - Processing the dataset: Extracting tokens/words and their frequencies

In [129]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(posts)

# Vectorizing emotions and sentiments
emotion_to_index = {}
sentiment_to_index = {}
index = 0

for emotion in emotions:
    if emotion in emotion_to_index:
        # already seen
        continue
    emotion_to_index[emotion] = index
    index += 1
y_emotions = [emotion_to_index[emotion] for emotion in emotions]

index = 0
for sentiment in sentiments:
    if sentiment in sentiment_to_index:
        continue
    sentiment_to_index[sentiment] = index
    index += 1
y_sentiments = [sentiment_to_index[sentiment] for sentiment in sentiments]

print('Size of vocabulary: ', len(vectorizer.vocabulary_), 'tokens')

Size of vocabulary:  701 tokens


## 2.2 - Splitting dataset

In [130]:
X_train, X_test, y_emotions_train, y_emotions_test = train_test_split(X, y_emotions, test_size=0.2, random_state=0)
X_train, X_test, y_sentiments_train, y_sentiments_test = train_test_split(X, y_sentiments, test_size=0.2, random_state=0)

## 2.3 - Training and testing classifiers for both classifications, using word frequency as features

### 2.3.1 - Base-MNB

Training the base MNB model and saving to pickle files. *Run following code if trained model pickle files do not already exist*

In [131]:
base_MNB = MultinomialNB()

base_MNB_emotions_model = base_MNB.fit(X_train, y_emotions_train)
with open('base_MNB_emotions_model.pkl', 'wb') as file:
    pickle.dump(base_MNB_emotions_model, file)

base_MNB_sentiments_model = base_MNB.fit(X_train, y_sentiments_train)
with open('base_MNB_sentiments_model.pkl', 'wb') as file:
    pickle.dump(base_MNB_sentiments_model, file)

Predicting the trained base MNB models for emotions and sentiments. *Run following code if trained model pickle files exist*

In [132]:
with open('base_MNB_emotions_model.pkl', 'rb') as file:
    base_MNB_emotions_model = pickle.load(file)
    y_base_MNB_emotions_pred = base_MNB_emotions_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('base_MNB_emotions_performance.txt', 'w')
    f.write('Base MNB Performance for emotions\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(base_MNB_emotions_model.get_params()))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_emotions_test, y_base_MNB_emotions_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_emotions_test, y_base_MNB_emotions_pred)))
    f.close()

with open('base_MNB_sentiments_model.pkl', 'rb') as file:
    base_MNB_sentiments_model = pickle.load(file)
    y_base_MNB_sentiments_pred = base_MNB_sentiments_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('base_MNB_sentiments_performance.txt', 'w')
    f.write('Base MNB Performance for sentiments\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(base_MNB_sentiments_model.get_params()))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_sentiments_test, y_base_MNB_sentiments_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_sentiments_test, y_base_MNB_sentiments_pred)))
    f.close()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.3.2 - Base-DT

Training the base DT model and saving to pickle files. *Run following code if trained model pickle files do not already exist*

In [133]:
base_DT = DecisionTreeClassifier()

base_DT_emotions_model = base_DT.fit(X_train, y_emotions_train)
with open('base_DT_emotions_model.pkl', 'wb') as file:
    pickle.dump(base_DT_emotions_model, file)

base_DT_sentiments_model = base_DT.fit(X_train, y_sentiments_train)
with open('base_DT_sentiments_model.pkl', 'wb') as file:
    pickle.dump(base_DT_sentiments_model, file)

Predicting the trained base DT models for emotions and sentiments. *Run following code if trained model pickle files exist*

In [134]:
with open('base_DT_emotions_model.pkl', 'rb') as file:
    base_DT_emotions_model = pickle.load(file)
    y_base_DT_emotions_pred = base_DT_emotions_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('base_DT_emotions_performance.txt', 'w')
    f.write('Base DT Performance for emotions\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(base_DT_emotions_model.get_params()))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_emotions_test, y_base_DT_emotions_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_emotions_test, y_base_DT_emotions_pred)))
    f.close()

with open('base_DT_sentiments_model.pkl', 'rb') as file:
    base_DT_sentiments_model = pickle.load(file)
    y_base_DT_sentiments_pred = base_DT_sentiments_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('base_DT_sentiments_performance.txt', 'w')
    f.write('Base DT Performance for sentiments\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(base_DT_sentiments_model.get_params()))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_sentiments_test, y_base_DT_sentiments_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_sentiments_test, y_base_DT_sentiments_pred)))
    f.close()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.3.3 - Base-MLP

Training the base MLP model and saving to pickle files. *Run following code if trained model pickle files do not already exist*

In [135]:
base_MLP = MLPClassifier(max_iter=2000)

base_MLP_emotions_model = base_MLP.fit(X_train, y_emotions_train)
with open('base_MLP_emotions_model.pkl', 'wb') as file:
    pickle.dump(base_MLP_emotions_model, file)

base_MLP_sentiments_model = base_MLP.fit(X_train, y_sentiments_train)
with open('base_MLP_sentiments_model.pkl', 'wb') as file:
    pickle.dump(base_MLP_sentiments_model, file)

Predicting the trained base MLP models for emotions and sentiments. *Run following code if trained model pickle files exist*

In [136]:
with open('base_MLP_emotions_model.pkl', 'rb') as file:
    base_MLP_emotions_model = pickle.load(file)
    y_base_MLP_emotions_pred = base_MLP_emotions_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('base_MLP_emotions_performance.txt', 'w')
    f.write('Base MLP Performance for emotions\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(base_MLP_emotions_model.get_params()))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_emotions_test, y_base_MLP_emotions_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_emotions_test, y_base_MLP_emotions_pred)))
    f.close()

with open('base_MLP_sentiments_model.pkl', 'rb') as file:
    base_MLP_sentiments_model = pickle.load(file)
    y_base_MLP_sentiments_pred = base_MLP_sentiments_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('base_MLP_sentiments_performance.txt', 'w')
    f.write('Base MLP Performance for sentiments\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(base_MLP_sentiments_model.get_params()))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_sentiments_test, y_base_MLP_sentiments_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_sentiments_test, y_base_MLP_sentiments_pred)))
    f.close()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.3.4 - Top-MNB

Training the top MNB model and saving to pickle files. *Run following code if trained model pickle files do not already exist*

In [137]:
top_MNB_hyper_params = {
    # Because an alpha too small will result in numeric errors, 0 is set as 1.0e-10
    'alpha': [1.0e-10, 0.5, 1.5, 3.0]
}

# FOR SAMPLE DATASET: Removes the frequencies of the emotions not found in the training set
sample_emotions_freq = []
for idx, freq in enumerate(emotion_freq):
    if list(set(y_emotions_train)).count(idx) > 0:
        sample_emotions_freq.append(freq)

top_MNB_emotions = MultinomialNB(class_prior=sample_emotions_freq)
top_MNB_sentiments = MultinomialNB(class_prior=sentiment_freq)

top_MNB_emotions_grid_search = GridSearchCV(estimator=top_MNB_emotions, param_grid=top_MNB_hyper_params)
top_MNB_sentiments_grid_search = GridSearchCV(estimator=top_MNB_sentiments, param_grid=top_MNB_hyper_params)

top_MNB_emotions_model = top_MNB_emotions_grid_search.fit(X_train, y_emotions_train)
with open('top_MNB_emotions_model.pkl', 'wb') as file:
    pickle.dump(top_MNB_emotions_model, file)

top_MNB_sentiments_model = top_MNB_sentiments_grid_search.fit(X_train, y_sentiments_train)
with open('top_MNB_sentiments_model.pkl', 'wb') as file:
    pickle.dump(top_MNB_sentiments_model, file)

12 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/naive_bayes.py", line 729, in fit
    self._update_class_log_prior(class_prior=class_prior)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/naive_bayes.py", line 565, in _update_class_log_prior
    raise ValueError("Number of priors must match number of classes.")
ValueError: Number of priors must match number of classes.



Predicting the trained top MNB models for emotions and sentiments. *Run following code if trained model pickle files exist*

In [138]:
with open('top_MNB_emotions_model.pkl', 'rb') as file:
    top_MNB_emotions_model = pickle.load(file)
    y_top_MNB_emotions_pred = top_MNB_emotions_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('top_MNB_emotions_performance.txt', 'w')
    f.write('Base MLP Performance for emotions\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(top_MNB_emotions_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(top_MNB_emotions_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_emotions_test, y_top_MNB_emotions_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_emotions_test, y_top_MNB_emotions_pred)))
    f.close()

with open('top_MNB_sentiments_model.pkl', 'rb') as file:
    top_MNB_sentiments_model = pickle.load(file)
    y_top_MNB_sentiments_pred = top_MNB_sentiments_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('top_MNB_sentiments_performance.txt', 'w')
    f.write('Base MLP Performance for sentiments\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(top_MNB_sentiments_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(top_MNB_sentiments_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_sentiments_test, y_top_MNB_sentiments_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_sentiments_test, y_top_MNB_sentiments_pred)))
    f.close()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.3.5 - Top-DT

Training the top DT model and saving to pickle files. *Run following code if trained model pickle files do not already exist*

In [139]:
top_DT_hyper_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10],
    'min_samples_split': [2, 3, 4]
}

top_DT = DecisionTreeClassifier()
top_DT_grid_search = GridSearchCV(estimator=top_DT, param_grid=top_DT_hyper_params)

top_DT_emotions_model = top_DT_grid_search.fit(X_train, y_emotions_train)
with open('top_DT_emotions_model.pkl', 'wb') as file:
    pickle.dump(top_DT_emotions_model, file)

top_DT_sentiments_model = top_DT_grid_search.fit(X_train, y_sentiments_train)
with open('top_DT_sentiments_model.pkl', 'wb') as file:
    pickle.dump(top_DT_sentiments_model, file)



Predicting the trained top DT models for emotions and sentiments. *Run following code if trained model pickle files exist*

In [140]:
with open('top_DT_emotions_model.pkl', 'rb') as file:
    top_DT_emotions_model = pickle.load(file)
    y_top_DT_emotions_pred = top_DT_emotions_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('top_DT_emotions_performance.txt', 'w')
    f.write('Base MLP Performance for emotions\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(top_DT_emotions_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(top_DT_emotions_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_emotions_test, y_top_DT_emotions_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_emotions_test, y_top_DT_emotions_pred)))
    f.close()

with open('top_DT_sentiments_model.pkl', 'rb') as file:
    top_DT_sentiments_model = pickle.load(file)
    y_top_DT_sentiments_pred = top_DT_sentiments_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('top_DT_sentiments_performance.txt', 'w')
    f.write('Base MLP Performance for sentiments\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(top_DT_sentiments_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(top_DT_sentiments_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_sentiments_test, y_top_DT_sentiments_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_sentiments_test, y_top_DT_sentiments_pred)))
    f.close()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.3.6 - Top-MLP

Training the top MLP model and saving to pickle files. *Run following code if trained model pickle files do not already exist*

In [141]:
top_MLP_hyper_params = {
    'activation': ['logistic', 'tanh', 'relu', 'identity'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd']
}

top_MLP = MLPClassifier(max_iter=100)
top_MLP_grid_search = GridSearchCV(estimator=top_MLP, param_grid=top_MLP_hyper_params)

top_MLP_emotions_model = top_MLP_grid_search.fit(X_train, y_emotions_train)
with open('top_MLP_emotions_model.pkl', 'wb') as file:
    pickle.dump(top_MLP_emotions_model, file)

top_MLP_sentiments_model = top_MLP_grid_search.fit(X_train, y_sentiments_train)
with open('top_MLP_sentiments_model.pkl', 'wb') as file:
    pickle.dump(top_MLP_sentiments_model, file)



Predicting the trained top MLP models for emotions and sentiments. *Run following code if trained model pickle files exist*

In [142]:
with open('top_MLP_emotions_model.pkl', 'rb') as file:
    top_MLP_emotions_model = pickle.load(file)
    y_top_MLP_emotions_pred = top_MLP_emotions_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('top_MLP_emotions_performance.txt', 'w')
    f.write('Base MLP Performance for emotions\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(top_MLP_emotions_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(top_MLP_emotions_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_emotions_test, y_top_MLP_emotions_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_emotions_test, y_top_MLP_emotions_pred)))
    f.close()

with open('top_MLP_sentiments_model.pkl', 'rb') as file:
    top_MLP_sentiments_model = pickle.load(file)
    y_top_MLP_sentiments_pred = top_MLP_sentiments_model.predict(X_test)

    # 2.4 Classification Performance
    f = open('top_MLP_sentiments_performance.txt', 'w')
    f.write('Base MLP Performance for sentiments\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(top_MLP_sentiments_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(top_MLP_sentiments_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(y_sentiments_test, y_top_MLP_sentiments_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(y_sentiments_test, y_top_MLP_sentiments_pred)))
    f.close()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 2.4 - Classification Performance

In [143]:
# NOTE: We generated individual performance files, rather than one single performance files each of the 6 classifiers for both the emotion and sentiment classifications.

## 2.5 - Different train and test splits

In [144]:
X2_emotions_train, X2_emotions_test, y2_emotions_train, y2_emotions_test = train_test_split(X, y_emotions, test_size=0.2, random_state=0)
X2_sentiments_train, X2_sentiments_test, y2_sentiments_train, y2_sentiments_test = train_test_split(X, y_sentiments, test_size=0.2, random_state=0)

In [145]:
X3_emotions_train, X3_emotions_test, y3_emotions_train, y3_emotions_test = train_test_split(X, y_emotions, test_size=0.2, random_state=1)
X3_sentiments_train, X3_sentiments_test, y3_sentiments_train, y3_sentiments_test = train_test_split(X, y_sentiments, test_size=0.2, random_state=1)

### 2.5.1 - Base MNB

In [146]:
base2_MNB_emotions_model = base_MNB.fit(X2_emotions_train, y2_emotions_train)
y2_base_MNB_emotions_pred = base2_MNB_emotions_model.predict(X2_emotions_test)

base2_MNB_sentiments_model = base_MNB.fit(X2_sentiments_train, y2_sentiments_train)
y2_base_MNB_sentiments_pred = base2_MNB_sentiments_model.predict(X2_sentiments_test)

print("Random State = 0: Base-MNB with the default parameters")

# evaluate classifier
print("\nBase-MNB Emotions Classification Report:\n", classification_report(y2_emotions_test, y2_base_MNB_emotions_pred))
print("\nBase-MNB Sentiments Classification Report:\n", classification_report(y2_sentiments_test, y2_base_MNB_sentiments_pred))

# show confusion Matrix
print("\nBase-MNB Emotions Confusion Matrix:\n", confusion_matrix(y2_emotions_test, y2_base_MNB_emotions_pred))
print("\nBase-MNB Sentiments Confusion Matrix:\n", confusion_matrix(y2_sentiments_test, y2_base_MNB_sentiments_pred))

Random State = 0: Base-MNB with the default parameters

Base-MNB Emotions Classification Report:
               precision    recall  f1-score   support

           2       0.52      0.86      0.65        14
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         4
          13       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         1

    accuracy                           0.43        28
   macro avg       0.04      0.07      0.05        28
weighted avg       0.26      0.43   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [147]:
base3_MNB_emotions_model = base_MNB.fit(X3_emotions_train, y3_emotions_train)
y3_base_MNB_emotions_pred = base3_MNB_emotions_model.predict(X3_emotions_test)

base3_MNB_sentiments_model = base_MNB.fit(X3_sentiments_train, y3_sentiments_train)
y3_base_MNB_sentiments_pred = base3_MNB_sentiments_model.predict(X3_sentiments_test)

print("Random State = 1: Base-MNB with the default parameters")

# evaluate classifier
print("\nBase-MNB Emotions Classification Report:\n", classification_report(y3_emotions_test, y3_base_MNB_emotions_pred))
print("\nBase-MNB Sentiments Classification Report:\n", classification_report(y3_sentiments_test, y3_base_MNB_sentiments_pred))

# show confusion Matrix
print("\nBase-MNB Emotions Confusion Matrix:\n", confusion_matrix(y3_emotions_test, y3_base_MNB_emotions_pred))
print("\nBase-MNB Sentiments Confusion Matrix:\n", confusion_matrix(y3_sentiments_test, y3_base_MNB_sentiments_pred))

Random State = 1: Base-MNB with the default parameters

Base-MNB Emotions Classification Report:
               precision    recall  f1-score   support

           2       0.17      0.80      0.28         5
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2
          20       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         1

    accuracy                           0.14        28
   macro avg       0.01      0.06   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.5.2 - Base DT

In [148]:
base2_DT_emotions_model = base_DT.fit(X2_emotions_train, y2_emotions_train)
y2_base_DT_emotions_pred = base2_DT_emotions_model.predict(X2_emotions_test)

base2_DT_sentiments_model = base_DT.fit(X2_sentiments_train, y2_sentiments_train)
y2_base_DT_sentiments_pred = base2_DT_sentiments_model.predict(X2_sentiments_test)

print("Random State = 0: Base-DT with the default parameters")

# evaluate classifier
print("\nBase-DT Emotions Classification Report:\n", classification_report(y2_emotions_test, y2_base_DT_emotions_pred))
print("\nBase-DT Sentiments Classification Report:\n", classification_report(y2_sentiments_test, y2_base_DT_sentiments_pred))

# show confusion Matrix
print("\nBase-DT Emotions Confusion Matrix:\n", confusion_matrix(y2_emotions_test, y2_base_DT_emotions_pred))
print("\nBase-DT Sentiments Confusion Matrix:\n", confusion_matrix(y2_sentiments_test, y2_base_DT_sentiments_pred))

Random State = 0: Base-DT with the default parameters

Base-DT Emotions Classification Report:
               precision    recall  f1-score   support

           2       0.50      0.57      0.53        14
           4       0.20      0.50      0.29         2
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         4
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         0
          21       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         0
          24       0.00      0.00      0.00         1

    accuracy                          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [149]:
base3_DT_emotions_model = base_DT.fit(X3_emotions_train, y3_emotions_train)
y3_base_DT_emotions_pred = base3_DT_emotions_model.predict(X3_emotions_test)

base3_DT_sentiments_model = base_DT.fit(X2_sentiments_train, y3_sentiments_train)
y3_base_DT_sentiments_pred = base3_DT_sentiments_model.predict(X3_sentiments_test)

print("Random State = 1: Base-DT with the default parameters")

# evaluate classifier
print("\nBase-DT Emotions Classification Report:\n", classification_report(y3_emotions_test, y3_base_DT_emotions_pred))
print("\nBase-DT Sentiments Classification Report:\n", classification_report(y3_sentiments_test, y3_base_DT_sentiments_pred))

# show confusion Matrix
print("\nBase-DT Emotions Confusion Matrix:\n", confusion_matrix(y3_emotions_test, y3_base_DT_emotions_pred))
print("\nBase-DT Sentiments Confusion Matrix:\n", confusion_matrix(y3_sentiments_test, y3_base_DT_sentiments_pred))

Random State = 1: Base-DT with the default parameters

Base-DT Emotions Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       0.27      0.80      0.40         5
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           5       1.00      0.50      0.67         2
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2
          17       0.00      0.00      0.00         0
          19       0.00      0.00      0.00         0
          20       0.00      0.00      0.00         1
          21       0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.5.3 - Base MLP

In [150]:
base2_MLP_emotions_model = base_MLP.fit(X2_emotions_train, y2_emotions_train)
y2_base_MLP_emotions_pred = base2_MLP_emotions_model.predict(X2_emotions_test)

base2_MLP_sentiments_model = base_MLP.fit(X2_sentiments_train, y2_sentiments_train)
y2_base_MLP_sentiments_pred = base2_MLP_sentiments_model.predict(X2_sentiments_test)

print("Random State = 0: Base-MLP with the default parameters")

# evaluate classifier
print("\nBase-MLP Emotions Classification Report:\n", classification_report(y2_emotions_test, y2_base_MLP_emotions_pred))
print("\nBase-MLP Sentiments Classification Report:\n", classification_report(y2_sentiments_test, y2_base_MLP_sentiments_pred))

# show confusion Matrix
print("\nBase-MLP Emotions Confusion Matrix:\n", confusion_matrix(y2_emotions_test, y2_base_MLP_emotions_pred))
print("\nBase-MLP Sentiments Confusion Matrix:\n", confusion_matrix(y2_sentiments_test, y2_base_MLP_sentiments_pred))

Random State = 0: Base-MLP with the default parameters

Base-MLP Emotions Classification Report:
               precision    recall  f1-score   support

           2       0.60      0.86      0.71        14
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         4
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         0
          20       0.00      0.00      0.00         0
          21       0.00      0.00      0.00         1
          24       0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [151]:
base3_MLP_emotions_model = base_MLP.fit(X3_emotions_train, y3_emotions_train)
y3_base_MLP_emotions_pred = base3_MLP_emotions_model.predict(X3_emotions_test)

base3_MLP_sentiments_model = base_MLP.fit(X3_sentiments_train, y3_sentiments_train)
y3_base_MLP_sentiments_pred = base3_MLP_sentiments_model.predict(X3_sentiments_test)

print("Random State = 1: Base-MLP with the default parameters")

# evaluate classifier
print("\nBase-MLP Emotions Classification Report:\n", classification_report(y3_emotions_test, y3_base_MLP_emotions_pred))
print("\nBase-MLP Sentiments Classification Report:\n", classification_report(y3_sentiments_test, y3_base_MLP_sentiments_pred))

# show confusion Matrix
print("\nBase-MLP Emotions Confusion Matrix:\n", confusion_matrix(y3_emotions_test, y3_base_MLP_emotions_pred))
print("\nBase-MLP Sentiments Confusion Matrix:\n", confusion_matrix(y3_sentiments_test, y3_base_MLP_sentiments_pred))

Random State = 1: Base-MLP with the default parameters

Base-MLP Emotions Classification Report:
               precision    recall  f1-score   support

           2       0.14      0.60      0.23         5
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           5       1.00      0.50      0.67         2
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         0
          20       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         1
          23       0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.5.4 - Top MNB

In [152]:
top2_MNB_hyper_params = {
    # Because an alpha too small will result in numeric errors, 0 is set as 1.0e-10
    'alpha': [1.0e-10, 0.5, 1.5, 3.0]
}

print("Random State = 0: Top-MNB with the default parameters")

# FOR SAMPLE DATASET: Removes the frequencies of the emotions not found in the training set
sample_emotions_freq = []
for idx, freq in enumerate(emotion_freq):
    if list(set(y2_emotions_train)).count(idx) > 0:
        sample_emotions_freq.append(freq)

top2_MNB_emotions = MultinomialNB(class_prior=sample_emotions_freq)
top2_MNB_sentiments = MultinomialNB(class_prior=sentiment_freq)

top2_MNB_emotions_grid_search = GridSearchCV(estimator=top2_MNB_emotions, param_grid=top2_MNB_hyper_params)
top2_MNB_sentiments_grid_search = GridSearchCV(estimator=top2_MNB_sentiments, param_grid=top2_MNB_hyper_params)

top2_MNB_emotions_model = top2_MNB_emotions_grid_search.fit(X2_emotions_train, y2_emotions_train)
y2_top_MNB_emotions_pred = top2_MNB_emotions_model.predict(X2_emotions_test)
print('best params for emotions MNB: ', top_MNB_emotions_model.best_params_)

top2_MNB_sentiments_model = top2_MNB_sentiments_grid_search.fit(X2_sentiments_train, y2_sentiments_train)
y2_top_MNB_sentiments_pred = top2_MNB_sentiments_model.predict(X2_sentiments_test)
print('best params for sentiments MNB: ', top_MNB_sentiments_model.best_params_)

Random State = 0: Top-MNB with the default parameters
best params for emotions MNB:  {'alpha': 1e-10}
best params for sentiments MNB:  {'alpha': 3.0}


12 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/naive_bayes.py", line 729, in fit
    self._update_class_log_prior(class_prior=class_prior)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/naive_bayes.py", line 565, in _update_class_log_prior
    raise ValueError("Number of priors must match number of classes.")
ValueError: Number of priors must match number of classes.



In [153]:
top3_MNB_hyper_params = {
    # Because an alpha too small will result in numeric errors, 0 is set as 1.0e-10
    'alpha': [1.0e-10, 0.5, 1.5, 3.0]
}

print("Random State = 1: Top-MNB with the default parameters")

# FOR SAMPLE DATASET: Removes the frequencies of the emotions not found in the training set
sample_emotions_freq = []
for idx, freq in enumerate(emotion_freq):
    if list(set(y3_emotions_train)).count(idx) > 0:
        sample_emotions_freq.append(freq)

top3_MNB_emotions = MultinomialNB(class_prior=sample_emotions_freq)
top3_MNB_sentiments = MultinomialNB(class_prior=sentiment_freq)

top3_MNB_emotions_grid_search = GridSearchCV(estimator=top3_MNB_emotions, param_grid=top3_MNB_hyper_params)
top3_MNB_sentiments_grid_search = GridSearchCV(estimator=top3_MNB_sentiments, param_grid=top3_MNB_hyper_params)

top3_MNB_emotions_model = top3_MNB_emotions_grid_search.fit(X3_emotions_train, y3_emotions_train)
y3_top_MNB_emotions_pred = top3_MNB_emotions_model.predict(X3_emotions_test)
print('best params for emotions MNB: ', top_MNB_emotions_model.best_params_)

top3_MNB_sentiments_model = top3_MNB_sentiments_grid_search.fit(X3_sentiments_train, y3_sentiments_train)
y3_top_MNB_sentiments_pred = top3_MNB_sentiments_model.predict(X3_sentiments_test)
print('best params for sentiments MNB: ', top_MNB_sentiments_model.best_params_)

Random State = 1: Top-MNB with the default parameters
best params for emotions MNB:  {'alpha': 1e-10}
best params for sentiments MNB:  {'alpha': 3.0}


16 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/naive_bayes.py", line 729, in fit
    self._update_class_log_prior(class_prior=class_prior)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/naive_bayes.py", line 565, in _update_class_log_prior
    raise ValueError("Number of priors must match number of classes.")
ValueError: Number of priors must match number of classes.



### 2.5.5 - Top DT

In [154]:
top2_DT_hyper_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10],
    'min_samples_split': [2, 3, 4]
}

print("Random State = 0: Top-DT with the default parameters")

top2_DT = DecisionTreeClassifier()
top2_DT_grid_search = GridSearchCV(estimator=top_DT, param_grid=top2_DT_hyper_params)

top2_DT_emotions_model = top2_DT_grid_search.fit(X2_emotions_train, y2_emotions_train)
y2_top_DT_emotions_pred = top2_DT_emotions_model.predict(X2_emotions_test)

top2_DT_sentiments_model = top2_DT_grid_search.fit(X2_sentiments_train, y2_sentiments_train)
y2_top_DT_sentiments_pred = top2_DT_sentiments_model.predict(X2_sentiments_test)

Random State = 0: Top-DT with the default parameters




In [155]:
top3_DT_hyper_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10],
    'min_samples_split': [2, 3, 4]
}

print("Random State = 1: Top-DT with the default parameters")

top3_DT = DecisionTreeClassifier()
top3_DT_grid_search = GridSearchCV(estimator=top_DT, param_grid=top3_DT_hyper_params)

top3_DT_emotions_model = top3_DT_grid_search.fit(X3_emotions_train, y3_emotions_train)
y3_top_DT_emotions_pred = top3_DT_emotions_model.predict(X3_emotions_test)

top3_DT_sentiments_model = top3_DT_grid_search.fit(X3_sentiments_train, y3_sentiments_train)
y3_top_DT_sentiments_pred = top3_DT_sentiments_model.predict(X3_sentiments_test)

Random State = 1: Top-DT with the default parameters




### 2.5.6 - Top MLP

In [156]:
top2_MLP_hyper_params = {
    'activation': ['logistic', 'tanh', 'relu', 'identity'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd']
}

print("Random State = 0: Top-MLP with the default parameters")

top2_MLP = MLPClassifier(max_iter=1) ### talk about low epochs in analysis
top2_MLP_grid_search = GridSearchCV(estimator=top_MLP, param_grid=top2_MLP_hyper_params)

top2_MLP_emotions_model = top2_MLP_grid_search.fit(X2_emotions_train, y2_emotions_train)
y2_top_MLP_emotions_pred = top2_MLP_emotions_model.predict(X2_emotions_test)

top2_MLP_sentiments_model = top2_MLP_grid_search.fit(X2_sentiments_train, y2_sentiments_train)
y2_top_MLP_sentiments_pred = top2_MLP_sentiments_model.predict(X2_sentiments_test)

Random State = 0: Top-MLP with the default parameters




In [157]:
top3_MLP_hyper_params = {
    'activation': ['logistic', 'tanh', 'relu', 'identity'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd']
}

print("Random State = 1: Top-MLP with the default parameters")

top3_MLP = MLPClassifier(max_iter=1) ### talk about low epochs in analysis
top3_MLP_grid_search = GridSearchCV(estimator=top_MLP, param_grid=top3_MLP_hyper_params)

top3_MLP_emotions_model = top3_MLP_grid_search.fit(X3_emotions_train, y3_emotions_train)
y3_top_MLP_emotions_pred = top3_MLP_emotions_model.predict(X3_emotions_test)

top3_MLP_sentiments_model = top3_MLP_grid_search.fit(X3_sentiments_train, y3_sentiments_train)
y3_top_MLP_sentiments_pred = top3_MLP_sentiments_model.predict(X3_sentiments_test)



Random State = 1: Top-MLP with the default parameters




# 3. Embeddings as Features

## 3.1 - Load embedding model

In [158]:
GoogleNews = api.load("word2vec-google-news-300")

## 3.2 - Tokenizer

In [159]:
posts_train, posts_test, post_emotions_train, post_emotions_test = train_test_split(posts, y_emotions, test_size=0.2, random_state=0)
posts_train, posts_test, post_sentiments_train, post_sentiments_test = train_test_split(posts, y_sentiments, test_size=0.2, random_state=0)

tokens = []
for i in posts_train:
    post_tokens = word_tokenize(i)
    for token in post_tokens:
        tokens.append(token)

print('Size of training set vocabulary: ', len(tokens), 'tokens')

Size of training set vocabulary:  1687 tokens


## 3.3 - Average Embedding

In [160]:
post_embeddings = []

for post in posts:
    post_tokens = word_tokenize(post)
    word_embeddings = []

    for token in post_tokens:
        try:
            word_embedding = GoogleNews[token]
            word_embeddings.append(word_embedding)
        except KeyError: # If token is not present in Word2Vec model
            continue

    post_embedding = np.nanmean(word_embeddings)
    post_embeddings.append(post_embedding)

## 3.4 - Hit Rates

In [161]:
token_counter = 0
embedding_counter = 0

for post in posts_train:
    post_tokens = word_tokenize(post)
    word_embeddings = []
    
    for token in post_tokens:
        token_counter += 1
        try:
            word_embedding = GoogleNews[token]
            word_embeddings.append(word_embedding)
            embedding_counter += 1
        except KeyError: # If token is not present in Word2Vec model
            continue

train_hit_rate = 100 * (embedding_counter/token_counter)
print(train_hit_rate)

77.05986959098993


In [162]:
token_counter = 0
embedding_counter = 0

for post in posts_test:
    post_tokens = word_tokenize(post)
    word_embeddings = []
    
    for token in post_tokens:
        token_counter += 1
        try:
            word_embedding = GoogleNews[token]
            word_embeddings.append(word_embedding)
            embedding_counter += 1
        except KeyError: # If token is not present in Word2Vec model
            continue

test_hit_rate = 100 * (embedding_counter/token_counter)
print(test_hit_rate)

77.3067331670823


### Train_test_split of vector embeddings of words

In [163]:
post_embeddings = np.array(post_embeddings).reshape(-1, 1)

embeddings_train, embeddings_test, embedding_emotions_train, embedding_emotions_test = train_test_split(post_embeddings, y_emotions, test_size=0.2, random_state=0)
embeddings_train, embeddings_test, embedding_sentiments_train, embedding_sentiments_test = train_test_split(post_embeddings, y_sentiments, test_size=0.2, random_state=0)

## 3.5 - Base MLP

In [164]:
embedding_base_MLP = MLPClassifier(max_iter=1)

embedding_base_MLP_emotions_model = embedding_base_MLP.fit(embeddings_train, embedding_emotions_train)
with open('embedding_base_MLP_emotions_model.pkl', 'wb') as file:
    pickle.dump(embedding_base_MLP_emotions_model, file)

embedding_base_MLP_sentiments_model = embedding_base_MLP.fit(embeddings_train, embedding_sentiments_train)
with open('embedding_base_MLP_sentiments_model.pkl', 'wb') as file:
    pickle.dump(embedding_base_MLP_sentiments_model, file)



In [165]:
with open('embedding_base_MLP_emotions_model.pkl', 'rb') as file:
    embedding_base_MLP_emotions_model = pickle.load(file)
    embedding_base_MLP_emotions_pred = embedding_base_MLP_emotions_model.predict(embeddings_test)

    # 3.6 Classification Performance
    f = open('embedding_base_MLP_emotions_performance.txt', 'w')
    f.write('Base MLP Performance for emotions\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(embedding_base_MLP_emotions_model.get_params()))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(embedding_emotions_test, embedding_base_MLP_emotions_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(embedding_emotions_test, embedding_base_MLP_emotions_pred)))
    f.close()

with open('embedding_base_MLP_sentiments_model.pkl', 'rb') as file:
    embedding_base_MLP_sentiments_model = pickle.load(file)
    embedding_base_MLP_sentiments_pred = embedding_base_MLP_sentiments_model.predict(embeddings_test)

    # 3.6 Classification Performance
    f = open('embedding_base_MLP_sentiments_performance.txt', 'w')
    f.write('Base MLP Performance for sentiments\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(embedding_base_MLP_sentiments_model.get_params()))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(embedding_sentiments_test, embedding_base_MLP_sentiments_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(embedding_sentiments_test, embedding_base_MLP_sentiments_pred)))
    f.close()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 3.6 - Top MLP

In [166]:
embedding_top_MLP_hyper_params = {
    'activation': ['logistic', 'tanh', 'relu', 'identity'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd']
}

embedding_top_MLP = MLPClassifier(max_iter=1)
embedding_top_MLP_grid_search = GridSearchCV(estimator=top_MLP, param_grid=embedding_top_MLP_hyper_params)

embedding_top_MLP_emotions_model = embedding_top_MLP_grid_search.fit(embeddings_train, embedding_emotions_train)
with open('embedding_top_MLP_emotions_model.pkl', 'wb') as file:
    pickle.dump(embedding_top_MLP_emotions_model, file)

embedding_top_MLP_sentiments_model = embedding_top_MLP_grid_search.fit(embeddings_train, embedding_sentiments_train)
with open('embedding_top_MLP_sentiments_model.pkl', 'wb') as file:
    pickle.dump(embedding_top_MLP_sentiments_model, file)



In [167]:
with open('embedding_top_MLP_emotions_model.pkl', 'rb') as file:
    embedding_top_MLP_emotions_model = pickle.load(file)
    embedding_top_MLP_emotions_pred = embedding_top_MLP_emotions_model.predict(embeddings_test)

    # 3.6 Classification Performance
    f = open('embedding_top_MLP_emotions_performance.txt', 'w')
    f.write('Base MLP Performance for emotions\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(embedding_top_MLP_emotions_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(embedding_top_MLP_emotions_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(embedding_emotions_test, embedding_top_MLP_emotions_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(embedding_emotions_test, embedding_top_MLP_emotions_pred)))
    f.close()

with open('embedding_top_MLP_sentiments_model.pkl', 'rb') as file:
    embedding_top_MLP_sentiments_model = pickle.load(file)
    embedding_top_MLP_sentiments_pred = embedding_top_MLP_sentiments_model.predict(embeddings_test)

    # 3.6 Classification Performance
    f = open('embedding_top_MLP_sentiments_performance.txt', 'w')
    f.write('Base MLP Performance for sentiments\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(embedding_top_MLP_sentiments_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(embedding_top_MLP_sentiments_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(embedding_sentiments_test, embedding_top_MLP_sentiments_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(embedding_sentiments_test, embedding_top_MLP_sentiments_pred)))
    f.close()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 3.7 - Classification Performance

In [168]:
# See output above for 3.7

##  3.8 - Exploring other pretrained embedding models

### 3.8.1 - Loading embedding models

In [169]:
wiki = api.load("fasttext-wiki-news-subwords-300")
ruscorpora = api.load("word2vec-ruscorpora-300")

### 3.8.2 - Post embeddings using models

In [170]:
wiki_embeddings = []

for post in posts:
    post_tokens = word_tokenize(post)
    word_embeddings = []

    for token in post_tokens:
        try:
            word_embedding = wiki[token]
            word_embeddings.append(word_embedding)
        except KeyError: # If token is not present in Word2Vec model
            continue

    wiki_embedding = np.nanmean(word_embeddings)
    wiki_embeddings.append(wiki_embedding)

In [171]:
import math

ruscorpora_embeddings = []

for post in posts:
    post_tokens = word_tokenize(post)
    word_embeddings = []

    for token in post_tokens:
        try:
            word_embedding = ruscorpora[token]
            print(ruscorpora[token])
        except KeyError: # If token is not present in Word2Vec model
            continue

    ruscorpora_embedding = np.nanmean(word_embeddings)
    if not math.isnan(ruscorpora_embedding): ruscorpora_embeddings.append(ruscorpora_embedding)
    else: ruscorpora_embeddings.append(0)

  ruscorpora_embedding = np.nanmean(word_embeddings)


### 3.8.3 - Train-Test split of model embeddings

In [172]:
wiki_embeddings = np.array(wiki_embeddings).reshape(-1, 1)

wikis_train, wikis_test, wiki_emotions_train, wiki_emotions_test = train_test_split(wiki_embeddings, y_emotions, test_size=0.2, random_state=0)
wikis_train, wikis_test, wiki_sentiments_train, wiki_sentiments_test = train_test_split(wiki_embeddings, y_sentiments, test_size=0.2, random_state=0)

In [173]:
ruscorpora_embeddings = np.array(ruscorpora_embeddings).reshape(-1, 1)

rucorporas_train, ruscorporas_test, ruscorpora_emotions_train, ruscorpora_emotions_test = train_test_split(ruscorpora_embeddings, y_emotions, test_size=0.2, random_state=0)
ruscorporas_train, ruscorporas_test, ruscorpora_sentiments_train, ruscorpora_sentiments_test = train_test_split(ruscorpora_embeddings, y_sentiments, test_size=0.2, random_state=0)

### 3.8.4 - Top MLP Training using models

In [174]:
wiki_top_MLP_hyper_params = {
    'activation': ['logistic', 'tanh', 'relu', 'identity'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd']
}

wiki_top_MLP = MLPClassifier(max_iter=1)
wiki_top_MLP_grid_search = GridSearchCV(estimator=wiki_top_MLP, param_grid=wiki_top_MLP_hyper_params)

wiki_top_MLP_emotions_model = wiki_top_MLP_grid_search.fit(wikis_train, wiki_emotions_train)
with open('wiki_top_MLP_emotions_model.pkl', 'wb') as file:
    pickle.dump(wiki_top_MLP_emotions_model, file)

wiki_top_MLP_sentiments_model = wiki_top_MLP_grid_search.fit(wikis_train, wiki_sentiments_train)
with open('wiki_top_MLP_sentiments_model.pkl', 'wb') as file:
    pickle.dump(wiki_top_MLP_sentiments_model, file)



In [175]:
ruscorpora_top_MLP_hyper_params = {
    'activation': ['logistic', 'tanh', 'relu', 'identity'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd']
}

ruscorpora_top_MLP = MLPClassifier(max_iter=1)
ruscorpora_top_MLP_grid_search = GridSearchCV(estimator=ruscorpora_top_MLP, param_grid=ruscorpora_top_MLP_hyper_params)

ruscorpora_top_MLP_emotions_model = ruscorpora_top_MLP_grid_search.fit(ruscorporas_train, ruscorpora_emotions_train)
with open('ruscorpora_top_MLP_emotions_model.pkl', 'wb') as file:
    pickle.dump(ruscorpora_top_MLP_emotions_model, file)

ruscorpora_top_MLP_sentiments_model = ruscorpora_top_MLP_grid_search.fit(ruscorporas_train, ruscorpora_sentiments_train)
with open('ruscorpora_top_MLP_sentiments_model.pkl', 'wb') as file:
    pickle.dump(ruscorpora_top_MLP_sentiments_model, file)



### 3.8.5 - MLP Performance of MLP using pretrained embedding models

In [176]:
with open('wiki_top_MLP_emotions_model.pkl', 'rb') as file:
    wiki_top_MLP_emotions_model = pickle.load(file)
    wiki_top_MLP_emotions_pred = wiki_top_MLP_emotions_model.predict(wikis_test)

    # 3.8 Classification Performance
    f = open('wiki_top_MLP_emotions_performance.txt', 'w')
    f.write('Base MLP Performance for emotions\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(wiki_top_MLP_emotions_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(wiki_top_MLP_emotions_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(wiki_emotions_test, wiki_top_MLP_emotions_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(wiki_emotions_test, wiki_top_MLP_emotions_pred)))
    f.close()

with open('wiki_top_MLP_sentiments_model.pkl', 'rb') as file:
    wiki_top_MLP_sentiments_model = pickle.load(file)
    wiki_top_MLP_sentiments_pred = wiki_top_MLP_sentiments_model.predict(wikis_test)

    # 3.8 Classification Performance
    f = open('wiki_top_MLP_sentiments_performance.txt', 'w')
    f.write('Base MLP Performance for sentiments\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(wiki_top_MLP_sentiments_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(wiki_top_MLP_sentiments_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(wiki_sentiments_test, wiki_top_MLP_sentiments_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(wiki_sentiments_test, wiki_top_MLP_sentiments_pred)))
    f.close()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [177]:
with open('ruscorpora_top_MLP_emotions_model.pkl', 'rb') as file:
    ruscorpora_top_MLP_emotions_model = pickle.load(file)
    ruscorpora_top_MLP_emotions_pred = ruscorpora_top_MLP_emotions_model.predict(ruscorporas_test)

    # 3.8 Classification Performance
    f = open('ruscorpora_top_MLP_emotions_performance.txt', 'w')
    f.write('Base MLP Performance for emotions\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(ruscorpora_top_MLP_emotions_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(ruscorpora_top_MLP_emotions_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(ruscorpora_emotions_test, ruscorpora_top_MLP_emotions_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(ruscorpora_emotions_test, ruscorpora_top_MLP_emotions_pred)))
    f.close()

with open('ruscorpora_top_MLP_sentiments_model.pkl', 'rb') as file:
    ruscorpora_top_MLP_sentiments_model = pickle.load(file)
    ruscorpora_top_MLP_sentiments_pred = ruscorpora_top_MLP_sentiments_model.predict(ruscorporas_test)

    # 3.8 Classification Performance
    f = open('ruscorpora_top_MLP_sentiments_performance.txt', 'w')
    f.write('Base MLP Performance for sentiments\n')
    f.write('---------------------------------\n')
    f.write('Hyper-parameters: ')
    f.write(str(ruscorpora_top_MLP_sentiments_model.get_params()))
    f.write('\n\nBest parameters: ')
    f.write(str(ruscorpora_top_MLP_sentiments_model.best_params_))
    f.write('\n\nConfusion matrix:\n')
    f.write(str(confusion_matrix(ruscorpora_sentiments_test, ruscorpora_top_MLP_sentiments_pred)))
    f.write('\n\nClassification report:\n')
    f.write(str(classification_report(ruscorpora_sentiments_test, ruscorpora_top_MLP_sentiments_pred)))
    f.close()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
