# Mini Project 1, team Kool Kids

# 1. Dataset Preparation & Analysis

### 1.1 Load Dataset

In [None]:
import gzip
import json
import pandas as pd

# Column names of the dataset, in the order the JSON rows store them.
labels = 'Post', 'Emotion', 'Sentiment'
# Use a context manager so the gzip handle is closed as soon as the JSON has
# been parsed (it was previously left open for the lifetime of the kernel).
with gzip.open('goemotions.json.gz', 'rb') as file:
    entries = json.load(file)
dataset = pd.DataFrame(entries, columns=labels)

### 1.3 Extract posts and labels in a plot

##### data extraction

In [1]:
# One Series per column, in the same order as `labels` (post, emotion, sentiment).
posts, emotions, sentiments = (dataset[column] for column in labels)

# Number of rows per distinct emotion / sentiment value.
emotions_distribution, sentiments_distribution = (
    dataset.pivot_table(columns=column, aggfunc='size')
    for column in labels[1:]
)


NameError: name 'dataset' is not defined

##### plotting the distributions

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side axes: emotions on the left, sentiments on the right.
fig, axes = plt.subplots(nrows=1, ncols=2)
emo, sent = axes

# Emotions pie chart (labels rotated along their wedge).
emo.set_title("Emotions")
emo.pie(
    emotions_distribution.values,
    labels=emotions_distribution.index,
    shadow=False,
    startangle=90,
    rotatelabels=True,
)

# Sentiment pie chart, annotated with the percentage of each wedge.
sent.set_title("Sentiment")
sent.pie(
    sentiments_distribution.values,
    labels=sentiments_distribution.index,
    autopct='%1.1f%%',
    shadow=False,
    startangle=90,
)
plt.show()

# 2. Words as Features

### 2.1 Displaying dataset tokens

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Bag-of-words features: one column per token seen in the posts.
post_vectorizer = CountVectorizer()
X = post_vectorizer.fit_transform(dataset[labels[0]])
token_count = X.shape[1]
print('There are', token_count, 'unique tokens')

# Encode the emotion labels as integers.
emotions_label_encoder = LabelEncoder().fit(emotions)
Y = emotions_label_encoder.transform(emotions)

# Encode the sentiment labels as integers.
sentiments_label_encoder = LabelEncoder().fit(sentiments)
Z = sentiments_label_encoder.transform(sentiments)

### 2.2 Splitting the dataset

In [None]:
# Import the splitter from scikit-learn's public API. The previous import went
# through `sklearn.model_selection.tests.test_split`, a private test module
# (it pulls in pytest) that is not meant to be used as a library entry point.
from sklearn.model_selection import train_test_split
# Keep the `test_split` name bound (now to the public package) so any code that
# still calls `test_split.train_test_split` keeps working.
import sklearn.model_selection as test_split

# Split features and both label arrays in a single call so the rows of
# post_*, emotion_* and sentiment_* stay aligned (80% train / 20% test).
post_train, post_test, emotion_train, emotion_test, sentiment_train, sentiment_test = train_test_split(
    X,
    Y,
    Z,
    test_size=0.20)

### 2.3 Classifier Training / Testing

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

#### 2.3.1 Base-MNB

In [None]:
# Multinomial Naive Bayes with default hyper-parameters, one model per target.
base_mnb_emotion = MultinomialNB().fit(post_train, emotion_train)
base_mnb_sentiment = MultinomialNB().fit(post_train, sentiment_train)
# Last expression: display the fitted sentiment model as the cell output.
base_mnb_sentiment

#### 2.3.2 Base-DT

In [None]:
# Decision trees with default hyper-parameters, one model per target.
base_dt_emotion = DecisionTreeClassifier().fit(post_train, emotion_train)
base_dt_sentiment = DecisionTreeClassifier().fit(post_train, sentiment_train)
# Last expression: display the fitted sentiment model as the cell output.
base_dt_sentiment

#### 2.3.3 Base-MLP

In [None]:
# Multi-layer perceptrons capped at a single training iteration (max_iter=1).
base_mlp_emotion = MLPClassifier(max_iter=1).fit(post_train, emotion_train)
print(base_mlp_emotion)

base_mlp_sentiment = MLPClassifier(max_iter=1).fit(post_train, sentiment_train)
print(base_mlp_sentiment)


#### 2.3.4 Top-MNB

In [None]:
# Smoothing values explored by the grid search.
param_grid = dict(alpha=[0, 0.25, 0.5, 0.75])

top_mnb_emotion = GridSearchCV(MultinomialNB(), param_grid).fit(post_train, emotion_train)
top_mnb_sentiment = GridSearchCV(MultinomialNB(), param_grid).fit(post_train, sentiment_train)
# Last expression: display the fitted sentiment search as the cell output.
top_mnb_sentiment


#### 2.3.5 Top-DT

In [None]:
# Decision-tree hyper-parameters explored by the grid search.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[2, 8],
    min_samples_split=[2, 4, 6],
)

top_dt_emotion = GridSearchCV(DecisionTreeClassifier(), param_grid).fit(post_train, emotion_train)
top_dt_sentiment = GridSearchCV(DecisionTreeClassifier(), param_grid).fit(post_train, sentiment_train)
# Last expression: display the fitted sentiment search as the cell output.
top_dt_sentiment

#### 2.3.6 Top-MLP

In [None]:
# MLP hyper-parameters explored by the grid search.
param_grid = dict(
    activation=['logistic', 'tanh', 'relu', 'identity'],
    hidden_layer_sizes=[(30, 50), (10, 10, 10)],
    solver=['adam', 'sgd'],
)

# Each candidate network is capped at a single training iteration (max_iter=1).
top_mlp_emotion = GridSearchCV(MLPClassifier(max_iter=1), param_grid).fit(post_train, emotion_train)
top_mlp_sentiment = GridSearchCV(MLPClassifier(max_iter=1), param_grid).fit(post_train, sentiment_train)
# Last expression: display the fitted sentiment search as the cell output.
top_mlp_sentiment

### 2.4 Performance

#### Classification report

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# (heading, emotion model, sentiment model) for every classifier trained on the
# bag-of-words features, in the order the reports are printed.
bow_classifiers = [
    ('Base MNB', base_mnb_emotion, base_mnb_sentiment),
    ('Base DT', base_dt_emotion, base_dt_sentiment),
    ('Base MLP', base_mlp_emotion, base_mlp_sentiment),
    ('Top MNB', top_mnb_emotion, top_mnb_sentiment),
    ('Top DT', top_dt_emotion, top_dt_sentiment),
    ('Top MLP', top_mlp_emotion, top_mlp_sentiment),
]

for heading, emotion_model, sentiment_model in bow_classifiers:
    print(heading)
    for task, classifier, expected, encoder in (
        ('Emotion', emotion_model, emotion_test, emotions_label_encoder),
        ('Sentiment', sentiment_model, sentiment_test, sentiments_label_encoder),
    ):
        # Predict once and reuse the result for both the matrix and the report.
        predicted = classifier.predict(post_test)
        print(task)
        print('Confusion Matrix \n')
        print(confusion_matrix(expected, predicted))
        print('\n Classification Report \n')
        print(classification_report(expected, predicted, target_names=encoder.classes_))
        if task == 'Emotion':
            # Blank line of source separated the two stanzas; output is unchanged.
            pass

### 2.5 Exploration (Removing Stop Words)

In [None]:
from sklearn.model_selection import train_test_split

# Rebuild the bag-of-words features with English stop words removed.
post_vectorizer = CountVectorizer(stop_words='english')
X = post_vectorizer.fit_transform(dataset[labels[0]])
print('There are', X.shape[1], 'unique tokens')

# Re-split the new feature matrix. Without this step post_train / post_test
# still hold the matrices produced in section 2.2 (stop words included), so
# every classifier in this section would be trained and evaluated on the old
# features and the exploration would change nothing.
post_train, post_test, emotion_train, emotion_test, sentiment_train, sentiment_test = train_test_split(
    X,
    Y,
    Z,
    test_size=0.20)

#### 2.5.1 Base-MNB

In [None]:
# Refit the default Multinomial Naive Bayes models, one per target.
base_mnb_emotion = MultinomialNB().fit(post_train, emotion_train)
base_mnb_sentiment = MultinomialNB().fit(post_train, sentiment_train)
# Last expression: display the fitted sentiment model as the cell output.
base_mnb_sentiment

#### 2.5.2 Base-DT

In [None]:
# Refit the default decision trees, one per target.
base_dt_emotion = DecisionTreeClassifier().fit(post_train, emotion_train)
base_dt_sentiment = DecisionTreeClassifier().fit(post_train, sentiment_train)
# Last expression: display the fitted sentiment model as the cell output.
base_dt_sentiment

#### 2.5.3 Base-MLP

In [None]:
# Refit the MLPs, still capped at a single training iteration (max_iter=1).
base_mlp_emotion = MLPClassifier(max_iter=1).fit(post_train, emotion_train)
print(base_mlp_emotion)

base_mlp_sentiment = MLPClassifier(max_iter=1).fit(post_train, sentiment_train)
print(base_mlp_sentiment)


#### 2.5.4 Top-MNB

In [None]:
# Smoothing values explored by the grid search.
param_grid = dict(alpha=[0, 0.25, 0.5, 0.75])

top_mnb_emotion = GridSearchCV(MultinomialNB(), param_grid).fit(post_train, emotion_train)
top_mnb_sentiment = GridSearchCV(MultinomialNB(), param_grid).fit(post_train, sentiment_train)
# Last expression: display the fitted sentiment search as the cell output.
top_mnb_sentiment


#### 2.5.5 Top-DT

In [None]:
# Decision-tree hyper-parameters explored by the grid search.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[2, 8],
    min_samples_split=[2, 4, 6],
)

top_dt_emotion = GridSearchCV(DecisionTreeClassifier(), param_grid).fit(post_train, emotion_train)
top_dt_sentiment = GridSearchCV(DecisionTreeClassifier(), param_grid).fit(post_train, sentiment_train)
# Last expression: display the fitted sentiment search as the cell output.
top_dt_sentiment

#### 2.5.6 Top-MLP

In [None]:
# MLP hyper-parameters explored by the grid search.
param_grid = dict(
    activation=['logistic', 'tanh', 'relu', 'identity'],
    hidden_layer_sizes=[(30, 50), (10, 10, 10)],
    solver=['adam', 'sgd'],
)

# Each candidate network is capped at a single training iteration (max_iter=1).
top_mlp_emotion = GridSearchCV(MLPClassifier(max_iter=1), param_grid).fit(post_train, emotion_train)
top_mlp_sentiment = GridSearchCV(MLPClassifier(max_iter=1), param_grid).fit(post_train, sentiment_train)
# Last expression: display the fitted sentiment search as the cell output.
top_mlp_sentiment

#### Classification report

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# (heading, emotion model, sentiment model) for every classifier refit in the
# stop-word exploration, in the order the reports are printed.
stop_word_classifiers = [
    ('Base MNB', base_mnb_emotion, base_mnb_sentiment),
    ('Base DT', base_dt_emotion, base_dt_sentiment),
    ('Base MLP', base_mlp_emotion, base_mlp_sentiment),
    ('Top MNB', top_mnb_emotion, top_mnb_sentiment),
    ('Top DT', top_dt_emotion, top_dt_sentiment),
    ('Top MLP', top_mlp_emotion, top_mlp_sentiment),
]

for heading, emotion_model, sentiment_model in stop_word_classifiers:
    print(heading)
    for task, classifier, expected, encoder in (
        ('Emotion', emotion_model, emotion_test, emotions_label_encoder),
        ('Sentiment', sentiment_model, sentiment_test, sentiments_label_encoder),
    ):
        # Predict once and reuse the result for both the matrix and the report.
        predicted = classifier.predict(post_test)
        print(task)
        print('Confusion Matrix \n')
        print(confusion_matrix(expected, predicted))
        print('\n Classification Report \n')
        print(classification_report(expected, predicted, target_names=encoder.classes_))

## 3 Embedding as Features

### 3.1 Word2Vec import

In [None]:
from gensim.downloader import load


# Load the 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): presumably this fetches the vectors on first use and can take a
# while — confirm before relying on a quick Restart & Run All.
model = load('word2vec-google-news-300')

### 3.2 Tokenizer

In [None]:
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Tokenize every post into a list of word tokens.
postsVec = dataset[labels[0]].apply(word_tokenize)

# Split the tokenized posts TOGETHER with the encoded labels. Splitting the
# posts on their own draws a fresh random permutation, so the resulting rows
# would no longer correspond to the emotion_* / sentiment_* arrays produced by
# an earlier, independent split, and the embedding classifiers would be trained
# and scored against labels belonging to other posts.
(posts_vec_train, posts_vec_test,
 emotion_train, emotion_test,
 sentiment_train, sentiment_test) = train_test_split(postsVec, Y, Z, test_size=0.20)

# Total number of tokens in the training posts.
words_train = sum(len(post) for post in posts_vec_train)


In [None]:
# Report the size of the training portion: number of posts, then of tokens.
for amount, unit in ((len(posts_vec_train), 'sentences'), (words_train, 'tokens')):
    print('There are', amount, unit)
print('Training only')

### 3.3 Post Embedding

In [None]:
from gensim.models import Word2Vec, KeyedVectors
import numpy as np

In [None]:
def _mean_post_embeddings(tokenized_posts, keyed_vectors):
    """Average the word vectors of each post.

    Args:
        tokenized_posts: sized iterable of token lists, one list per post.
        keyed_vectors: embedding lookup supporting ``token in keyed_vectors``,
            ``keyed_vectors[token]`` and ``keyed_vectors.vector_size``.

    Returns:
        Array with one row per post and ``keyed_vectors.vector_size`` columns.
        Tokens missing from the lookup are ignored; a post without any known
        token keeps an all-zero row.
    """
    embeddings = np.zeros((len(tokenized_posts), keyed_vectors.vector_size))
    for row, tokens in enumerate(tokenized_posts):
        total = np.zeros((keyed_vectors.vector_size,))
        known = 0
        for token in tokens:
            if token in keyed_vectors:
                known += 1
                total = np.add(total, keyed_vectors[token])
        # Divide by 1 when nothing matched so the row stays a zero vector
        # instead of becoming NaN.
        embeddings[row] = np.divide(total, known if known else 1)
    return embeddings


# Same averaging for both portions; previously the loop was written out twice.
post_embedding_train = _mean_post_embeddings(posts_vec_train, model)
post_embedding_test = _mean_post_embeddings(posts_vec_test, model)

In [None]:
# Show the averaged training embeddings as a quick sanity check.
print(post_embedding_train)

### 3.4 Hit Rate

In [None]:
def _embedding_hit_rate(tokenized_posts, keyed_vectors):
    """Return the fraction of tokens that have an entry in ``keyed_vectors``.

    Args:
        tokenized_posts: iterable of token lists, one list per post.
        keyed_vectors: container supporting ``token in keyed_vectors``.

    Returns:
        ``hits / total`` over all tokens of all posts, or ``0.0`` when there
        are no tokens at all (avoids a ZeroDivisionError on empty input).
    """
    hits = 0
    total_words = 0
    for tokens in tokenized_posts:
        total_words += len(tokens)
        hits += sum(1 for token in tokens if token in keyed_vectors)
    return hits / total_words if total_words else 0.0


print('Training hit rate', _embedding_hit_rate(posts_vec_train, model))
print('Testing hit rate', _embedding_hit_rate(posts_vec_test, model))

### 3.5 Base-MLP

In [None]:
# Default MLPs trained on the averaged post embeddings, one per target.
w2v_base_mlp_emotion = MLPClassifier().fit(post_embedding_train, emotion_train)
w2v_base_mlp_sentiment = MLPClassifier().fit(post_embedding_train, sentiment_train)
# Last expression: display the fitted sentiment model as the cell output.
w2v_base_mlp_sentiment

### 3.6 Top-MLP

In [None]:
# MLP hyper-parameters explored by the grid search.
param_grid = dict(
    activation=['logistic', 'tanh', 'relu', 'identity'],
    hidden_layer_sizes=[(30, 50), (10, 10, 10)],
    solver=['adam', 'sgd'],
)

# Each candidate network is capped at a single training iteration (max_iter=1).
w2v_top_mlp_emotion = GridSearchCV(MLPClassifier(max_iter=1), param_grid).fit(post_embedding_train, emotion_train)
w2v_top_mlp_sentiment = GridSearchCV(MLPClassifier(max_iter=1), param_grid).fit(post_embedding_train, sentiment_train)
# Last expression: display the fitted sentiment search as the cell output.
w2v_top_mlp_sentiment

###  3.7 Performance

Classification Report

In [None]:
from sklearn.metrics import classification_report

# (heading, emotion model, sentiment model) for the embedding-based classifiers.
w2v_classifiers = (
    ('Base MLP', w2v_base_mlp_emotion, w2v_base_mlp_sentiment),
    ('Top MLP', w2v_top_mlp_emotion, w2v_top_mlp_sentiment),
)

for heading, emotion_model, sentiment_model in w2v_classifiers:
    print(heading)
    for task, classifier, expected, encoder in (
        ('Emotion', emotion_model, emotion_test, emotions_label_encoder),
        ('Sentiment', sentiment_model, sentiment_test, sentiments_label_encoder),
    ):
        print(task)
        print('\n Classification Report \n')
        print(classification_report(
            expected,
            classifier.predict(post_embedding_test),
            target_names=encoder.classes_))