# Imports

In [103]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.metrics import f1_score, classification_report

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [170]:
kaggle_ds = pd.read_csv("data/kaggle_dataset.csv")
mac_ds = pd.read_csv("data/merged_and_cleaned.csv")
mac_rm_ds = pd.read_csv("data/merged_and_cleaned_rm.csv")

# Model (with base dataset)

In [16]:
kaggle_ds

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
21454,Melissa stared at her friend in dism,fear
21455,Successive state elections have seen the gover...,fear
21456,Vincent was irritated but not dismay,fear
21457,Kendall-Hume turned back to face the dismayed ...,fear


In [162]:
X, y = kaggle_ds["Text"], kaggle_ds["Emotion"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [148]:
import re

In [163]:
def tweet_tokenizer(text):
  text = re.sub(r'http\S+', '', text)
  tt = TweetTokenizer(reduce_len=True, strip_handles=True)
  return tt.tokenize(text)

In [169]:
model = Pipeline([
  ('count', CountVectorizer(stop_words=stopwords, tokenizer=tweet_tokenizer)),
  # ('tfid', TfidfVectorizer(stop_words=stopwords)),
  ('nb', MultinomialNB())
])

In [165]:
model.fit(X_train, y_train)



Pipeline(steps=[('count',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<__main__.LemmaTokenizer object at 0x1616c9f30>)),
                ('nb', MultinomialNB())])

In [167]:
y_pred = model.predict(X_test)
print(f"f1 score: {f1_score(y_test, y_pred, average='weighted')}")

f1 score: 0.7610864911639719


In [168]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.91      0.64      0.76       764
        fear       0.84      0.62      0.71       661
       happy       0.75      0.94      0.83      1715
        love       0.92      0.34      0.50       396
     sadness       0.76      0.94      0.84      1625
    surprise       0.86      0.12      0.21       204

    accuracy                           0.78      5365
   macro avg       0.84      0.60      0.64      5365
weighted avg       0.80      0.78      0.76      5365



# Model (with merged datasets)

## Merge and colums change

In [107]:
mac_ds

Unnamed: 0,Emotion,Text
0,sadness,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,happiness,wants to hang out with friends SOON!
4,fear,Re-pinging @ghostridah14: why didn't you go to...
...,...,...
51111,fear,Melissa stared at her friend in dism
51112,fear,Successive state elections have seen the gover...
51113,fear,Vincent was irritated but not dismay
51114,fear,Kendall-Hume turned back to face the dismayed ...


In [156]:
X, y = mac_ds["Text"], mac_ds["Emotion"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [130]:
from nltk import word_tokenize

In [161]:
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in tweet_tokenizer(articles)]

In [158]:
model = Pipeline([
  ('count', CountVectorizer(stop_words=stopwords, tokenizer=LemmaTokenizer())),
  # ('tfid', TfidfVectorizer(stop_words=stopwords, tokenizer=tweet_tokenizer)),
  ('nb', MultinomialNB())
])

In [159]:
model.fit(X_train, y_train)



Pipeline(steps=[('count',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<__main__.LemmaTokenizer object at 0x1639df940>)),
                ('nb', MultinomialNB())])

In [160]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.88      0.36      0.51      1112
        fear       0.46      0.58      0.51      2805
   happiness       0.44      0.66      0.53      1882
       happy       0.69      0.90      0.78      1755
        love       0.61      0.22      0.33      1350
     sadness       0.58      0.60      0.59      3077
    surprise       0.59      0.03      0.05       798

    accuracy                           0.55     12779
   macro avg       0.61      0.48      0.47     12779
weighted avg       0.58      0.55      0.52     12779



## Merge and colums removal

In [175]:
X, y = mac_rm_ds["Text"], mac_rm_ds["Emotion"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [182]:
model = Pipeline([
  ('count', CountVectorizer(stop_words=stopwords, tokenizer=LemmaTokenizer())),
  # ('tfid', TfidfVectorizer(stop_words=stopwords, tokenizer=tweet_tokenizer)),
  ('nb', MultinomialNB())
])

In [183]:
model.fit(X_train, y_train)



Pipeline(steps=[('count',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<__main__.LemmaTokenizer object at 0x16c50d3c0>)),
                ('nb', MultinomialNB())])

In [184]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.87      0.50      0.64       769
        fear       0.87      0.51      0.64       621
   happiness       0.48      0.65      0.55      1326
       happy       0.70      0.90      0.79      1792
        love       0.57      0.33      0.42      1375
     sadness       0.63      0.82      0.72      2842
    surprise       0.56      0.09      0.15       768

    accuracy                           0.63      9493
   macro avg       0.67      0.54      0.56      9493
weighted avg       0.64      0.63      0.61      9493

