In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

np.set_printoptions(linewidth=np.inf)

In [2]:
def parse_data(tweets: list[str]) -> dict[str, int]:
    corpus: list[dict] = []
    for tweet in tweets:
        current_dict: dict[str, int] = {}
        for word in tweet.split():
            word = word.lower()
            if word in current_dict:
                current_dict[word] += 1
            else:
                current_dict[word] = 1
        corpus.append(current_dict)
        
    return corpus

In [3]:
def highlight_diag(df):
    a = np.full(df.shape, '', dtype='<U24')
    np.fill_diagonal(a, 'background-color: blue')
    return pd.DataFrame(a, index=df.index, columns=df.columns)

In [4]:
df = pd.read_csv('../../datasets/tweet_emotions.csv')
print(df.dtypes)
# print(len(df.loc[df['sentiment'] == 'neutral']))

tweet_id      int64
sentiment    object
content      object
dtype: object


In [5]:
df.drop('tweet_id', axis=1, inplace=True)
df.dtypes

sentiment    object
content      object
dtype: object

In [6]:
unique_emotions = df['sentiment'].unique()
print(df.dtypes)
print('\nEmotion:\t count:')
print(df['sentiment'].value_counts())

sentiment    object
content      object
dtype: object

Emotion:	 count:
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64


In [7]:
print('Null values:\n', df.isnull().sum(), sep='')

Null values:
sentiment    0
content      0
dtype: int64


In [8]:
print('Duplicates:\n', df.duplicated().sum(), sep='')

Duplicates:
91


In [9]:
df.drop_duplicates(inplace=True)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, TfidfTransformer

In [11]:
X = df.loc[:, 'content']
y = df.loc[:, 'sentiment']

In [12]:
dv = DictVectorizer()

In [13]:
# X = dv.fit_transform(parse_data(X))

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [15]:
model = MultinomialNB()

In [31]:
X_train_dv = dv.fit_transform(parse_data(X_train))

In [32]:
model.fit(X_train_dv, y_train)

In [33]:
y_train_pred = model.predict(X_train_dv)

In [34]:
accuracy_score(y_train, y_train_pred)

0.5854292605005168

In [35]:
confusion_matrix(y_train, y_train_pred)

array([[   0,    0,    0,    0,    0,    4,    0,    0,   18,    0,    8,    0,   54],
       [   0,    1,    0,    0,    0,    0,    0,    1,   27,    0,   10,    0,  102],
       [   0,    0,   25,    0,    0,   37,    0,    5,  190,    0,   34,    0,  348],
       [   0,    0,    0,   17,    0,   59,    0,   14,  172,    0,   15,    0,  337],
       [   0,    0,    0,    0,  108,  210,    0,   26,  335,    0,   35,    1,  695],
       [   0,    0,    0,    0,    0, 2731,    0,  100,  318,    0,   21,    0,  959],
       [   0,    0,    0,    0,    1,   21,   66,    4,  119,    0,   86,    0,  781],
       [   0,    0,    0,    0,    1,  212,    0, 1585,  289,    0,   37,    0,  936],
       [   0,    0,    0,    0,    0,  144,    0,   93, 5333,    0,   44,    0, 1290],
       [   0,    0,    0,    0,    0,  127,    0,   37,  268,   54,   27,    0,  702],
       [   0,    0,    1,    0,    0,   40,    0,   17,  236,    0, 2040,    0, 1768],
       [   0,    0,    0,    0,    0,  139,

In [36]:
X_test_dv = dv.transform(parse_data(X_test))
y_test_pred = model.predict(X_test_dv)
accuracy_score(y_test, y_test_pred)

0.2912803808569281

In [37]:
cm = confusion_matrix(y_test, y_test_pred, labels=unique_emotions)
# pd.DataFrame(cm, index=unique_emotions, columns=unique_emotions)
colored_df = pd.DataFrame(cm, index=unique_emotions, columns=unique_emotions)
colored_df.style.apply(highlight_diag, axis=None)

Unnamed: 0,empty,sadness,enthusiasm,neutral,worry,surprise,love,fun,hate,happiness,boredom,relief,anger
empty,0,7,0,55,114,0,2,1,0,9,0,0,0
sadness,0,69,0,110,859,0,1,0,0,19,0,0,0
enthusiasm,0,2,0,40,88,0,1,0,0,14,0,0,0
neutral,2,43,0,545,984,0,37,0,1,101,0,0,0
worry,0,57,0,246,1311,0,17,0,0,42,0,0,0
surprise,0,6,0,101,260,0,11,0,0,37,0,0,0
love,0,10,0,138,330,0,149,0,0,114,0,0,0
fun,0,13,0,110,169,0,8,1,0,65,0,0,0
hate,0,16,0,25,200,0,1,0,1,2,0,0,0
happiness,0,14,0,257,506,0,39,0,0,249,0,0,0
