# Wheel of emotions

In [1]:
#import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

## Propose several emotion-classification models, and provide a qualitative and quantitative analysis of these models according to evaluation criteria.

In [2]:
#define stopwords and vectorizer
# The stopwords corpus is a separate NLTK download. Fetch it when it is missing
# so this cell runs on a fresh environment instead of raising LookupError.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopwords = nltk.corpus.stopwords.words('english')

# Count unigrams and bigrams, ignoring the English stop words loaded above.
vectoriser = CountVectorizer(ngram_range=(1,2), stop_words = stopwords )


In [3]:
#define all classification model
# One shared instance per model family; the cells below fit each one in turn.
logreg = LogisticRegression(max_iter = 1000)
svclass = SVC()
# SGD draws random numbers while training, so pin the seed (same value as the
# split and the tree) to get identical scores on every Restart & Run All.
sgdc = SGDClassifier(max_iter = 5000, random_state=0)
knn = KNeighborsClassifier(n_neighbors=10)
dtree = DecisionTreeClassifier(random_state=0)

#define fit and predict function
def fitting(X, y, mod):
    """Train ``mod`` in place on the given samples.

    Parameters
    ----------
    X : feature data handed unchanged to ``mod.fit``.
    y : target labels handed unchanged to ``mod.fit``.
    mod : any object exposing a ``fit(X, y)`` method.

    Returns
    -------
    None
        Whatever ``mod.fit`` returns is discarded; the caller keeps using ``mod``.
    """
    mod.fit(X, y)
    return None

def predict(X, mod):
    """Return the predictions of ``mod`` for the samples in ``X``.

    Parameters
    ----------
    X : feature data handed unchanged to ``mod.predict``.
    mod : any object exposing a ``predict(X)`` method.

    Returns
    -------
    The value returned by ``mod.predict(X)``, unmodified.
    """
    return mod.predict(X)

## First, you have to work with the dataset from Kaggle to carry out the training and evaluation of your models.

In [4]:
# Load the first dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/emotion_final.csv")

In [5]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21459 entries, 0 to 21458
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     21459 non-null  object
 1   Emotion  21459 non-null  object
dtypes: object(2)
memory usage: 335.4+ KB


In [6]:
# Labels: the "Emotion" column as a NumPy array.
y = np.array(df["Emotion"])

# Features: the "Text" column turned into an n-gram count matrix.
# Note that the vocabulary is learned here on every row, before any train/test split.
x = vectoriser.fit_transform(np.array(df["Text"]))

### Logistic Regression

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed gives every model cell the same split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.20,
)

# Train logistic regression on the training part, then label the held-out part.
fitting(mod=logreg, X=x_train, y=y_train)
ypred = predict(mod=logreg, X=x_test)

In [8]:
# Per-class precision, recall and F1 of logistic regression on the held-out rows.
logreg_report = classification_report(y_true=y_test, y_pred=ypred)
print(logreg_report)

              precision    recall  f1-score   support

       anger       0.90      0.89      0.89       600
        fear       0.89      0.82      0.85       526
       happy       0.90      0.95      0.93      1399
        love       0.85      0.77      0.81       325
     sadness       0.92      0.95      0.94      1258
    surprise       0.82      0.67      0.74       184

    accuracy                           0.90      4292
   macro avg       0.88      0.84      0.86      4292
weighted avg       0.90      0.90      0.90      4292



### SVC

In [9]:
# Same 80/20 split as the other model cells (identical arguments and seed).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.20,
)

# Train the support vector classifier, then label the held-out part.
fitting(mod=svclass, X=x_train, y=y_train)
ypred = predict(mod=svclass, X=x_test)

In [10]:
# Per-class precision, recall and F1 of the SVC on the held-out rows.
svclass_report = classification_report(y_true=y_test, y_pred=ypred)
print(svclass_report)

              precision    recall  f1-score   support

       anger       0.92      0.63      0.75       600
        fear       0.86      0.62      0.72       526
       happy       0.72      0.97      0.83      1399
        love       0.92      0.40      0.56       325
     sadness       0.82      0.92      0.87      1258
    surprise       0.96      0.36      0.52       184

    accuracy                           0.80      4292
   macro avg       0.87      0.65      0.71      4292
weighted avg       0.82      0.80      0.78      4292



### SGD

In [11]:
# Same 80/20 split as the other model cells (identical arguments and seed).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.20,
)

# Train the SGD classifier, then label the held-out part.
fitting(mod=sgdc, X=x_train, y=y_train)
ypred = predict(mod=sgdc, X=x_test)

In [12]:
# Per-class precision, recall and F1 of the SGD classifier on the held-out rows.
sgdc_report = classification_report(y_true=y_test, y_pred=ypred)
print(sgdc_report)

              precision    recall  f1-score   support

       anger       0.91      0.90      0.90       600
        fear       0.89      0.85      0.87       526
       happy       0.93      0.93      0.93      1399
        love       0.81      0.81      0.81       325
     sadness       0.92      0.95      0.94      1258
    surprise       0.85      0.72      0.78       184

    accuracy                           0.91      4292
   macro avg       0.88      0.86      0.87      4292
weighted avg       0.91      0.91      0.91      4292



### KNN

In [13]:
# Same 80/20 split as the other model cells (identical arguments and seed).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.20,
)

# Train the k-nearest-neighbours classifier, then label the held-out part.
fitting(mod=knn, X=x_train, y=y_train)
ypred = predict(mod=knn, X=x_test)

In [14]:
# Per-class precision, recall and F1 of KNN on the held-out rows.
knn_report = classification_report(y_true=y_test, y_pred=ypred)
print(knn_report)

              precision    recall  f1-score   support

       anger       0.26      0.64      0.37       600
        fear       0.62      0.32      0.42       526
       happy       0.58      0.57      0.57      1399
        love       0.54      0.07      0.12       325
     sadness       0.56      0.50      0.53      1258
    surprise       0.53      0.05      0.09       184

    accuracy                           0.47      4292
   macro avg       0.51      0.36      0.35      4292
weighted avg       0.53      0.47      0.46      4292



### Decision Tree

In [15]:
# Same 80/20 split as the other model cells (identical arguments and seed).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.20,
)

# Train the decision tree, then label the held-out part.
fitting(mod=dtree, X=x_train, y=y_train)
ypred = predict(mod=dtree, X=x_test)

In [16]:
# Per-class precision, recall and F1 of the decision tree on the held-out rows.
dtree_report = classification_report(y_true=y_test, y_pred=ypred)
print(dtree_report)

              precision    recall  f1-score   support

       anger       0.83      0.89      0.86       600
        fear       0.84      0.84      0.84       526
       happy       0.90      0.90      0.90      1399
        love       0.79      0.79      0.79       325
     sadness       0.92      0.89      0.90      1258
    surprise       0.77      0.74      0.76       184

    accuracy                           0.87      4292
   macro avg       0.84      0.84      0.84      4292
weighted avg       0.87      0.87      0.87      4292



## Analysis - dataframe 1

## Second, you have to work with the dataset from data.world to carry out the training and evaluation of your models.

In [17]:
# Load the second dataset from the CSV file into a DataFrame.
df2 = pd.read_csv(filepath_or_buffer="data/text_emotion.csv")

In [18]:
# Preview the first rows to see which columns hold the tweet text and its label.
df2.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [19]:
# Labels: the "sentiment" column as a NumPy array.
y2 = np.array(df2["sentiment"])

# Features: the "content" column turned into an n-gram count matrix.
# fit_transform is called again here, so the shared vectoriser is re-fitted on this dataset's text.
x2 = vectoriser.fit_transform(np.array(df2["content"]))

### Logistic Regression

In [20]:
# Hold out 20% of the second dataset; the fixed seed gives every model cell the same split.
x_train, x_test, y_train, y_test = train_test_split(
    x2,
    y2,
    random_state=0,
    test_size=0.20,
)

# Train logistic regression on the training part, then label the held-out part.
fitting(mod=logreg, X=x_train, y=y_train)
ypred = predict(mod=logreg, X=x_test)

In [21]:
#print classification report
# Some rare classes are never predicted, so their precision is undefined.
# zero_division=0 reports them as 0.00, the same numbers as before, without
# emitting UndefinedMetricWarning under the table.
logreg_report2 = classification_report(y_test, ypred, zero_division=0)
print(logreg_report2)

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        37
       empty       0.10      0.01      0.01       147
  enthusiasm       0.00      0.00      0.00       147
         fun       0.21      0.06      0.09       354
   happiness       0.32      0.32      0.32      1052
        hate       0.33      0.13      0.18       251
        love       0.51      0.40      0.45       780
     neutral       0.34      0.59      0.43      1743
      relief       0.19      0.05      0.08       292
     sadness       0.34      0.27      0.30      1062
    surprise       0.16      0.04      0.06       422
       worry       0.33      0.40      0.36      1694

    accuracy                           0.34      8000
   macro avg       0.22      0.17      0.18      8000
weighted avg       0.32      0.34      0.31      8000



  _warn_prf(average, modifier, msg_start, len(result))


### SVC

In [22]:
# Same 80/20 split of the second dataset as the other model cells.
x_train, x_test, y_train, y_test = train_test_split(
    x2,
    y2,
    random_state=0,
    test_size=0.20,
)

# Train the support vector classifier, then label the held-out part.
fitting(mod=svclass, X=x_train, y=y_train)
ypred = predict(mod=svclass, X=x_test)

In [23]:
#print classification report
# Rare classes may never be predicted, which leaves their precision undefined.
# zero_division=0 reports such classes as 0.00 without emitting
# UndefinedMetricWarning; the reported numbers are unchanged.
svclass_report2 = classification_report(y_test, ypred, zero_division=0)
print(svclass_report2)

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        37
       empty       0.00      0.00      0.00       147
  enthusiasm       0.00      0.00      0.00       147
         fun       0.00      0.00      0.00       354
   happiness       0.34      0.34      0.34      1052
        hate       0.40      0.08      0.13       251
        love       0.57      0.35      0.43       780
     neutral       0.33      0.61      0.43      1743
      relief       0.25      0.01      0.01       292
     sadness       0.42      0.13      0.20      1062
    surprise       0.27      0.01      0.02       422
       worry       0.30      0.50      0.37      1694

    accuracy                           0.34      8000
   macro avg       0.22      0.16      0.15      8000
weighted avg       0.33      0.34      0.29      8000



### SGD

In [24]:
# Same 80/20 split of the second dataset as the other model cells.
x_train, x_test, y_train, y_test = train_test_split(
    x2,
    y2,
    random_state=0,
    test_size=0.20,
)

# Train the SGD classifier, then label the held-out part.
fitting(mod=sgdc, X=x_train, y=y_train)
ypred = predict(mod=sgdc, X=x_test)

In [25]:
#print classification report
# Rare classes may never be predicted, which leaves their precision undefined.
# zero_division=0 reports such classes as 0.00 without emitting
# UndefinedMetricWarning; the reported numbers are unchanged.
sgdc_report2 = classification_report(y_test, ypred, zero_division=0)
print(sgdc_report2)

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        37
       empty       0.04      0.01      0.01       147
  enthusiasm       0.05      0.01      0.01       147
         fun       0.16      0.05      0.08       354
   happiness       0.30      0.31      0.31      1052
        hate       0.29      0.15      0.20       251
        love       0.47      0.41      0.44       780
     neutral       0.34      0.55      0.42      1743
      relief       0.18      0.06      0.09       292
     sadness       0.31      0.27      0.29      1062
    surprise       0.11      0.03      0.05       422
       worry       0.32      0.37      0.35      1694

    accuracy                           0.33      8000
   macro avg       0.20      0.17      0.17      8000
weighted avg       0.30      0.33      0.30      8000



### KNN

In [26]:
# Same 80/20 split of the second dataset as the other model cells.
x_train, x_test, y_train, y_test = train_test_split(
    x2,
    y2,
    random_state=0,
    test_size=0.20,
)

# Train the k-nearest-neighbours classifier, then label the held-out part.
fitting(mod=knn, X=x_train, y=y_train)
ypred = predict(mod=knn, X=x_test)

In [27]:
#print classification report
# Some rare classes are never predicted, so their precision is undefined.
# zero_division=0 reports them as 0.00, the same numbers as before, without
# emitting UndefinedMetricWarning under the table.
knn_report2 = classification_report(y_test, ypred, zero_division=0)
print(knn_report2)

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        37
       empty       0.07      0.01      0.02       147
  enthusiasm       0.00      0.00      0.00       147
         fun       0.33      0.01      0.01       354
   happiness       0.24      0.06      0.10      1052
        hate       0.00      0.00      0.00       251
        love       0.45      0.12      0.19       780
     neutral       0.23      0.81      0.36      1743
      relief       0.00      0.00      0.00       292
     sadness       0.28      0.04      0.07      1062
    surprise       0.00      0.00      0.00       422
       worry       0.23      0.17      0.20      1694

    accuracy                           0.24      8000
   macro avg       0.14      0.09      0.07      8000
weighted avg       0.23      0.24      0.16      8000



  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree

In [28]:
# Same 80/20 split of the second dataset as the other model cells.
x_train, x_test, y_train, y_test = train_test_split(
    x2,
    y2,
    random_state=0,
    test_size=0.20,
)

# Train the decision tree, then label the held-out part.
fitting(mod=dtree, X=x_train, y=y_train)
ypred = predict(mod=dtree, X=x_test)

In [29]:
#print classification report
# Kept consistent with the other reports on this dataset: if a rare class is
# never predicted, zero_division=0 reports its precision as 0.00 without
# emitting UndefinedMetricWarning. The reported numbers are unchanged.
dtree_report2 = classification_report(y_test, ypred, zero_division=0)
print(dtree_report2)

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.09      0.03      0.04        37
       empty       0.03      0.03      0.03       147
  enthusiasm       0.04      0.01      0.02       147
         fun       0.13      0.07      0.09       354
   happiness       0.25      0.28      0.27      1052
        hate       0.21      0.16      0.18       251
        love       0.39      0.34      0.36       780
     neutral       0.34      0.47      0.39      1743
      relief       0.10      0.07      0.08       292
     sadness       0.30      0.23      0.26      1062
    surprise       0.09      0.05      0.06       422
       worry       0.30      0.34      0.32      1694

    accuracy                           0.29      8000
   macro avg       0.17      0.16      0.16      8000
weighted avg       0.27      0.29      0.28      8000



## Analysis - dataframe 2

## On the one hand, compare whether the classification results on your first dataset are similar to those on the second. Comment.