# Wheel of emotions

In [1]:
#import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import FastICA, KernelPCA, TruncatedSVD, SparsePCA, NMF, FactorAnalysis, LatentDirichletAllocation
from sklearn.model_selection import ShuffleSplit
from time import time
from collections import defaultdict

## Propose several emotion-classification models, then analyse them qualitatively and quantitatively according to evaluation criteria.

In [2]:
# Define the stop-word list and the bag-of-words vectoriser.
# The NLTK stop-word corpus is downloaded separately from the package itself;
# fetch it once when it is missing so the notebook also runs on a fresh machine.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)
stopwords = nltk.corpus.stopwords.words('english')

# Unigram + bigram counts with English stop words removed.
vectoriser = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords)


In [3]:
# Define every classifier compared in the notebook.
logreg = LogisticRegression(max_iter=1000)
svclass = SVC()
# SGD shuffles the training data between epochs; pin the seed so that
# re-running the notebook reproduces the same scores.
sgdc = SGDClassifier(max_iter=5000, random_state=0)
knn = KNeighborsClassifier(n_neighbors=10)
dtree = DecisionTreeClassifier(random_state=0)

#define fit and predict function
def fitting(X, y, mod) -> None:
    """Train ``mod`` in place by calling its ``fit`` method on ``(X, y)``.

    Parameters
    ----------
    X : features handed to ``mod.fit`` unchanged.
    y : targets handed to ``mod.fit`` unchanged.
    mod : any object exposing ``fit(X, y)``.

    Returns
    -------
    None. Whatever ``mod.fit`` returns is discarded.
    """
    mod.fit(X, y)
    return None

def predict(X, mod):
    """Return the output of ``mod.predict(X)`` unchanged.

    Parameters
    ----------
    X : samples handed to ``mod.predict``.
    mod : any object exposing ``predict(X)``.
    """
    return mod.predict(X)

## First, work with the dataset from Kaggle to carry out the training and evaluation of your models.

In [4]:
# Load the first emotion dataset from disk.
df = pd.read_csv(filepath_or_buffer="data/emotion_final.csv")

In [5]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21459 entries, 0 to 21458
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     21459 non-null  object
 1   Emotion  21459 non-null  object
dtypes: object(2)
memory usage: 335.4+ KB


In [6]:
# Labels stay as a plain array; the raw text becomes the vectoriser's feature matrix.
y = np.array(df["Emotion"])
# NOTE(review): the vocabulary is learned on the full corpus before any
# train/test split, so held-out text influences the feature space - confirm
# this is acceptable for the comparison.
x = vectoriser.fit_transform(np.array(df["Text"]))

# Scores for the first dataset: model name -> (f1, recall, precision).
result = dict()

### Logistic Regression

In [7]:
# Hold out 20% of the vectorised samples for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Train logistic regression on the training fold, then label the held-out fold.
fitting(X=x_train, y=y_train, mod=logreg)
ypred = predict(X=x_test, mod=logreg)

In [8]:
# Weighted-average F1, recall and precision on the held-out fold.
logreg_f1, logreg_recall, logreg_precision = (
    metric(y_test, ypred, average="weighted")
    for metric in (f1_score, recall_score, precision_score)
)
result['logreg'] = (logreg_f1, logreg_recall, logreg_precision)

### SVC

In [9]:
# Hold out 20% of the vectorised samples for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Train the support-vector classifier, then label the held-out fold.
fitting(X=x_train, y=y_train, mod=svclass)
ypred = predict(X=x_test, mod=svclass)

In [10]:
# Weighted-average F1, recall and precision on the held-out fold.
svclass_f1, svclass_recall, svclass_precision = (
    metric(y_test, ypred, average="weighted")
    for metric in (f1_score, recall_score, precision_score)
)
result['svc'] = (svclass_f1, svclass_recall, svclass_precision)

### SGD

In [11]:
# Hold out 20% of the vectorised samples for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Train the SGD classifier, then label the held-out fold.
fitting(X=x_train, y=y_train, mod=sgdc)
ypred = predict(X=x_test, mod=sgdc)

In [12]:
# Weighted-average F1, recall and precision on the held-out fold.
sgdc_f1, sgdc_recall, sgdc_precision = (
    metric(y_test, ypred, average="weighted")
    for metric in (f1_score, recall_score, precision_score)
)
result['sgdc'] = (sgdc_f1, sgdc_recall, sgdc_precision)

### KNN

In [13]:
# Hold out 20% of the vectorised samples for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Train the k-nearest-neighbours classifier, then label the held-out fold.
fitting(X=x_train, y=y_train, mod=knn)
ypred = predict(X=x_test, mod=knn)

In [14]:
# Weighted-average F1, recall and precision on the held-out fold.
knn_f1, knn_recall, knn_precision = (
    metric(y_test, ypred, average="weighted")
    for metric in (f1_score, recall_score, precision_score)
)
result['knn'] = (knn_f1, knn_recall, knn_precision)

### Decision Tree

In [15]:
# Hold out 20% of the vectorised samples for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Train the decision tree, then label the held-out fold.
fitting(X=x_train, y=y_train, mod=dtree)
ypred = predict(X=x_test, mod=dtree)

In [16]:
# Weighted-average F1, recall and precision on the held-out fold.
dtree_f1, dtree_recall, dtree_precision = (
    metric(y_test, ypred, average="weighted")
    for metric in (f1_score, recall_score, precision_score)
)
result['dtree'] = (dtree_f1, dtree_recall, dtree_precision)

### Result - DF1

In [17]:
# Tabulate the first-dataset scores: one row per model, the three scores kept
# together as a tuple in a single column.
liste = list(result)
values = list(result.values())

dfresult = pd.DataFrame({
    'Model': pd.Series(liste),
    'F1_score - Recall - Precision': pd.Series(values),
})

dfresult.head()

Unnamed: 0,Model,F1_score - Recall - Precision
0,logreg,"(0.8991977253696037, 0.9007455731593662, 0.899..."
1,svc,"(0.7825518915306116, 0.7968313140726934, 0.821..."
2,sgdc,"(0.9050579787073314, 0.9061043802423113, 0.904..."
3,knn,"(0.45736459471976015, 0.4666821994408201, 0.52..."
4,dtree,"(0.8735854595419213, 0.8734855545200373, 0.874..."


## Introduce pipelines with more preprocessing

In [18]:
def run_pipes(pipes, splits=10, test_size=0.2, seed=0, X=None, y=None):
    """Evaluate each pipeline on repeated random train/test splits.

    Parameters
    ----------
    pipes : iterable of sklearn ``Pipeline`` objects; each is re-fitted on every split.
    splits : number of ``ShuffleSplit`` iterations.
    test_size : fraction of the samples held out in each split.
    seed : ``random_state`` of the splitter, so the splits are repeatable.
    X, y : optional samples and labels. When omitted, the notebook-level
        ``corpus`` and ``targets`` arrays are used, which keeps existing
        ``run_pipes(pipes)`` calls working unchanged.

    Returns
    -------
    defaultdict(list) mapping a pipeline name (its step names joined by "-")
    to one ``[fit_time, f1, recall, precision]`` row per split. The three
    scores use ``average="weighted"``.
    """
    # Fall back to the notebook globals only when no data was passed explicitly.
    samples = corpus if X is None else np.asarray(X)
    labels = targets if y is None else np.asarray(y)

    res = defaultdict(list)
    spliter = ShuffleSplit(n_splits=splits, test_size=test_size, random_state=seed)
    for idx_train, idx_test in spliter.split(samples):
        # The folds do not depend on the pipeline, so slice them once per split.
        X_train, X_test = samples[idx_train], samples[idx_test]
        y_train, y_test = labels[idx_train], labels[idx_test]

        for pipe in pipes:
            # Name of the model, built from the pipeline's step names.
            name = "-".join(step_name for step_name, _ in pipe.steps)

            # Learn, timing only the fit.
            start = time()
            pipe.fit(X_train, y_train)
            fit_time = time() - start

            # Predict on the held-out fold and save the results.
            y_pred = pipe.predict(X_test)
            res[name].append([
                fit_time,
                f1_score(y_test, y_pred, average="weighted"),
                recall_score(y_test, y_pred, average="weighted"),
                precision_score(y_test, y_pred, average="weighted"),
            ])
    return res

def print_table(res):
    """Summarise the per-split results of ``run_pipes`` as a DataFrame.

    Parameters
    ----------
    res : mapping of model name -> list of ``[fit_time, f1, recall, precision]``
        rows, one row per split.

    Returns
    -------
    pandas.DataFrame indexed by model name with columns ``time`` (mean fit
    time, rounded to 2 decimals) and ``f1`` / ``recall`` / ``precision``, each
    holding a ``[mean, std]`` pair rounded to 3 decimals.
    """
    # Column position of each score inside a result row (column 0 is the fit time).
    score_columns = {"f1": 1, "recall": 2, "precision": 3}

    final = {}
    for model in res:
        arr = np.array(res[model])
        summary = {"time": arr[:, 0].mean().round(2)}
        for score, col in score_columns.items():
            # Mean and std must come from the same column; taking the std of
            # the F1 column for every score would misreport the spread.
            summary[score] = [arr[:, col].mean().round(3), arr[:, col].std().round(3)]
        final[model] = summary

    df = pd.DataFrame.from_dict(final, orient="index").round(3)
    return df

In [19]:
# Reload the first dataset and expose the raw text and labels as the arrays
# that run_pipes reads.
df = pd.read_csv(filepath_or_buffer="data/emotion_final.csv")

corpus, targets = (np.array(df[column]) for column in ('Text', 'Emotion'))

## Logistic Regression

In [None]:
# Four feature-extraction variants feeding the same logistic regression:
#   pipe00  - LDA topics        + TF-IDF weighted uni/bigrams
#   pipe001 - filtered counts   + TF-IDF weighted uni/bigrams
#   pipe002 - LDA topics        + raw uni/bigram counts
#   pipe003 - filtered counts   + raw uni/bigram counts
# LDA is stochastic, so it is seeded to make repeated runs comparable.
pipe00 = Pipeline([
    ('idf&lda', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
            ('lda', LatentDirichletAllocation(n_components=25, random_state=0)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
            ('t', TfidfTransformer()),
        ])),
    ])),
    ('logreg', LogisticRegression(max_iter=5000, tol=1e-4)),
])
pipe001 = Pipeline([
    ('idf', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
            ('t', TfidfTransformer()),
        ])),
    ])),
    ('logreg', LogisticRegression(max_iter=5000, tol=1e-4)),
])
pipe002 = Pipeline([
    ('lda', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
            ('lda', LatentDirichletAllocation(n_components=25, random_state=0)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
        ])),
    ])),
    ('logreg', LogisticRegression(max_iter=5000, tol=1e-4)),
])
pipe003 = Pipeline([
    ('only', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
        ])),
    ])),
    ('logreg', LogisticRegression(max_iter=5000, tol=1e-4)),
])
res = run_pipes([pipe00, pipe001, pipe002, pipe003])
print_table(res)

## SVC

In [None]:
# Four feature-extraction variants feeding the same support-vector classifier:
#   pipe01  - LDA topics        + TF-IDF weighted uni/bigrams
#   pipe011 - filtered counts   + TF-IDF weighted uni/bigrams
#   pipe012 - LDA topics        + raw uni/bigram counts
#   pipe013 - filtered counts   + raw uni/bigram counts
# LDA is stochastic, so it is seeded to make repeated runs comparable.
pipe01 = Pipeline([
    ('idf&lda', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
            ('lda', LatentDirichletAllocation(n_components=25, random_state=0)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
            ('t', TfidfTransformer()),
        ])),
    ])),
    ('svc', SVC(max_iter=5000, tol=1e-4)),
])
pipe011 = Pipeline([
    ('idf', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
            ('t', TfidfTransformer()),
        ])),
    ])),
    ('svc', SVC(max_iter=5000, tol=1e-4)),
])
pipe012 = Pipeline([
    ('lda', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
            ('lda', LatentDirichletAllocation(n_components=25, random_state=0)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
        ])),
    ])),
    ('svc', SVC(max_iter=5000, tol=1e-4)),
])
pipe013 = Pipeline([
    ('only', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
        ])),
    ])),
    ('svc', SVC(max_iter=5000, tol=1e-4)),
])
res = run_pipes([pipe01, pipe011, pipe012, pipe013])
print_table(res)

## SGD Classifier

In [None]:
# Four feature-extraction variants feeding the same SGD classifier:
#   pipe02  - LDA topics        + TF-IDF weighted uni/bigrams
#   pipe021 - filtered counts   + TF-IDF weighted uni/bigrams
#   pipe022 - LDA topics        + raw uni/bigram counts
#   pipe023 - filtered counts   + raw uni/bigram counts
# Both LDA and SGD are stochastic, so they are seeded to make repeated runs comparable.
pipe02 = Pipeline([
    ('idf&lda', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
            ('lda', LatentDirichletAllocation(n_components=25, random_state=0)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
            ('t', TfidfTransformer()),
        ])),
    ])),
    ('sgd', SGDClassifier(max_iter=5000, tol=1e-4, random_state=0)),
])
pipe021 = Pipeline([
    ('idf', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
            ('t', TfidfTransformer()),
        ])),
    ])),
    ('sgd', SGDClassifier(max_iter=5000, tol=1e-4, random_state=0)),
])
pipe022 = Pipeline([
    ('lda', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
            ('lda', LatentDirichletAllocation(n_components=25, random_state=0)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
        ])),
    ])),
    ('sgd', SGDClassifier(max_iter=5000, tol=1e-4, random_state=0)),
])
pipe023 = Pipeline([
    ('only', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
        ])),
    ])),
    ('sgd', SGDClassifier(max_iter=5000, tol=1e-4, random_state=0)),
])
res = run_pipes([pipe02, pipe021, pipe022, pipe023])
print_table(res)

## KNN

In [None]:
# Four feature-extraction variants feeding the same k-nearest-neighbours classifier:
#   pipe03  - LDA topics        + TF-IDF weighted uni/bigrams
#   pipe031 - filtered counts   + TF-IDF weighted uni/bigrams
#   pipe032 - LDA topics        + raw uni/bigram counts
#   pipe033 - filtered counts   + raw uni/bigram counts
# LDA is stochastic, so it is seeded to make repeated runs comparable.
pipe03 = Pipeline([
    ('idf&lda', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
            ('lda', LatentDirichletAllocation(n_components=25, random_state=0)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
            ('t', TfidfTransformer()),
        ])),
    ])),
    ('knn', KNeighborsClassifier(n_neighbors=10)),
])
pipe031 = Pipeline([
    ('idf', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
            ('t', TfidfTransformer()),
        ])),
    ])),
    ('knn', KNeighborsClassifier(n_neighbors=10)),
])
pipe032 = Pipeline([
    ('lda', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
            ('lda', LatentDirichletAllocation(n_components=25, random_state=0)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
        ])),
    ])),
    ('knn', KNeighborsClassifier(n_neighbors=10)),
])
pipe033 = Pipeline([
    ('only', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
        ])),
    ])),
    ('knn', KNeighborsClassifier(n_neighbors=10)),
])

res = run_pipes([pipe03, pipe031, pipe032, pipe033])
print_table(res)

## DTREE

In [None]:
# Four feature-extraction variants feeding the same decision tree:
#   pipe04  - LDA topics        + TF-IDF weighted uni/bigrams
#   pipe041 - filtered counts   + TF-IDF weighted uni/bigrams
#   pipe042 - LDA topics        + raw uni/bigram counts
#   pipe043 - filtered counts   + raw uni/bigram counts
# DecisionTreeClassifier has no `max_iter` or `tol` parameter (passing them
# raises TypeError), so the tree is built with a fixed seed only, like the
# stand-alone `dtree` model. LDA is seeded for the same reproducibility reason.
pipe04 = Pipeline([
    ('idf&lda', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
            ('lda', LatentDirichletAllocation(n_components=25, random_state=0)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
            ('t', TfidfTransformer()),
        ])),
    ])),
    ('dtree', DecisionTreeClassifier(random_state=0)),
])
pipe041 = Pipeline([
    ('idf', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
            ('t', TfidfTransformer()),
        ])),
    ])),
    ('dtree', DecisionTreeClassifier(random_state=0)),
])
pipe042 = Pipeline([
    ('lda', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
            ('lda', LatentDirichletAllocation(n_components=25, random_state=0)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
        ])),
    ])),
    ('dtree', DecisionTreeClassifier(random_state=0)),
])
pipe043 = Pipeline([
    ('only', FeatureUnion([
        ("decomposition", Pipeline([
            ("c", CountVectorizer(stop_words=stopwords, min_df=3)),
        ])),
        ("tfidf", Pipeline([
            ("c", CountVectorizer(ngram_range=(1, 2))),
        ])),
    ])),
    ('dtree', DecisionTreeClassifier(random_state=0)),
])
res = run_pipes([pipe04, pipe041, pipe042, pipe043])
print_table(res)

## Second, work with the dataset from Data World to carry out the training and evaluation of your models.

In [None]:
# Load the second emotion dataset from disk.
df2 = pd.read_csv(filepath_or_buffer="data/text_emotion.csv")

In [None]:
# Preview the first rows of the second dataset.
df2.head()

In [None]:
# Labels stay as a plain array; the raw text becomes the vectoriser's feature matrix.
y2 = np.array(df2["sentiment"])
# NOTE(review): this re-fits the shared `vectoriser` on the second dataset, and
# does so before any train/test split - confirm both are intended.
x2 = vectoriser.fit_transform(np.array(df2["content"]))

# F1 scores for the second dataset, keyed by model.
result2 = dict()

### Logistic Regression

In [None]:
# Hold out 20% of the second dataset for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x2, y2, test_size=0.2, random_state=0
)

# Re-train logistic regression on this dataset, then label the held-out fold.
fitting(X=x_train, y=y_train, mod=logreg)
ypred = predict(X=x_test, mod=logreg)

In [None]:
# Weighted-average F1 of logistic regression on the second dataset.
logreg2_f1 = f1_score(y_true=y_test, y_pred=ypred, average="weighted")
result2['logreg2_f1'] = logreg2_f1

### SVC

In [None]:
# Hold out 20% of the second dataset for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x2, y2, test_size=0.2, random_state=0
)

# Re-train the support-vector classifier on this dataset, then label the held-out fold.
fitting(X=x_train, y=y_train, mod=svclass)
ypred = predict(X=x_test, mod=svclass)

In [None]:
# Weighted-average F1 of the support-vector classifier on the second dataset.
svclass2_f1 = f1_score(y_true=y_test, y_pred=ypred, average="weighted")
result2['svclass2_f1'] = svclass2_f1

### SGD

In [None]:
# Hold out 20% of the second dataset for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x2, y2, test_size=0.2, random_state=0
)

# Re-train the SGD classifier on this dataset, then label the held-out fold.
fitting(X=x_train, y=y_train, mod=sgdc)
ypred = predict(X=x_test, mod=sgdc)

In [None]:
# Weighted-average F1 of the SGD classifier on the second dataset.
sgdc2_f1 = f1_score(y_true=y_test, y_pred=ypred, average="weighted")
result2['sgdc2_f1'] = sgdc2_f1

### KNN

In [None]:
# Hold out 20% of the second dataset for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x2, y2, test_size=0.2, random_state=0
)

# Re-train the k-nearest-neighbours classifier on this dataset, then label the held-out fold.
fitting(X=x_train, y=y_train, mod=knn)
ypred = predict(X=x_test, mod=knn)

In [None]:
# Weighted-average F1 of the k-nearest-neighbours classifier on the second dataset.
knn2_f1 = f1_score(y_true=y_test, y_pred=ypred, average="weighted")
result2['knn2_f1'] = knn2_f1

### Decision Tree

In [None]:
# Hold out 20% of the second dataset for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x2, y2, test_size=0.2, random_state=0
)

# Re-train the decision tree on this dataset, then label the held-out fold.
fitting(X=x_train, y=y_train, mod=dtree)
ypred = predict(X=x_test, mod=dtree)

In [None]:
# Weighted-average F1 of the decision tree on the second dataset.
# Stored under its own name and key so that it does not overwrite the KNN
# entry already recorded in result2.
dtree2_f1 = f1_score(y_test, ypred, average="weighted")
result2['dtree2_f1'] = dtree2_f1

## Analyse - dataframe 2

In [None]:
# Tabulate the second-dataset F1 scores, one row per model.
liste2 = list(result2)
values2 = list(result2.values())

dfresult2 = pd.DataFrame({
    'Model': pd.Series(liste2),
    'F1_score': pd.Series(values2),
})

# A bare last expression gives the rich notebook rendering (as the first
# results table does) instead of the plain-text output of print().
dfresult2

## On the one hand, compare whether the classification results on your first dataset are similar to those on the second. Comment.


## Combine the two datasets to try to improve your prediction results.