In [1]:
# Prepare dataset
import pandas as pd

csv1 = pd.read_csv("emotion-labels-test.csv")
csv2 = pd.read_csv("emotion-labels-train.csv")
csv3 = pd.read_csv("emotion-labels-val.csv")

dataframe = pd.concat( [csv1, csv2, csv3], ignore_index = True)

In [2]:
### Text Preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
stopwords = stopwords.words("english")

for i in range(len(dataframe)):
  # Remove non-letters
  phrase = re.sub( "[^a-zA-Z]", " ", dataframe.loc[i, 'text']).lower().split()

  # Remove suffixes and Remove stop words that are not helpful
  stemmer = PorterStemmer()
  phrase = [ stemmer.stem(word) for word in phrase if word not in stopwords]
  phrase = " ".join(phrase)

  corpus.append( phrase )

In [3]:
### Create Bag of Words [Matrix of numbers which represnt the count of existing words in each phrase]
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer( )
features = vectorizer.fit_transform( corpus ).toarray()
label = dataframe["label"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.2)

In [4]:
### Import evaluaters
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

In [10]:
### Build Logistic Regression
from sklearn.linear_model import LogisticRegression
logistic_reg = LogisticRegression()
logistic_reg.fit( X_train, y_train )
prediction = logistic_reg.predict( X_test )
LR_matrix = confusion_matrix( y_test, prediction )
LR_accuracy = accuracy_score( y_test, prediction )
LR_score = cross_val_score( estimator = logistic_reg, X = X_train, y = y_train, cv = 10)

print(f"Logistic Regression\n{LR_matrix}\nAccuracy: {LR_accuracy}\nScore: {LR_score}\nMean Score: {LR_score.mean()}")

Logistic Regression
[[292  21   3  21]
 [ 11 406   5  22]
 [  9  19 294   8]
 [ 17  40  14 239]]
Accuracy: 0.8662913441238564
Score: [0.82952548 0.82570423 0.85739437 0.85211268 0.83802817 0.86267606
 0.87147887 0.875      0.87852113 0.85915493]
Mean Score: 0.8549595905839252


In [5]:
### K-Nearest Neighbors 
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier( n_neighbors = 5)
KNN.fit( X_train, y_train )

KNN_prediction = KNN.predict( X_test )
KNN_matrix = confusion_matrix( y_test, KNN_prediction )
KNN_accuracy = accuracy_score( y_test, KNN_prediction )
KNN_score = cross_val_score( estimator = KNN, X = X_train, y = y_train, cv = 10)

print(f"K-Nearest Neighbors \n{KNN_matrix}\nAccuracy: {KNN_accuracy}\nScore: {KNN_score}\nMean Score: {KNN_score.mean()}")

K-Nearest Neighbors 
[[226  80  14  17]
 [ 67 350   9  18]
 [ 47 100 169  14]
 [ 59 124  30  97]]
Accuracy: 0.5925404644616468
Score: [0.56766257 0.54753521 0.55633803 0.58274648 0.56514085 0.57570423
 0.55985915 0.5334507  0.60915493 0.58802817]
Mean Score: 0.568562031238397


In [8]:
### Support Vector
from sklearn.svm import SVC
SVC = SVC( kernel = 'rbf')
SVC.fit( X_train, y_train )
SVC_prediction = SVC.predict( X_test )

SVC_matrix = confusion_matrix( y_test, SVC_prediction )
SVC_accuracy = accuracy_score( y_test, SVC_prediction )
# SVC_score = cross_val_score( estimator = SVC, X = X_train, y = y_train, cv = 3)

print(f"SVM \n{SVC_matrix}\nAccuracy: {SVC_accuracy}")

K-Nearest Neighbors 
[[279  33   3  22]
 [  4 415   2  23]
 [  5  39 278   8]
 [ 16  49   8 237]]
Accuracy: 0.8508092892329345


In [6]:
### Random Forest 
from sklearn.ensemble import RandomForestClassifier
FOREST = RandomForestClassifier( n_estimators = 100 )
FOREST.fit( X_train, y_train )
FOREST_prediction = FOREST.predict( X_test )

FOREST_matrix = confusion_matrix( y_test, FOREST_prediction )
FOREST_accuracy = accuracy_score( y_test, FOREST_prediction )
FOREST_score = cross_val_score( estimator = FOREST, X = X_train, y = y_train, cv = 10)

print(f"Random Forest \n{FOREST_matrix}\nAccuracy: {FOREST_accuracy}\nScore: {FOREST_score}\nMean Score: {FOREST_score.mean()}")

K-Nearest Neighbors 
[[275  42   3  17]
 [  6 411   3  24]
 [  1  33 290   6]
 [ 20  49  14 227]]
Accuracy: 0.8465869106263195
Score: [0.84358524 0.82570423 0.86267606 0.85915493 0.85387324 0.83450704
 0.85739437 0.86091549 0.87323944 0.8415493 ]
Mean Score: 0.851259932176539


In [7]:
### Naive Bayes
from sklearn.naive_bayes import GaussianNB
NAIVE_bayes = GaussianNB()
NAIVE_bayes.fit( X_train, y_train )
NB_prediction = NAIVE_bayes.predict( X_test )

NB_matrix = confusion_matrix( y_test, NB_prediction )
NB_accuracy = accuracy_score( y_test, NB_prediction )
NB_score = cross_val_score( estimator = NAIVE_bayes, X = X_train, y = y_train, cv = 10)

print(f"Naive Bayes \n{NB_matrix}\nAccuracy: {NB_accuracy}\nScore: {NB_score}\nMean Score: {NB_score.mean()}")

K-Nearest Neighbors 
[[195  24  57  61]
 [ 33 240  70 101]
 [ 26  18 244  42]
 [ 44  40  66 160]]
Accuracy: 0.5904292751583392
Score: [0.56414763 0.6056338  0.59683099 0.60387324 0.56866197 0.58274648
 0.61267606 0.58978873 0.56690141 0.63028169]
Mean Score: 0.5921541993613704


In [37]:
# Maybe vectorized phrase is wrong ...

test_phrase = "Happy"
vectorized_phrase = vectorizer.transform( [test_phrase] ).toarray()
print(vectorized_phrase)
print(f"Logistic Regression: {logistic_reg.predict( vectorized_phrase )}")
print(f"Support Vector: {SVC.predict( vectorized_phrase )}")
print(f"Random Forest: {FOREST.predict( vectorized_phrase )}")
print(f"K-Nearest Neighbor: {KNN.predict( vectorized_phrase )}")
print(f"Naive Bayes: {NAIVE_bayes.predict( vectorized_phrase )}")

[[0 0 0 ... 0 0 0]]
Logistic Regression: ['fear']
Support Vector: ['fear']
Random Forest: ['fear']
K-Nearest Neighbor: ['anger']
Naive Bayes: ['joy']
