In [92]:
### Import Dataset
import pandas as pd
dataframe = pd.read_csv("Emotions.csv")
# dataframe = dataframe.drop( range(20000, len(dataframe)) )

In [93]:
### Basic Text Preprocessing
import re
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

print(f"Original number of phrases: {len(dataframe)}")

stopwords = stopwords.words("english")
corpus = []
duplicates = set()

for i in range(len(dataframe)):

  # Lower case and Remove non-letters
  text = re.sub("[^a-zA-Z]", " ", dataframe.loc[i, "text"]).lower().split()
  
  # Lemmatization and Stemming
  stemmer = PorterStemmer()
  lemmatizer = WordNetLemmatizer()
  text = [stemmer.stem(lemmatizer.lemmatize(word)) for word in text if word not in stopwords]
  text = " ".join(text)
  
  # Remove duplicate phrases
  if text in duplicates:
    dataframe = dataframe.drop(i)
  else:
    # Add text to corpus
    corpus.append(text)

  duplicates.add(text)  

print(f"Corpus: {len(corpus)} phrases")

Original number of phrases: 62612
Corpus: 26651 phrases


In [94]:
### Create Bag of Words and its dataframe
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
features = vectorizer.fit_transform( corpus ).toarray()
label = dataframe['label']
label = label.reset_index( drop = True )

wordset = vectorizer.get_feature_names_out()
dataframe_BoW = pd.DataFrame(features, columns = wordset)

print(f"Corpus: {len(corpus)}\nLabel: {len(label)}")

Corpus: 26651
Label: 26651


In [95]:
### Create dictionary of sorted {Word: Frequency}
dict_BoW = {}

for word in dataframe_BoW.columns:
  total = sum(dataframe_BoW[word])
  dict_BoW[word] = total

dict_BoW = dict(reversed(sorted( dict_BoW.items(), key = lambda x: x[1] )))

In [96]:
### Save words with frequency less than 40
words_to_remove = set() 
for word, count in dict_BoW.items():
  if (count < 40):
    words_to_remove.add(word)
print(f"Words to Remove: {len(words_to_remove)} out of {len(features[0])}")

Words to Remove: 18051 out of 19140


In [97]:
### Remove the unfrequent words from the corpus

indicies_to_remove = [] # To later remove them from the label and the corpus

for i in range(len(corpus)):
  phrase = corpus[i]
  new_phrase = []

  # Keep the words that are not in the not_frequent list  
  for word in phrase.split():
    if not (word in words_to_remove):
      new_phrase.append(word)
  
  new_phrase = " ".join(new_phrase)
  # If the phrase is empty (all the words in the phrase are not frequent), save the phrase's index to remove it and remove its label 
  if new_phrase == "":
    indicies_to_remove.append(i)
  
  # Replace the current phrase in the corpus with the new_phrase
  corpus[i] = new_phrase


# Remove the empty phrases and their label
for index in indicies_to_remove:
  label = label.drop( index )
  corpus.remove( "" )

print(f"Corpus: {len(corpus)}\nLabel: {len(label)}")

Corpus: 26555
Label: 26555


In [98]:
### Create new Bag of Words with new features

vectorizer = CountVectorizer()
features = vectorizer.fit_transform( corpus ).toarray()

print(f"Phrases in features: {len(features)}\nWords in corpus: {len(features[0])}\nLabels: {len(label)}")

Phrases in features: 26555
Words in corpus: 1089
Labels: 26555


In [99]:
### Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( features, label.values, test_size = 0.2 )

print(f"X_train: {len(X_train)}, y_train: {len(y_train)}, X_test: {len(X_test)}, y_test: {len(y_test)}")
print(f"Words in X_train: {len(X_train[0])}")

### Prepare evaluation imports
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

X_train: 21244, y_train: 21244, X_test: 5311, y_test: 5311
Length of element in X_train: 1089


In [100]:
### Build Logistic Regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression( max_iter = 10000 )
LR.fit( X_train, y_train )
LR_prediction = LR.predict( X_test )

LR_matrix = confusion_matrix( y_test, LR_prediction )
LR_accuracy = accuracy_score( y_test, LR_prediction )
print(f"Logistic Regression\n{LR_matrix}\n\nAccuracy: {LR_accuracy:.2}\n")

LR_score = cross_val_score(estimator = LR, y = y_train, X = X_train, cv = 10 )
print(f"\nScore: {LR_score}\n\nMean Score: {LR_score.mean():.2}")

Logistic Regression
[[ 592   60   65   10   72    0]
 [  59  768   85    6   60   16]
 [  41   50 1421   49   73   11]
 [  12    6  100  189   18    0]
 [  57   58   84    8 1179    6]
 [   1   32   21    1   10   91]]

Accuracy: 0.8


Score: [0.79670588 0.80376471 0.79482353 0.80141176 0.7933145  0.80508475
 0.79708098 0.77824859 0.80037665 0.80320151]

Mean Score: 0.8


In [101]:
### Build Decision Tree
from sklearn.tree import DecisionTreeClassifier
TREE = DecisionTreeClassifier(  )
TREE.fit( X_train, y_train )
TREE_prediction = TREE.predict( X_test )

TREE_matrix = confusion_matrix( y_test, TREE_prediction )
TREE_accuracy = accuracy_score( y_test, TREE_prediction )
TREE_score = cross_val_score(estimator = TREE, y = y_train, X = X_train, cv = 10 )
print(f"Decision Tree\n{TREE_matrix}\n\nAccuracy: {TREE_accuracy:.2}\n\nScore: {TREE_score}\n\nMean Score: {TREE_score.mean():.2}")

Decision Tree
[[ 594   58   72   16   58    1]
 [  62  752   83    5   64   28]
 [ 138  112 1120  105  149   21]
 [  11    9   90  196   18    1]
 [  79   82   91   21 1106   13]
 [   0   16   13    2    8  117]]

Accuracy: 0.73

Score: [0.72564706 0.73129412 0.72988235 0.72705882 0.72269303 0.72834275
 0.73258004 0.71798493 0.72175141 0.72975518]

Mean Score: 0.73


In [102]:
### Build Random Forest
from sklearn.ensemble import RandomForestClassifier
FOREST = RandomForestClassifier( n_estimators = 100)
FOREST.fit( X_train, y_train )
FOREST_prediction = FOREST.predict( X_test )

FOREST_matrix = confusion_matrix( y_test, FOREST_prediction )
FOREST_accuracy = accuracy_score( y_test, FOREST_prediction )
print(f"Random Forest\n{FOREST_matrix}\n\nAccuracy: {FOREST_accuracy:.2}\n")

FOREST_score = cross_val_score(estimator = FOREST, y = y_train, X = X_train, cv = 10 )
print(f"\nScore: {FOREST_score}\n\nMean Score: {FOREST_score.mean():.2}")

Random Forest
[[ 616   46   66   14   57    0]
 [  54  787   60    5   58   30]
 [  66   57 1322   76  103   21]
 [  11    7   84  210   12    1]
 [  59   63   83   23 1155    9]
 [   2   17   10    1    4  122]]

Accuracy: 0.79


Score: [0.79670588 0.8        0.78823529 0.80611765 0.79425612 0.79896422
 0.80178908 0.78060264 0.79519774 0.79755179]

Mean Score: 0.8


In [103]:
### Build Naive Bayes
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB()
NB.fit( X_train, y_train )
NB_prediction = NB.predict( X_test )

NB_matrix = confusion_matrix( y_test, NB_prediction )
NB_accuracy = accuracy_score( y_test, NB_prediction )
print(f"Naive Bayes\n{NB_matrix}\n\nAccuracy: {NB_accuracy:.2}\n")

NB_score = cross_val_score(estimator = NB, y = y_train, X = X_train, cv = 10 )
print(f"\nScore: {NB_score}\n\nMean Score: {NB_score.mean():.2}")

Naive Bayes
[[301  41   9 228  17 203]
 [ 28 406  12 200  15 333]
 [ 43  62 455 606  15 464]
 [  8   7   9 205   6  90]
 [ 69 110  14 318 482 399]
 [  4  12   8  34   3  95]]

Accuracy: 0.37


Score: [0.35435294 0.38258824 0.38917647 0.36564706 0.37146893 0.37288136
 0.36016949 0.35499058 0.37947269 0.34934087]

Mean Score: 0.37


In [104]:
### Build K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier( n_neighbors = 5 )
KNN.fit( X_train, y_train )
KNN_prediction = KNN.predict( X_test )

KNN_matrix = confusion_matrix( y_test, KNN_prediction )
KNN_accuracy = accuracy_score( y_test, KNN_prediction )
print(f"K-Nearest Neighbors\n{KNN_matrix}\n\nAccuracy: {KNN_accuracy:.2}\n")

KNN_score = cross_val_score(estimator = KNN, y = y_train, X = X_train, cv = 10 )
print(f"\nScore: {KNN_score}\n\nMean Score: {KNN_score.mean():.2}")

K-Nearest Neighbors
[[ 516   65  126   28   64    0]
 [ 174  553  150   19   91    7]
 [ 199  183 1055   67  129   12]
 [  33   30  127  104   31    0]
 [ 211  149  330   44  654    4]
 [  13   31   42   10   22   38]]

Accuracy: 0.55


Score: [0.55623529 0.55482353 0.552      0.57694118 0.56167608 0.56450094
 0.5480226  0.5579096  0.56497175 0.58145009]

Mean Score: 0.56


In [105]:
### Build Support Vector Machine
from sklearn.svm import SVC
SVC = SVC( kernel = "rbf" )
SVC.fit( X_train, y_train )
SVC_prediction = SVC.predict( X_test )

SVC_matrix = confusion_matrix( y_test, SVC_prediction )
SVC_accuracy = accuracy_score( y_test, SVC_prediction )
print(f"SVM\n{SVC_matrix}\n\nAccuracy: {SVC_accuracy:.2}\n")

SVC_score = cross_val_score(estimator = SVC, y = y_train, X = X_train, cv = 10 )
print(f"Score: {SVC_score}\n\nMean Score: {SVC_score.mean():.2}")

SVM
[[ 572   47   88    8   83    1]
 [  57  770   96    5   65    1]
 [  48   52 1431   28   74   12]
 [   4    4  143  144   30    0]
 [  50   52  100   11 1174    5]
 [   1   36   30    0   12   77]]

Accuracy: 0.78

Score: [0.78776471 0.79858824 0.77270588 0.792      0.77448211 0.78766478
 0.78672316 0.7740113  0.78201507 0.78531073]

Mean Score: 0.78


In [108]:
raw_test_phrase = "you are amazing"
test_phrase = stemmer.stem(lemmatizer.lemmatize(raw_test_phrase))
test_phrase = vectorizer.transform( [test_phrase] ).toarray()

print(f'''
Phrase: {raw_test_phrase}

Logistic Regression: {LR.predict( test_phrase )}
Decision Tree: {TREE.predict( test_phrase )}
Random Forest: {FOREST.predict( test_phrase )}
Support Vector Machine: {SVC.predict( test_phrase )}
Naive Bayes: {NB.predict( test_phrase )}
K-Nearest Neighbor: {KNN.predict( test_phrase )}
''')


Phrase: you are amazing

Logistic Regression: ['joy']
Decision Tree: ['joy']
Random Forest: ['joy']
Support Vector Machine: ['joy']
Naive Bayes: ['surprise']
K-Nearest Neighbor: ['joy']

