In [14]:
### Import Dataset
import pandas as pd
dataframe = pd.read_csv("Emotions.csv")
dataframe = dataframe.drop( range(10000, len(dataframe)) )

In [15]:
### Basic Text Preprocessing
import re
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

print(f"Original number of phrases: {len(dataframe)}")

stopwords = stopwords.words("english")
corpus = []
duplicates = set()

for i in range(len(dataframe)):

  # Lower case and Remove non-letters
  text = re.sub("[^a-zA-Z]", " ", dataframe.loc[i, "text"]).lower().split()
  
  # Lemmatization and Stemming
  stemmer = PorterStemmer()
  lemmatizer = WordNetLemmatizer()
  text = [stemmer.stem(lemmatizer.lemmatize(word)) for word in text if word not in stopwords]
  text = " ".join(text)
  
  ### Remove duplicate phrases
  if text in duplicates:
    dataframe = dataframe.drop(i)
  else:
    # Add text to corpus
    corpus.append(text)

  duplicates.add(text)  

print(f"Corpus: {len(corpus)}\nLabels: {len(dataframe)}")

Original number of phrases: 10000
Corpus: 6878
Labels: 6878


In [16]:
### Create TF-IDF Features

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorized_features = vectorizer.fit_transform(corpus)
wordset = vectorizer.get_feature_names_out()

In [17]:
# ### Visualize scores
# for word in wordset:
#   index = vectorizer.vocabulary_.get(word)
#   print(f"{word}: {vectorizer.idf_[index]}")

### Prepare feature and label
vectorized_features = vectorized_features.toarray()
label = dataframe["label"]

In [18]:
### Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( vectorized_features, label.values, test_size = 0.2 )

print(f"X_train: {len(X_train)}, y_train: {len(y_train)}, X_test: {len(X_test)}, y_test: {len(y_test)}")
print(f"Words in X_train: {len(X_train[0])}")

### Prepare evaluation imports
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

X_train: 5502, y_train: 5502, X_test: 1376, y_test: 1376
Words in X_train: 12655


In [19]:
### Build Logistic Regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression( max_iter = 10000 )
LR.fit( X_train, y_train )
LR_prediction = LR.predict( X_test )

LR_matrix = confusion_matrix( y_test, LR_prediction )
LR_accuracy = accuracy_score( y_test, LR_prediction )
print(f"Logistic Regression\n{LR_matrix}\n\nAccuracy: {LR_accuracy:.2}\n")

LR_score = cross_val_score(estimator = LR, y = y_train, X = X_train, cv = 10 )
print(f"\nScore: {LR_score}\n\nMean Score: {LR_score.mean():.2}")

Logistic Regression
[[285  29   8   5]
 [  7 413   7  14]
 [  8  31 300   5]
 [ 12  33   5 214]]

Accuracy: 0.88


Score: [0.8784029  0.88747731 0.89272727 0.89818182 0.87454545 0.83636364
 0.86909091 0.85636364 0.85272727 0.86      ]

Mean Score: 0.87


In [20]:
# ### Build Decision Tree
# from sklearn.tree import DecisionTreeClassifier
# TREE = DecisionTreeClassifier(  )
# TREE.fit( X_train, y_train )
# TREE_prediction = TREE.predict( X_test )

# TREE_matrix = confusion_matrix( y_test, TREE_prediction )
# TREE_accuracy = accuracy_score( y_test, TREE_prediction )
# TREE_score = cross_val_score(estimator = TREE, y = y_train, X = X_train, cv = 10 )
# print(f"Decision Tree\n{TREE_matrix}\n\nAccuracy: {TREE_accuracy:.2}\n\nScore: {TREE_score}\n\nMean Score: {TREE_score.mean():.2}")

In [21]:
# ## Build Random Forest
# from sklearn.ensemble import RandomForestClassifier
# FOREST = RandomForestClassifier( n_estimators = 100)
# FOREST.fit( X_train, y_train )
# FOREST_prediction = FOREST.predict( X_test )

# FOREST_matrix = confusion_matrix( y_test, FOREST_prediction )
# FOREST_accuracy = accuracy_score( y_test, FOREST_prediction )
# print(f"Random Forest\n{FOREST_matrix}\n\nAccuracy: {FOREST_accuracy:.2}\n")

# FOREST_score = cross_val_score(estimator = FOREST, y = y_train, X = X_train, cv = 10 )
# print(f"\nScore: {FOREST_score}\n\nMean Score: {FOREST_score.mean():.2}")

In [22]:
# ### Build Naive Bayes
# from sklearn.naive_bayes import GaussianNB
# NB = GaussianNB()
# NB.fit( X_train, y_train )
# NB_prediction = NB.predict( X_test )

# NB_matrix = confusion_matrix( y_test, NB_prediction )
# NB_accuracy = accuracy_score( y_test, NB_prediction )
# print(f"Naive Bayes\n{NB_matrix}\n\nAccuracy: {NB_accuracy:.2}\n")

# NB_score = cross_val_score(estimator = NB, y = y_train, X = X_train, cv = 10 )
# print(f"\nScore: {NB_score}\n\nMean Score: {NB_score.mean():.2}")

In [23]:
# ### Build K-Nearest Neighbors
# from sklearn.neighbors import KNeighborsClassifier
# KNN = KNeighborsClassifier( n_neighbors = 5 )
# KNN.fit( X_train, y_train )
# KNN_prediction = KNN.predict( X_test )

# KNN_matrix = confusion_matrix( y_test, KNN_prediction )
# KNN_accuracy = accuracy_score( y_test, KNN_prediction )
# print(f"K-Nearest Neighbors\n{KNN_matrix}\n\nAccuracy: {KNN_accuracy:.2}\n")

# KNN_score = cross_val_score(estimator = KNN, y = y_train, X = X_train, cv = 10 )
# print(f"\nScore: {KNN_score}\n\nMean Score: {KNN_score.mean():.2}")

In [24]:
# ### Build Support Vector Machine
# from sklearn.svm import SVC
# SVC = SVC( kernel = "rbf" )
# SVC.fit( X_train, y_train )
# SVC_prediction = SVC.predict( X_test )

# SVC_matrix = confusion_matrix( y_test, SVC_prediction )
# SVC_accuracy = accuracy_score( y_test, SVC_prediction )
# print(f"SVM\n{SVC_matrix}\n\nAccuracy: {SVC_accuracy:.2}\n")

# SVC_score = cross_val_score(estimator = SVC, y = y_train, X = X_train, cv = 10 )
# print(f"Score: {SVC_score}\n\nMean Score: {SVC_score.mean():.2}")

In [32]:
raw_test_phrase = "I hate your love"
test_phrase = stemmer.stem(lemmatizer.lemmatize(raw_test_phrase))
test_phrase = vectorizer.transform( [test_phrase] ).toarray()

print(f"Phrase: {raw_test_phrase}\n")
print(f"Logistic Regression: {LR.predict( test_phrase )}\n")
# print(f"Decision Tree: {TREE.predict( test_phrase )}\n")
# print(f"Random Forest: {FOREST.predict( test_phrase )}\n")
# print(f"Support Vector Machine: {SVC.predict( test_phrase )}\n")
# print(f"Naive Bayes: {NB.predict( test_phrase )}\n")
# print(f"K-Nearest Neighbor: {KNN.predict( test_phrase )}\n")



Phrase: I hate your love!

Logistic Regression: ['fear']

