In [69]:
### Import Dataset
import pandas as pd
# Only the first MAX_PHRASES rows of the dataset are used. Passing `nrows`
# makes the parser stop there instead of loading the whole file and then
# dropping everything past that row.
MAX_PHRASES = 20000
dataframe = pd.read_csv("Emotions.csv", nrows = MAX_PHRASES)

In [70]:
### Basic Text Preprocessing
import re
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

print(f"Original number of phrases: {len(dataframe)}")

# Keep the stop-word collection under its own name: rebinding `stopwords`
# would shadow the imported nltk corpus reader and make this cell fail with an
# AttributeError when it is run a second time. A set gives O(1) membership
# tests in the token filter below.
stop_words = set(stopwords.words("english"))

# Both normalizers are built once and reused for every phrase.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

corpus = []            # cleaned phrases, one entry per row that is kept
duplicates = set()     # every cleaned phrase seen so far
duplicate_rows = []    # index labels of rows whose cleaned phrase was already seen

# Iterate over (index label, text) pairs so the loop does not depend on the
# frame having a pristine 0..n-1 index.
for row_label, raw_text in dataframe["text"].items():

  # Lower case and remove non-letters
  tokens = re.sub("[^a-zA-Z]", " ", raw_text).lower().split()

  # Drop stop words, then lemmatize and stem what is left
  text = " ".join(
    stemmer.stem(lemmatizer.lemmatize(word))
    for word in tokens
    if word not in stop_words
  )

  # Only the first occurrence of a cleaned phrase is kept
  if text in duplicates:
    duplicate_rows.append(row_label)
  else:
    duplicates.add(text)
    corpus.append(text)

# Remove all duplicate rows in one call instead of copying the frame once per
# duplicate inside the loop. The surviving rows keep their original index
# labels and stay aligned, in order, with `corpus`.
dataframe = dataframe.drop(duplicate_rows)

print(f"Corpus: {len(corpus)}\nLabels: {len(dataframe)}")

Original number of phrases: 20000
Corpus: 6878
Labels: 6878


In [71]:
### Generate ngrams: 
from nltk.util import ngrams

# Swap every cleaned phrase for the list of its 4-token n-grams (each n-gram is
# a tuple of words). Slice assignment updates the existing `corpus` list in
# place, so after this cell `corpus` holds n-gram lists rather than strings.
corpus[:] = [list(ngrams(sentence.split(), 4)) for sentence in corpus]

In [72]:
### Create a new dataframe pairing each ngram with the sentiment of its source phrase

# Prepare labels: re-number them 0..n-1 so they line up positionally with `corpus`
label = dataframe["label"].reset_index( drop = True )

# Every phrase in the corpus must have exactly one label before the two are paired.
assert len(corpus) == len(label), "corpus and labels are out of sync"

# Walk the corpus and the labels together; every ngram of a phrase inherits the
# label of that phrase. The rows are collected in a plain list and the frame is
# built once at the end, because enlarging a DataFrame one cell at a time
# through .loc gets slower and slower as the frame grows.
records = [
  (" ".join(words), phrase_label)
  for phrase_ngrams, phrase_label in zip(corpus, label)
  for words in phrase_ngrams
]

# Column dtypes are inferred by pandas from the collected records.
dataframe = pd.DataFrame( records, columns = ["Text", "Sentiment"] )

print(f"Count of Texts and Labels: {len(dataframe)}")

Count of Texts and Labels: 43808


In [109]:
### Text vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Inputs for the model: the ngram strings and their sentiment labels.
features, label = dataframe["Text"], dataframe["Sentiment"]

# Learn the TF-IDF vocabulary from the ngrams and encode them in one pass.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(features)

# NOTE(review): the encoded matrix is expanded to a dense array here, which
# needs rows x vocabulary floats of memory. The split cell below indexes the
# result like a dense array, so the conversion is kept.
vectorized_features = tfidf_matrix.toarray()

In [115]:
# # test_df = pd.DataFrame(new_features)
# # for column in test_df:
# #   print(sum(column)) 

# duplicates = set()
# for phrase in features:
#   duplicates.add(phrase)

# print(f"Features: {len(features)} - Duplicates length: {len(duplicates)}")

(43808, 43808)

In [116]:
### Split data
from sklearn.model_selection import train_test_split
# Hold out 20% of the ngrams for testing. The seed is fixed so the split, and
# every score computed from it, is the same on each run of the notebook.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
  vectorized_features, label.values, test_size = 0.2, random_state = SPLIT_SEED
)

# Sizes are read from .shape so the report works for dense and sparse feature
# matrices alike; the second axis is the vocabulary size.
print(f"X_train: {X_train.shape[0]}, y_train: {len(y_train)}, X_test: {X_test.shape[0]}, y_test: {len(y_test)}")
print(f"Words in X_train: {X_train.shape[1]}")

### Prepare evaluation imports
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

X_train: 35046, y_train: 35046, X_test: 8762, y_test: 8762
Words in X_train: 12512


In [75]:
# ### Build Logistic Regression
# from sklearn.linear_model import LogisticRegression
# LR = LogisticRegression( max_iter = 10000 )
# LR.fit( X_train, y_train )
# LR_prediction = LR.predict( X_test )

# LR_matrix = confusion_matrix( y_test, LR_prediction )
# LR_accuracy = accuracy_score( y_test, LR_prediction )
# print(f"Logistic Regression\n{LR_matrix}\n\nAccuracy: {LR_accuracy:.2}\n")

# LR_score = cross_val_score(estimator = LR, y = y_train, X = X_train, cv = 10 )
# print(f"\nScore: {LR_score}\n\nMean Score: {LR_score.mean():.2}")

In [76]:
# ### Build Decision Tree
# from sklearn.tree import DecisionTreeClassifier
# TREE = DecisionTreeClassifier(  )
# TREE.fit( X_train, y_train )
# TREE_prediction = TREE.predict( X_test )

# TREE_matrix = confusion_matrix( y_test, TREE_prediction )
# TREE_accuracy = accuracy_score( y_test, TREE_prediction )
# TREE_score = cross_val_score(estimator = TREE, y = y_train, X = X_train, cv = 10 )
# print(f"Decision Tree\n{TREE_matrix}\n\nAccuracy: {TREE_accuracy:.2}\n\nScore: {TREE_score}\n\nMean Score: {TREE_score.mean():.2}")

In [117]:
### Build Random Forest
# from sklearn.ensemble import RandomForestClassifier
# FOREST = RandomForestClassifier( n_estimators = 100)
# FOREST.fit( X_train, y_train )
# FOREST_prediction = FOREST.predict( X_test )

# FOREST_matrix = confusion_matrix( y_test, FOREST_prediction )
# FOREST_accuracy = accuracy_score( y_test, FOREST_prediction )
# print(f"Random Forest\n{FOREST_matrix}\n\nAccuracy: {FOREST_accuracy:.2}\n")

# FOREST_score = cross_val_score(estimator = FOREST, y = y_train, X = X_train, cv = 10 )
# print(f"\nScore: {FOREST_score}\n\nMean Score: {FOREST_score.mean():.2}")

In [78]:
# ### Build Naive Bayes
# from sklearn.naive_bayes import GaussianNB
# NB = GaussianNB()
# NB.fit( X_train, y_train )
# NB_prediction = NB.predict( X_test )

# NB_matrix = confusion_matrix( y_test, NB_prediction )
# NB_accuracy = accuracy_score( y_test, NB_prediction )
# print(f"Naive Bayes\n{NB_matrix}\n\nAccuracy: {NB_accuracy:.2}\n")

# NB_score = cross_val_score(estimator = NB, y = y_train, X = X_train, cv = 10 )
# print(f"\nScore: {NB_score}\n\nMean Score: {NB_score.mean():.2}")

In [79]:
# ### Build K-Nearest Neighbors
# from sklearn.neighbors import KNeighborsClassifier
# KNN = KNeighborsClassifier( n_neighbors = 5 )
# KNN.fit( X_train, y_train )
# KNN_prediction = KNN.predict( X_test )

# KNN_matrix = confusion_matrix( y_test, KNN_prediction )
# KNN_accuracy = accuracy_score( y_test, KNN_prediction )
# print(f"K-Nearest Neighbors\n{KNN_matrix}\n\nAccuracy: {KNN_accuracy:.2}\n")

# KNN_score = cross_val_score(estimator = KNN, y = y_train, X = X_train, cv = 10 )
# print(f"\nScore: {KNN_score}\n\nMean Score: {KNN_score.mean():.2}")

In [80]:
# ### Build Support Vector Machine
# from sklearn.svm import SVC
# SVC = SVC( kernel = "rbf" )
# SVC.fit( X_train, y_train )
# SVC_prediction = SVC.predict( X_test )

# SVC_matrix = confusion_matrix( y_test, SVC_prediction )
# SVC_accuracy = accuracy_score( y_test, SVC_prediction )
# print(f"SVM\n{SVC_matrix}\n\nAccuracy: {SVC_accuracy:.2}\n")

# SVC_score = cross_val_score(estimator = SVC, y = y_train, X = X_train, cv = 10 )
# print(f"Score: {SVC_score}\n\nMean Score: {SVC_score.mean():.2}")

In [81]:
# raw_test_phrase = "you are amazing"
# test_phrase = stemmer.stem(lemmatizer.lemmatize(raw_test_phrase))
# test_phrase = vectorizer.transform( [test_phrase] ).toarray()

# print(f'''
# Phrase: {raw_test_phrase}

# Logistic Regression: {LR.predict( test_phrase )}
# Decision Tree: {TREE.predict( test_phrase )}
# Random Forest: {FOREST.predict( test_phrase )}
# Support Vector Machine: {SVC.predict( test_phrase )}
# Naive Bayes: {NB.predict( test_phrase )}
# K-Nearest Neighbor: {KNN.predict( test_phrase )}
# ''')