In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import re
import nltk
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.svm import SVC

import torch
import torch.nn as nn
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from gensim.models import KeyedVectors, Word2Vec

# text pre-processing
!pip install pyspellchecker
from spellchecker import SpellChecker

# Lime
!pip install lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from lime.lime_text import IndexedString, IndexedCharacters
from lime.lime_base import LimeBase
from lime.lime_text import explanation

from google.colab import drive

# NLTK downloads
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words("english"))

Collecting pyspellchecker
  Downloading pyspellchecker-0.7.2-py3-none-any.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=7f972c70f1b95b6b4fe84d1e2a8c8574e22bbc43880653bae81d4294fc9ff2cc
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [3]:
file_path = "/content/drive/MyDrive/Neural Network/data/text_emotion.csv"
data = pd.read_csv(file_path)
data = data[['sentiment','content']]
data = data.rename(columns={'sentiment': 'Emotion', 'content': 'Text'})

print(data['Emotion'].value_counts())
data.head()



neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: Emotion, dtype: int64


Unnamed: 0,Emotion,Text
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


# 1. Cleaning

In [4]:
spell = SpellChecker()
lemmatizer= WordNetLemmatizer()
le = LabelEncoder()
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'website', text)

def clean_text(text):
    ## Remove at(username)
    pattern = r'@[^@\s]+'
    text = re.sub(pattern, '', text)

    ## Remove punctuations
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
    text = text.replace('؛',"", )

    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    text =  " ".join(text.split())
    return text.strip()

def correct_word(word):
    corrected_word = spell.correction(word)

    if corrected_word is not None:
        return corrected_word
    else:
        return word

def preprocess_text(text):
  # Look into custom tokenizer later
  words = nltk.word_tokenize(text)
  process_words = []
  for word in words:
    if word.isdigit():
      continue

    word = word.lower()

    if word in stop_words:
      continue

    # word  = correct_word(word)

    process_words.append(word)

  return process_words

def normalize_and_tokenize(text):
  # Look into custom tokenizer later
  text = remove_urls(text)
  text = clean_text(text)
  return preprocess_text(text)

In [5]:
data['tokens'] = data.Text.apply(lambda Text : normalize_and_tokenize(Text))
data.head()

Unnamed: 0,Emotion,Text,tokens
0,empty,@tiffanylue i know i was listenin to bad habi...,"[know, listenin, bad, habit, earlier, started,..."
1,sadness,Layin n bed with a headache ughhhh...waitin o...,"[layin, n, bed, headache, ughhhh, waitin, call]"
2,sadness,Funeral ceremony...gloomy friday...,"[funeral, ceremony, gloomy, friday]"
3,enthusiasm,wants to hang out with friends SOON!,"[wants, hang, friends, soon]"
4,neutral,@dannycastillo We want to trade with someone w...,"[want, trade, someone, houston, tickets, one]"


In [6]:
# Split the data into a training set and a temporary set
df_train, df_temp = train_test_split(data, test_size=0.3, random_state=42)

# Now, split the temporary set into validation and test sets
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

# 2. Modelling

## TF_IDF

In [7]:
#Preprocess text
X_train = df_train['Text'].values
y_train = df_train['Emotion'].values

X_test = df_test['Text'].values
y_test = df_test['Emotion'].values

X_val = df_val['Text'].values
y_val = df_val['Emotion'].values

In [8]:
def train_model(model, data, targets):
    # Create a Pipeline object with a TfidfVectorizer and the given model
    text_clf = Pipeline([('vect',TfidfVectorizer()),
                         ('clf', model)])
    # Fit the model on the data and targets
    text_clf.fit(data, targets)
    return text_clf

In [9]:
def get_F1(trained_model,X,y):
    # Make predictions on the input data using the trained model
    predicted=trained_model.predict(X)
    # Calculate the F1 score for the predictions
    f1=f1_score(y,predicted, average=None)
    # Return the F1 score
    return f1

### Logistic Regression

In [10]:
#Train the model with the training data
log_reg = train_model(LogisticRegression(solver='liblinear',random_state = 0), X_train, y_train)

#Make a single prediction
y_pred=log_reg.predict(['Happy'])
y_pred

array(['happiness'], dtype=object)

In [11]:
#test the model with the test data
y_pred=log_reg.predict(X_test)

#calculate the accuracy
log_reg_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', log_reg_accuracy,'\n')

#calculate the F1 score
f1_Score = get_F1(log_reg,X_test,y_test)
pd.DataFrame(f1_Score, index=df_train.Emotion.unique(), columns=['F1 score'])

Accuracy:  0.35533333333333333 



Unnamed: 0,F1 score
sadness,0.0
boredom,0.0
neutral,0.0
worry,0.0
happiness,0.021583
enthusiasm,0.363073
love,0.219409
hate,0.441706
surprise,0.426738
fun,0.014035


### Decision Tree

In [12]:
#Train the model with the training data
DT = train_model(DecisionTreeClassifier(random_state = 0), X_train, y_train)

#test the model with the test data
y_pred=DT.predict(X_test)

#calculate the accuracy
DT_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', DT_accuracy,'\n')

#calculate the F1 score
f1_Score = get_F1(DT,X_test,y_test)
pd.DataFrame(f1_Score, index=df_train.Emotion.unique(), columns=['F1 score'])

Accuracy:  0.24366666666666667 



Unnamed: 0,F1 score
sadness,0.0
boredom,0.0
neutral,0.018349
worry,0.042105
happiness,0.060185
enthusiasm,0.258883
love,0.116343
hate,0.277003
surprise,0.334858
fun,0.068807


### SVM

In [13]:
#Train the model with the training data
SVM = train_model(SVC(random_state = 0), X_train, y_train)

#test the model with the test data
y_pred=SVM.predict(X_test)

#calculate the accuracy
SVM_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', SVM_accuracy,'\n')

#calculate the F1 score
f1_Score = get_F1(SVM,X_test,y_test)
pd.DataFrame(f1_Score, index=df_train.Emotion.unique(), columns=['F1 score'])

Accuracy:  0.3515 



Unnamed: 0,F1 score
sadness,0.0
boredom,0.0
neutral,0.0
worry,0.0
happiness,0.0
enthusiasm,0.363985
love,0.190871
hate,0.435045
surprise,0.428733
fun,0.00722


### Random Forest

In [14]:
#Train the model with the training data
RF = train_model(RandomForestClassifier(random_state = 0), X_train, y_train)

#test the model with the test data
y_pred=RF.predict(X_test)

#calculate the accuracy
RF_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', RF_accuracy,'\n')

#calculate the F1 score
f1_Score = get_F1(RF, X_test, y_test)
pd.DataFrame(f1_Score, index=df_train.Emotion.unique(), columns=['F1 score'])

Accuracy:  0.33216666666666667 



Unnamed: 0,F1 score
sadness,0.0
boredom,0.0
neutral,0.0
worry,0.0
happiness,0.014815
enthusiasm,0.314244
love,0.191667
hate,0.395626
surprise,0.420857
fun,0.007143


### Results

In [15]:
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree','Support Vector Machine','Random Forest'],
    'Accuracy': [log_reg_accuracy.round(2), DT_accuracy.round(2), SVM_accuracy.round(2), RF_accuracy.round(2)]})

models.sort_values(by='Accuracy', ascending=False).reset_index().drop(['index'], axis=1)

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.36
1,Support Vector Machine,0.35
2,Random Forest,0.33
3,Decision Tree,0.24
