In [1]:
#imports the necessary libraries

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
import gensim
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw')
nltk.download('omw-1.4')
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
print('done')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mateu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mateu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw to
[nltk_data]     C:\Users\mateu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mateu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


done


In [2]:
#creates a function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    text = ' '.join(tokens)
    return text

print('done')

done


In [3]:
#reads the data from excel sheets and runs the preprocessing function through the data
dfTrain = pd.read_csv("train.csv")
dfTest = pd.read_csv("test.csv")
dfTrain = dfTrain.sample(frac=0.3, random_state=40)
dfTrain['Text'] = dfTrain['Text'].apply(preprocess_text)
dfTest['Text'] = dfTest['Text'].apply(preprocess_text)

y_train = dfTrain['Sentiment']
y_test = dfTest['Sentiment']
print('done')

done


In [4]:
#Bag-of-Words

count = CountVectorizer()
X_Bagtrain = count.fit_transform(dfTrain['Text'])
X_Bagtest = count.transform(dfTest['Text'])
y_train = dfTrain['Sentiment']
y_test = dfTest['Sentiment']
print('done')

done


In [5]:
#TF-IDF

tfidf = TfidfVectorizer()
X_TFtrain = tfidf.fit_transform(dfTrain['Text'])
X_TFtest = tfidf.transform(dfTest['Text'])
print('done')

done


In [6]:
df = pd.concat([dfTrain, dfTest], ignore_index=True)
tokenized_train = [nltk.word_tokenize(text) for text in dfTrain['Text']]
model = Word2Vec(sentences=df['Text'], vector_size=100, window=5, min_count=1, workers=4)
tokenized_test = [nltk.word_tokenize(text) for text in dfTest['Text']]

TEvectorized_data = []
#for loop that iterates through the testing data
for sentence in dfTest['Text']:
    sentence_list = nltk.word_tokenize(sentence)
    temp_vector_list = []
    for word in sentence_list:
        if word in model.wv:
            word_vector = model.wv[word]
            temp_vector_list.append(word_vector)
    if temp_vector_list:
        averaged_vector = np.average(temp_vector_list, axis=0)
        TEvectorized_data.append(averaged_vector)
    else:
        TEvectorized_data.append(np.zeros(100))

#for loop that goes through the training data
TRvectorized_data = []
for sentence in dfTrain['Text']:
    sentence_list = nltk.word_tokenize(sentence)
    temp_vector_list = []
    for word in sentence:
        if word in model.wv:
            word_vector = model.wv[word]
            temp_vector_list.append(word_vector)
    if temp_vector_list:
        averaged_vector = np.average(temp_vector_list, axis=0)
        TRvectorized_data.append(averaged_vector)
    else:
        TRvectorized_data.append(np.zeros(100))

print('done')

done


In [7]:

#imports models for classification
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, roc_curve, auc, classification_report


#defines names for the models
lcBag = LogisticRegression(max_iter=1000)
lcTF = LogisticRegression(max_iter=1000)
lcW2V = LogisticRegression(max_iter=1000)
neigh = KNeighborsClassifier(n_neighbors=3)
svc = SVC(probability=True)
rfc = RandomForestClassifier()
lcBag.fit(X_Bagtrain, y_train)
lcTF.fit(X_TFtrain, y_train)
lcW2V.fit(TRvectorized_data, y_train)

print('done')

done


In [8]:

#compares the performance of bag of words, tf-idf, and word2vec
y_lcB_predicted = lcBag.predict(X_Bagtest)
y_lcB_pred_proba = lcBag.predict_proba(X_Bagtest)
y_lcT_predicted = lcBag.predict(X_TFtest)
y_lcT_pred_proba = lcBag.predict_proba(X_TFtest)
y_lcW_predicted = lcW2V.predict(TEvectorized_data)
y_lcW_pred_proba = lcW2V.predict_proba(TEvectorized_data)
print(classification_report(y_test, y_lcB_predicted))
print(classification_report(y_test, y_lcT_predicted))
print(classification_report(y_test, y_lcW_predicted))
print('done')

              precision    recall  f1-score   support

           0       0.69      0.90      0.78       177
           1       0.86      0.60      0.71       182

    accuracy                           0.75       359
   macro avg       0.77      0.75      0.74       359
weighted avg       0.77      0.75      0.74       359

              precision    recall  f1-score   support

           0       0.59      0.97      0.73       177
           1       0.93      0.34      0.50       182

    accuracy                           0.65       359
   macro avg       0.76      0.66      0.62       359
weighted avg       0.76      0.65      0.61       359

              precision    recall  f1-score   support

           0       0.50      0.69      0.58       177
           1       0.52      0.32      0.40       182

    accuracy                           0.51       359
   macro avg       0.51      0.51      0.49       359
weighted avg       0.51      0.51      0.49       359

done


In [9]:

dfTrain = dfTrain.sample(frac=0.07, random_state=40)
y_train = dfTrain['Sentiment']
count = CountVectorizer()
X_Bagtrain = count.fit_transform(dfTrain['Text'])
X_Bagtest = count.transform(dfTest['Text'])
y_train = dfTrain['Sentiment']

#since bag-of-words was the best model, this code uses the data from bag-of-words to test using other models
neigh.fit(X_Bagtrain, y_train)
y_neigh_predicted = neigh.predict(X_Bagtest)
y_neigh_pred_proba = neigh.predict_proba(X_Bagtest)
print(classification_report(y_test, y_neigh_predicted))



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score   support

           0       0.56      0.86      0.68       177
           1       0.72      0.35      0.47       182

    accuracy                           0.60       359
   macro avg       0.64      0.60      0.57       359
weighted avg       0.64      0.60      0.57       359



In [10]:
svc.fit(X_Bagtrain, y_train)
y_svc_predicted = svc.predict(X_Bagtest)
y_svc_pred_proba = svc.predict_proba(X_Bagtest)
print(classification_report(y_test, y_svc_predicted))

              precision    recall  f1-score   support

           0       0.59      0.99      0.74       177
           1       0.97      0.34      0.50       182

    accuracy                           0.66       359
   macro avg       0.78      0.66      0.62       359
weighted avg       0.78      0.66      0.62       359



In [11]:
rfc.fit(X_Bagtrain, y_train)
y_rfc_predicted = rfc.predict(X_Bagtest)
y_rfc_pred_proba = rfc.predict_proba(X_Bagtest)
print(classification_report(y_test, y_rfc_predicted))

              precision    recall  f1-score   support

           0       0.56      0.98      0.71       177
           1       0.92      0.26      0.41       182

    accuracy                           0.62       359
   macro avg       0.74      0.62      0.56       359
weighted avg       0.75      0.62      0.56       359

