In [None]:
import sys
# add the parent directory to this session's import path (only sys.path is changed, no system environment variables)
sys.path.append("..")

# importing all needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

import time
from tqdm import tqdm

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings (e.g. convergence
# or deprecation notices) — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

# Random seed; passed as random_state to train_test_split further down
RSEED = 42

In [None]:
# Load the review CSV into a dataframe.
# NOTE(review): no nrows limit is passed, so the whole file is read — not only the
# first 100k lines as this comment used to say; add nrows=100_000 if a cap is intended.

dfr = pd.read_csv('../data/yelp_dataset/review_1819_eng.csv')

In [None]:
# Build the English stop-word list that is handed to the vectorizer.
# On a fresh environment the NLTK stop-word corpus may not be installed, in which
# case NLTK raises LookupError; download it once and retry so that a clean
# "Restart & Run All" succeeds.
# NOTE: the name `stopwords` is kept because the vectorizer cell reads it, even
# though it rebinds the `stopwords` module imported from nltk.corpus above.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')


In [None]:
# Characters that are stripped from the review text before vectorizing
punctuation = ['"', '(', ')', '-', '$', ',', '+', "'", "\n", "\r"]

# Deletion table built once from `punctuation`, so each string is cleaned in a
# single pass instead of a per-character list lookup.
_PUNCTUATION_TABLE = str.maketrans('', '', ''.join(punctuation))


def clean_text(text):
    """Return `text` with every character listed in `punctuation` removed.

    Parameters
    ----------
    text : str or missing value
        The raw text. ``None`` and float NaN (what pandas yields for empty
        CSV cells) are treated as missing.

    Returns
    -------
    str
        The cleaned text; an empty string for missing input.
    """
    # Missing values would otherwise raise TypeError ("float is not iterable").
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return str(text).translate(_PUNCTUATION_TABLE)

In [None]:
# Strip the unwanted characters from every review, overwriting the 'text' column
dfr['text'] = dfr['text'].map(clean_text)

In [None]:
# Initialize the TF-IDF vectorizer (parameters following Susan Li)
tfidf_params = {
    'stop_words': stopwords,    # the NLTK English list built above
    'ngram_range': (1, 2),      # unigrams and bigrams
    'encoding': 'latin-1',
    'norm': 'l2',
    'min_df': 5,
    'sublinear_tf': True,
}
vectorizer = TfidfVectorizer(**tfidf_params)



In [None]:
# Feature (review text) and target (star rating)
X = dfr['text']
y = dfr['stars']

# Hold out a test set; the seed keeps the split reproducible
text_train, text_test, y_train, y_test = train_test_split(X, y, random_state=RSEED)

# Learn the vocabulary on the training texts only, then reuse it for the test texts
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)


In [None]:

# Multinomial Naive Bayes on the TF-IDF features
MNB = MultinomialNB()

# fit the model
MNB.fit(X_train, y_train)

# make predictions
y_pred = MNB.predict(X_test)

# Confusion matrix. scikit-learn expects (y_true, y_pred): with that order the
# rows are the true star ratings and the columns the predicted ones.
ax = sns.heatmap(confusion_matrix(y_test, y_pred, labels=MNB.classes_),
                 annot=True, fmt='g',
                 xticklabels=MNB.classes_, yticklabels=MNB.classes_)
ax.set(xlabel='predicted stars', ylabel='true stars', title='MultinomialNB')

# Classification report, true labels first so precision and recall are not swapped
print(classification_report(y_test, y_pred))

In [None]:
# Linear support vector classifier on the TF-IDF features.
# The seed is passed so repeated runs give the same model.
LSVC = LinearSVC(random_state=RSEED)

# fit the model
LSVC.fit(X_train, y_train)

# make predictions
y_pred = LSVC.predict(X_test)

# Confusion matrix. scikit-learn expects (y_true, y_pred): with that order the
# rows are the true star ratings and the columns the predicted ones.
ax = sns.heatmap(confusion_matrix(y_test, y_pred, labels=LSVC.classes_),
                 annot=True, fmt='g',
                 xticklabels=LSVC.classes_, yticklabels=LSVC.classes_)
ax.set(xlabel='predicted stars', ylabel='true stars', title='LinearSVC')

# Classification report, true labels first so precision and recall are not swapped
print(classification_report(y_test, y_pred))

In [None]:
# Logistic regression on the TF-IDF features.
# NOTE(review): warnings are globally ignored in this notebook, so a solver that
# stops before converging would go unnoticed — consider raising max_iter.
logreg = LogisticRegression()

# fit the model
logreg.fit(X_train, y_train)

# make predictions
y_pred = logreg.predict(X_test)

# Confusion matrix. scikit-learn expects (y_true, y_pred): with that order the
# rows are the true star ratings and the columns the predicted ones.
ax = sns.heatmap(confusion_matrix(y_test, y_pred, labels=logreg.classes_),
                 annot=True, fmt='g',
                 xticklabels=logreg.classes_, yticklabels=logreg.classes_)
ax.set(xlabel='predicted stars', ylabel='true stars', title='LogisticRegression')

# Classification report, true labels first so precision and recall are not swapped
print(classification_report(y_test, y_pred))