In [None]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")

# importing all needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import nltk
from nltk.corpus import stopwords
# from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec

import time
from tqdm import tqdm

# ignore the warnings
# NOTE(review): this installs a blanket "ignore" filter, so warnings raised by
# later cells (e.g. numerical or convergence warnings) will not be displayed.
import warnings
warnings.filterwarnings("ignore")

# set Randomseed
# Shared seed; passed as random_state to train_test_split further down.
RSEED = 42

# import needed functions
from modeling.processing import *

In [None]:
# Load the review CSV into the working DataFrame.
review_csv_path = '../data/yelp_dataset/review_1819.csv'
dfr = pd.read_csv(review_csv_path)

In [None]:
# Keep only the English-language reviews.
# language_processing comes from modeling.processing (star-imported above);
# verbose=True is passed through to it unchanged.
english_reviews = language_processing(dfr, verbose=True)
dfr = english_reviews

In [None]:
# Build the English stop-word list from the NLTK corpus.
# NOTE: this rebinds the name `stopwords` (imported from nltk.corpus at the top)
# to a plain list; the corpus reader is still reachable as nltk.corpus.stopwords.
stopword_language = 'english'
stopwords = nltk.corpus.stopwords.words(stopword_language)

# Optional extension with non-decisive words found in the first word clouds (disabled):
#additional_stopwords = ['one', 'go', 'also', 'would', 'get', 'got']
#stopwords.extend(additional_stopwords)

In [None]:
# Strip punctuation from every review text and write the result back
# into the 'text' column of the working DataFrame.
text_without_punctuation = dfr['text'].apply(remove_punctuation)
dfr['text'] = text_without_punctuation

In [None]:
# Feature (review text) and target (star rating)
X, y = dfr['text'], dfr['stars']

# Hold out a test set; RSEED makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RSEED,
)

In [None]:
# Tokenize the training reviews and train the Word2Vec embedding model on them.
import multiprocessing


# Plain whitespace tokenization.
# TODO here (and below): use NLTK tokenizers instead
sentences_train = [doc.split() for doc in X_train]

# initialize word2vec (https://radimrehurek.com/gensim/models/word2vec.html)
# Previously tried configurations: vector_size=100 / window=5 / workers=4
# and vector_size=50 / window=3 / workers=8.
#
# FIX: the earlier calls passed `min_alpha=1`. `min_alpha` is the *final* learning
# rate, so with the default initial `alpha=0.025` the learning rate would grow to
# 1.0 during training instead of decaying, which ruins the embeddings. The
# argument is dropped so gensim's default decay schedule is used.
#
# `min_count=5` is gensim's default, spelled out here because it is the reason
# some tokens have no vector later on: words seen fewer than 5 times in the
# training sentences are excluded from the vocabulary.
w2v = Word2Vec(
    sentences=sentences_train,
    vector_size=200,
    window=3,
    min_count=5,
    workers=8,
)  # TODO SAVE!

In [None]:
def get_sentences_embeddings_means(sentences, model=None):
    """Represent each tokenized sentence by the mean of its word vectors.

    Parameters
    ----------
    sentences : iterable of iterables of str
        Tokenized sentences (one list of tokens per document).
    model : object with a ``wv`` attribute, optional
        Trained embedding model whose ``wv`` supports ``wv[token]`` lookup
        (raising ``KeyError`` for unknown tokens) and exposes ``vector_size``.
        Defaults to the notebook-level ``w2v`` model.

    Returns
    -------
    sentences_embeddings_means : list of numpy.ndarray
        One vector of length ``vector_size`` per input sentence, in input order.
        A sentence without any known token (empty, or made up only of
        out-of-vocabulary words) is represented by an all-zero vector, so the
        result always stacks into a rectangular, NaN-free feature matrix that
        stays aligned with the target labels.
    words_ignored : list of str
        Every token occurrence that had no vector in the model (duplicates kept).
    """
    if model is None:
        # Fall back to the model trained earlier in the notebook.
        model = w2v
    keyed_vectors = model.wv
    fallback_vector = np.zeros(keyed_vectors.vector_size, dtype=np.float32)

    sentences_embeddings_means = []
    words_ignored = []
    for sent in sentences:
        sent_embeddings = []
        for token in sent:
            try:
                sent_embeddings.append(keyed_vectors[token])
            except KeyError:
                # Token is not in the model's vocabulary (e.g. it occurred too
                # rarely in the training data or only appears in unseen text).
                words_ignored.append(token)
        if sent_embeddings:
            sentences_embeddings_means.append(np.mean(sent_embeddings, axis=0))
        else:
            # np.mean([]) would yield a scalar NaN; use a zero vector instead.
            sentences_embeddings_means.append(fallback_vector.copy())
    return sentences_embeddings_means, words_ignored

In [None]:
# Turn every tokenized training review into its mean word vector.
# NOTE: X_train is rebound from the raw texts to the embedded features here.
train_embedding_result = get_sentences_embeddings_means(sentences_train)
sentences_embeddings_means_train, words_ignored_train = train_embedding_result
X_train = sentences_embeddings_means_train

In [None]:
# print('Ignored words:')
# print(sorted(set(words_ignored_train))) # TODO check how long this takes

In [None]:
# Whitespace-tokenize the held-out reviews, mirroring the training tokenization.
sentences_test = []
for review_text in X_test:
    sentences_test.append(review_text.split())

In [None]:
# Turn every tokenized test review into its mean word vector.
# NOTE: X_test is rebound from the raw texts to the embedded features here.
test_embedding_result = get_sentences_embeddings_means(sentences_test)
sentences_embeddings_means_test, words_ignored_test = test_embedding_result
X_test = sentences_embeddings_means_test

In [None]:
# print('Ignored words:')
# print(sorted(set(words_ignored_test))) # TODO check how long this takes

In [None]:
# 
# # initialize the Classifier
# MNB = MultinomialNB()
# 
# # fit the model
# MNB.fit(X_train, y_train)
# 
# # make predictions
# y_pred = MNB.predict(X_test)
# 
# # test the model
# sns.heatmap(confusion_matrix(y_pred, y_test), annot=True, fmt='g')
# 
# # show the classification report
# print(classification_report(y_pred, y_test))

In [None]:
# # initialize the Classifier
# LSVC = LinearSVC()
# 
# # fit the model
# LSVC.fit(X_train, y_train)
# 
# # make predictions
# y_pred = LSVC.predict(X_test)
# 
# # test the model
# sns.heatmap(confusion_matrix(y_pred, y_test), annot=True, fmt='g')
# 
# # show the classification report
# print(classification_report(y_pred, y_test))

In [None]:
# initialize the Classifier
# max_iter is raised above scikit-learn's default of 100: with warnings silenced
# at the top of the notebook, a solver that stops before converging would
# otherwise go unnoticed.
logreg = LogisticRegression(max_iter=1000)

# fit the model on the mean review embeddings
logreg.fit(X_train, y_train)

# make predictions
y_pred = logreg.predict(X_test)

# test the model
# scikit-learn metrics take (y_true, y_pred) in that order; passing them the other
# way round transposes the confusion matrix and swaps precision with recall.
fig, ax = plt.subplots()
sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=logreg.classes_),
    annot=True,
    fmt='g',
    xticklabels=logreg.classes_,
    yticklabels=logreg.classes_,
    ax=ax,
)
# Rows are the true classes, columns the predicted classes.
ax.set(xlabel='Predicted stars', ylabel='True stars', title='Logistic regression on mean Word2Vec embeddings')
plt.show()

# show the classification report
print(classification_report(y_test, y_pred))