In [None]:
import sys
# add the parent directory to the module search path (for this process only; system variables are not changed)
sys.path.append("..")

# importing all needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import nltk
from nltk.corpus import stopwords
# from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

import time
from tqdm import tqdm

# ignore the warnings
import warnings
warnings.filterwarnings("ignore")

# set the random seed
RSEED = 42

# import needed functions
from scripts.processing import *

In [None]:
# load the review file into a dataframe

# number of reviews (leading rows of the CSV) used in this notebook
N_REVIEWS = 100000

# nrows makes pandas stop parsing after N_REVIEWS rows instead of reading the
# complete file and slicing afterwards; the resulting rows are the same.
# NOTE(review): column dtypes are now inferred from these rows only — confirm
# that no downstream code relies on dtypes inferred from the full file.
dfr = pd.read_csv('../data/yelp_dataset/review_1819.csv', nrows=N_REVIEWS)

In [None]:
# filter for only english reviews
# language_processing is not defined in this notebook; it is presumably provided
# by the star import from scripts.processing. Its exact filtering rules are not
# visible here — TODO confirm in the helper what is dropped and what verbose prints.
# NOTE: dfr is overwritten, so re-running this cell processes the already
# processed frame again.

dfr = language_processing(dfr, verbose=True)

In [None]:
# initialize the stopword list:
# NOTE(review): this rebinds the name `stopwords`, which the import cell bound
# to the corpus reader (`from nltk.corpus import stopwords`). After this cell
# `stopwords` is the list returned by `.words('english')`. The right-hand side
# goes through `nltk.corpus.stopwords`, so re-running the cell still works.
# The list is not referenced by any later cell visible in this notebook.
stopwords = nltk.corpus.stopwords.words('english')

# optional (currently disabled): extend the list with frequent but
# non-decisive words that showed up in the first word clouds
#additional_stopwords = ['one', 'go', 'also', 'would', 'get', 'got']
#stopwords.extend(additional_stopwords)

In [None]:
# remove punctuation from the text in the initial df
# remove_punctuation is not defined in this notebook (presumably it comes from
# the star import of scripts.processing); it is applied to each review text.
# NOTE: the 'text' column is overwritten, so the unprocessed text is no longer
# available in dfr after this cell has run.
dfr['text'] = dfr['text'].apply(remove_punctuation)

In [None]:
# split data into train and test set
# A review gets the label 1 ("useful") when its number of useful votes is
# greater than USEFUL_THRESHOLD, otherwise 0.
# (an earlier variant of this cell used a threshold of 1 instead of 0)
USEFUL_THRESHOLD = 0
useful_label = (dfr['useful'] > USEFUL_THRESHOLD).astype(int)

# Stratify on exactly the labels that are modelled. Previously the target used
# "> 0" while the stratification used "> 1", so the class balance of the actual
# target was not what the split preserved.
train_set, test_set = train_test_split(
    pd.concat([dfr['text'], useful_label], axis=1),  # columns: text, useful
    random_state=RSEED,
    stratify=useful_label,
)

In [None]:
# building mostly on (and partly copied from) https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4
# documentation: https://radimrehurek.com/gensim/models/doc2vec.html
def tokenize(text):
    """Split ``text`` into a flat list of word tokens.

    The text is first cut into sentences with ``nltk.sent_tokenize``; every
    sentence is then split with ``nltk.word_tokenize``. Tokens are returned
    in the order in which they occur in the text.
    """
    return [
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    ]

def vec_for_learning(model, tagged_docs):
    """Infer one vector per tagged document and return labels and vectors.

    Parameters
    ----------
    model
        Object exposing ``infer_vector(words)``; in this notebook the loaded
        ``Doc2Vec`` model.
    tagged_docs
        Container with a ``.values`` attribute whose items provide ``.words``
        and ``.tags``; in this notebook the result of ``DataFrame.apply``
        building ``TaggedDocument`` objects.

    Returns
    -------
    tuple
        ``(targets, regressors)``: two tuples of equal length holding the
        first tag of every document and the vector inferred from its words.
        Both tuples are empty when ``tagged_docs`` is empty.
    """
    docs = tagged_docs.values
    if len(docs) == 0:
        # zip(*[]) yields nothing, so unpacking into two names would raise ValueError
        return (), ()
    # TODO check the importance of the number of inference passes (the "steps"
    # argument of infer_vector in the original reference; presumably renamed
    # in newer gensim releases — confirm before enabling)
    targets, regressors = zip(
        *((doc.tags[0], model.infer_vector(doc.words)) for doc in docs)
    )
    return targets, regressors

In [None]:
# load a previously saved Doc2Vec model; it is used below to infer the review vectors
# alternative model file (disabled):
# d2v = Doc2Vec.load('../models/doc2vec_100.model')
# NOTE(review): the file names suggest 100- vs. 300-dimensional vectors — confirm
# against the notebook that trained the models.
# NOTE(review): gensim model files are presumably pickle-based — only load
# files from a trusted source.
d2v = Doc2Vec.load('../models/doc2vec_300_small.model')

In [None]:
def _as_tagged_document(row):
    """Turn one (text, useful) row into a TaggedDocument tagged with its label."""
    return TaggedDocument(words=tokenize(row.text), tags=[row.useful])


# tokenize every review and infer its Doc2Vec vector:
# labels come back as y_*, inferred vectors as X_*
train_tagged = train_set.apply(_as_tagged_document, axis=1)
y_train, X_train = vec_for_learning(d2v, train_tagged)

test_tagged = test_set.apply(_as_tagged_document, axis=1)
y_test, X_test = vec_for_learning(d2v, test_tagged)

In [None]:
def mcc(cm):
    """Return the Matthews correlation coefficient of a binary confusion matrix.

    Parameters
    ----------
    cm : 2x2 array-like
        Confusion matrix read as ``cm[0] = (tn, fp)`` and ``cm[1] = (fn, tp)``,
        the layout produced by ``sklearn.metrics.confusion_matrix`` for
        binary labels.

    Returns
    -------
    float
        ``(tp*tn - fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))``.
        ``0.0`` is returned when the denominator is zero, i.e. when a whole
        row or column of the matrix is empty (for example a classifier that
        predicts a single class), instead of dividing by zero.
    """
    # convert to float first: the product of four integer counts can exceed
    # the range of fixed-width (numpy) integers for large data sets
    tn, fp = (float(count) for count in cm[0])
    fn, tp = (float(count) for count in cm[1])

    denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    if denominator == 0:
        return 0.0
    return (tp * tn - fp * fn) / denominator

In [None]:
# baseline: logistic regression with default settings, fitted on the training vectors
logreg = LogisticRegression().fit(X_train, y_train)

# predictions for the held-out test set
y_pred = logreg.predict(X_test)

# evaluation: the confusion matrix is computed once and reused for the
# heatmap and for the Matthews correlation coefficient
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='g')
print(mcc(cm))

# per-class precision / recall / f1
print(classification_report(y_test, y_pred))

In [None]:
# Hyper-parameter grid for the logistic regression.
# Not every solver supports every penalty: 'liblinear' handles 'l1' and 'l2',
# while 'lbfgs' and 'sag' only handle 'l2'. A single cartesian grid therefore
# contains invalid combinations (e.g. penalty='l1' with solver='lbfgs') whose
# fits fail; the global warnings filter of this notebook hides those failures.
# Using one sub-grid per compatible group keeps only valid candidates.
# (other options that were considered: penalties 'elasticnet' / None and the
# solvers 'newton-cg' / 'saga')
logreg_c_values = [4, 5, 6]
param_logreg = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': logreg_c_values},
    {'solver': ['lbfgs', 'sag'], 'penalty': ['l2'], 'C': logreg_c_values},
]

# alternative scorer that was tried: make_scorer(matthews_corrcoef)
# random_state makes the solvers that shuffle the data reproducible
grid_logreg = GridSearchCV(
    LogisticRegression(max_iter=10000, random_state=RSEED),
    param_grid=param_logreg,
    cv=5,
    scoring='precision',
    verbose=5,
    n_jobs=-1,
)

# fit the model
grid_logreg.fit(X_train, y_train)

# Show best parameters
print('Best score:\n{:.2f}'.format(grid_logreg.best_score_))
print("Best parameters:\n{}".format(grid_logreg.best_params_))

In [None]:
# predictions of the best estimator found by the logistic regression grid search
best_logreg = grid_logreg.best_estimator_
y_pred = best_logreg.predict(X_test)

# evaluation: one confusion matrix feeds both the heatmap and the MCC
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='g')
print(mcc(cm))

# per-class precision / recall / f1
print(classification_report(y_test, y_pred))

In [None]:
# baseline: linear support vector classifier with default settings,
# fitted on the training vectors
LSVC = LinearSVC().fit(X_train, y_train)

# predictions for the held-out test set
y_pred = LSVC.predict(X_test)

# evaluation: the confusion matrix is computed once and reused for the
# heatmap and for the Matthews correlation coefficient
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='g')
print(mcc(cm))

# per-class precision / recall / f1
print(classification_report(y_test, y_pred))

In [None]:
# display the parameters of the LinearSVC instance bound to LSVC
# (shown as the cell output because it is the last expression)
LSVC.get_params()

In [None]:
# Hyper-parameter grid for the linear SVC.
# LinearSVC does not support every penalty / loss / dual combination:
# penalty='l1' cannot be combined with loss='hinge', and penalty='l1' with
# loss='squared_hinge' is only available for the primal problem (dual=False).
# A single cartesian grid contains those invalid candidates, whose fits fail;
# the global warnings filter of this notebook hides the failures.
# One sub-grid per penalty keeps only valid candidates.
linsvc_shared = {
    'C': [0.9, 1, 1.1, 1.2],
    'class_weight': ['balanced', None],
    'max_iter': [10000],
}
param_linsvc = [
    {'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge'], **linsvc_shared},
    {'penalty': ['l1'], 'loss': ['squared_hinge'], 'dual': [False], **linsvc_shared},
]

# alternative scorers that were tried: 'recall', make_scorer(matthews_corrcoef)
# random_state makes the data shuffling of the dual solver reproducible
grid_linsvc = GridSearchCV(
    LinearSVC(random_state=RSEED),
    param_grid=param_linsvc,
    cv=5,
    scoring='precision',
    verbose=5,
    n_jobs=-1,
)

# fit the model
grid_linsvc.fit(X_train, y_train)

# Show best parameters
print('Best score:\n{:.2f}'.format(grid_linsvc.best_score_))
print("Best parameters:\n{}".format(grid_linsvc.best_params_))


In [None]:
# predictions of the best estimator found by the LinearSVC grid search
best_linsvc = grid_linsvc.best_estimator_
y_pred = best_linsvc.predict(X_test)

# evaluation: one confusion matrix feeds both the heatmap and the MCC
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='g')
print(mcc(cm))

# per-class precision / recall / f1
print(classification_report(y_test, y_pred))