In [None]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")

# importing all needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import nltk
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, make_scorer, matthews_corrcoef
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import re
import unicodedata
from gensim.models import Word2Vec

import time
from tqdm import tqdm
import json

# ignore the warnings
import warnings
warnings.filterwarnings("ignore")

# set Randomseed
RSEED = 42

# import needed functions
from scripts.processing import *
# from scripts.mk_categories_word2vec_addmaincat import select_dataset_by_cat

In [None]:
# Load the first 100,000 reviews only. `nrows` makes the parser stop after those
# rows instead of reading the complete CSV into memory and slicing afterwards.
dfr = pd.read_csv('../data/yelp_dataset/review_1819.csv', nrows=100000)

In [None]:
# Pass the reviews through the project helper `language_processing` and rebind
# `dfr` to whatever it returns (verbose output enabled).
# NOTE(review): the helper presumably comes from the star import of
# scripts.processing; what it adds or filters is not visible here — confirm there.
# Re-running this cell feeds the already processed frame back into the helper.
dfr = language_processing(dfr, verbose=True)

In [None]:
# Reviews that collected at least one "useful" vote
dfr[dfr['useful'] > 0]

In [None]:
# Lemmatizer used by `text_cleaning`
wnl = nltk.stem.WordNetLemmatizer()

# English stop words from NLTK plus a few project-specific filler words
additional_stopwords = ['one', 'go', 'also', 'would', 'get', 'got']
stopwords = nltk.corpus.stopwords.words('english') + additional_stopwords

def text_cleaning(txt):
    """Turn one review text into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, and the result is split on whitespace. Tokens
    found in the module-level `stopwords` list are dropped; the remaining ones
    are lemmatized with the module-level `wnl` lemmatizer.

    Parameters
    ----------
    txt : str
        Raw review text. Non-string values (e.g. NaN for a missing review)
        yield an empty list instead of raising.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    if not isinstance(txt, str):
        return []

    # ASCII folding via unicodedata.normalize('NFKD', ...) was tried and is disabled.

    # Build the lookup once per call: set membership is O(1), whereas testing
    # each token against the list rescans the whole stop-word list every time.
    stop_set = set(stopwords)

    # NOTE(review): punctuation is removed before the stop-word test, so a
    # stop-word entry that contains an apostrophe can never match a token here.
    words = re.sub(r'[^\w\s]', '', txt.lower()).split()
    return [wnl.lemmatize(word) for word in words if word not in stop_set]

In [None]:
# maincat = select_dataset_by_cat(categories=None, save_to_csv=False)
# dfr_maincat = dfr.set_index('business_id').join(maincat.set_index('business_id'), on='business_id', how='left', rsuffix='_business')
# dfr = dfr_maincat.query('maincat == "restaurants"')

In [None]:
# Initialise the TF-IDF vectorizer (parameters following Susan Li)
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    stop_words=stopwords,
    ngram_range=(1, 5),  # unigrams up to 5-grams
    min_df=5,
    sublinear_tf=True,
    norm='l2',
)

In [None]:
# Feature: every review text, cleaned by `text_cleaning` and re-joined into a
# single space-separated string
X = dfr['text'].map(lambda review: ' '.join(text_cleaning(review)))

# Alternative feature that was tried (review length only):
# X = np.array(dfr['text'].apply(lambda x: len(x))).reshape(-1, 1)

In [None]:
# Earlier target definitions, kept for reference:
# dfr.eval('useful_cool = useful + cool', inplace=True)
# y = dfr['useful_cool'].apply(lambda x: 1 if x > 1 else 0)
# y = dfr['useful'].apply(lambda x: 1 if x > 1 else 0)

# Binary target: 1 if the review has any "useful" vote, otherwise 0
y = dfr['useful'].gt(0).astype('int64')

In [None]:
# Hold out a test set; stratifying on y keeps the class ratio in both parts and
# the fixed seed makes the split repeatable (test size is the library default)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RSEED,
)

In [None]:
# fit and apply the vectorizer
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already fitted vectorizer.
# NOTE: X_train / X_test are rebound from text to vectorized features here, so
# this cell is not idempotent — re-run the train/test split before running it again.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
def mcc(cm):
    """Matthews correlation coefficient computed from a 2x2 confusion matrix.

    Parameters
    ----------
    cm : sequence of two 2-element rows
        Confusion matrix laid out as ``[[tn, fp], [fn, tp]]``.

    Returns
    -------
    float
        ``(tp*tn - fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))``, or ``0.0``
        when the denominator is zero (one row or column of the matrix is empty),
        where the coefficient is otherwise undefined.
    """
    # Work in floats: with fixed-width integer counts the product of the four
    # marginal sums can overflow for large evaluation sets.
    tn, fp = (float(count) for count in cm[0])
    fn, tp = (float(count) for count in cm[1])

    denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    if denominator == 0:
        # e.g. the model predicted a single class only
        return 0.0
    return (tp * tn - fp * fn) / denominator

In [None]:
# Baseline: logistic regression with default hyper-parameters
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predict the held-out reviews
y_pred = logreg.predict(X_test)

# Evaluate: confusion-matrix heatmap, MCC and per-class report
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, fmt='g')
print(mcc(conf_mat))
print(classification_report(y_test, y_pred))

In [None]:
# Hyper-parameter grid for logistic regression.
# Only solver/penalty pairs that the listed solvers support are searched:
# 'liblinear' handles l1 and l2, while 'lbfgs' and 'sag' handle l2 only.
# 'elasticnet' is left out because none of these solvers supports it (it needs
# the 'saga' solver plus an l1_ratio); those candidates could only fail to fit.
logreg_C_values = [5, 8, 9, 10, 20, 30]
param_logreg = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': logreg_C_values},
    {'solver': ['lbfgs', 'sag'], 'penalty': ['l2'], 'C': logreg_C_values},
    # further solvers that were considered: 'newton-cg', 'saga'
]

# MCC scorer kept available as an alternative to scoring='precision'
mcc_scorer = make_scorer(matthews_corrcoef)

# Seed the estimator so the liblinear/sag fits are repeatable between runs
grid_logreg = GridSearchCV(LogisticRegression(random_state=RSEED), param_grid=param_logreg,
                           cv=5, scoring='precision',  # alternative: mcc_scorer
                           verbose=0, n_jobs=-1)

# fit the model
grid_logreg.fit(X_train, y_train)

# Show best parameters
print('Best score:\n{:.2f}'.format(grid_logreg.best_score_))
print("Best parameters:\n{}".format(grid_logreg.best_params_))

In [None]:
# Predict the held-out reviews with the best logistic regression from the grid search
best_logreg = grid_logreg.best_estimator_
y_pred = best_logreg.predict(X_test)

# Evaluate: confusion-matrix heatmap, MCC and per-class report
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, fmt='g')
print(mcc(conf_mat))
print(classification_report(y_test, y_pred))

In [None]:
# Decision tree baseline.
# Seeded with RSEED: an unseeded tree can come out differently from run to run,
# which makes the reported scores impossible to reproduce.
tree = DecisionTreeClassifier(random_state=RSEED)

# fit the model
tree.fit(X_train, y_train)

# make predictions
y_pred = tree.predict(X_test)

# test the model: confusion-matrix heatmap and MCC
tree_conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(tree_conf_mat, annot=True, fmt='g')
print(mcc(tree_conf_mat))

# show the classification report
print(classification_report(y_test, y_pred))

In [None]:
# Extremely randomized tree baseline.
# Seeded with RSEED: the split selection is random by design, so without a seed
# every run would report different scores.
xtree = ExtraTreeClassifier(random_state=RSEED)

# fit the model
xtree.fit(X_train, y_train)

# make predictions
y_pred = xtree.predict(X_test)

# test the model: confusion-matrix heatmap and MCC
xtree_conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(xtree_conf_mat, annot=True, fmt='g')
print(mcc(xtree_conf_mat))

# show the classification report
print(classification_report(y_test, y_pred))

In [None]:
# Multinomial naive Bayes baseline with default settings
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

# Predict the held-out reviews
y_pred = MNB.predict(X_test)

# Evaluate: confusion-matrix heatmap, MCC and per-class report
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, fmt='g')
print(mcc(conf_mat))
print(classification_report(y_test, y_pred))

In [None]:
# Bernoulli naive Bayes baseline with default settings
BNB = BernoulliNB()
BNB.fit(X_train, y_train)

# Predict the held-out reviews
y_pred = BNB.predict(X_test)

# Evaluate: confusion-matrix heatmap, MCC and per-class report
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, fmt='g')
print(mcc(conf_mat))
print(classification_report(y_test, y_pred))

In [None]:
# Linear support vector classifier baseline.
# Seeded with RSEED so repeated runs of the notebook report the same scores.
LSVC = LinearSVC(random_state=RSEED)

# fit the model
LSVC.fit(X_train, y_train)

# make predictions
y_pred = LSVC.predict(X_test)

# test the model: confusion-matrix heatmap and MCC
lsvc_conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(lsvc_conf_mat, annot=True, fmt='g')
print(mcc(lsvc_conf_mat))

# show the classification report
print(classification_report(y_test, y_pred))

In [None]:
# Hyper-parameter grid for LinearSVC.
# The penalty/loss pairs are spelled out because not every pair is supported:
#   * l2 works with both 'hinge' and 'squared_hinge'
#   * l1 works only with 'squared_hinge' and only in the primal form (dual=False)
# A plain cross product would contain l1 candidates that can only fail to fit.
linsvc_shared = {'C': [0.05, 0.06, 0.065, 0.07, 0.075, 0.08],
                 'class_weight': ['balanced', None],
                 'max_iter': [10000]}
param_linsvc = [
    {**linsvc_shared, 'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge']},
    {**linsvc_shared, 'penalty': ['l1'], 'loss': ['squared_hinge'], 'dual': [False]},
]

# MCC scorer kept available as an alternative to scoring='precision'
mcc_scorer = make_scorer(matthews_corrcoef)

# Seed the estimator so the search result is repeatable between runs
grid_linsvc = GridSearchCV(LinearSVC(random_state=RSEED), param_grid=param_linsvc,
                           cv=5, scoring='precision',  # alternatives: 'recall', mcc_scorer
                           verbose=5, n_jobs=-1)

# fit the model
grid_linsvc.fit(X_train, y_train)

# Show best parameters
print('Best score:\n{:.2f}'.format(grid_linsvc.best_score_))
print("Best parameters:\n{}".format(grid_linsvc.best_params_))


In [None]:
# Predict the held-out reviews with the best LinearSVC from the grid search
best_linsvc = grid_linsvc.best_estimator_
y_pred = best_linsvc.predict(X_test)

# Evaluate: confusion-matrix heatmap, MCC and per-class report
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, fmt='g')
print(mcc(conf_mat))
print(classification_report(y_test, y_pred))

In [None]:
# # initialize the Classifier
# knn = KNeighborsClassifier()
# 
# # fit the model
# knn.fit(X_train, y_train)
# 
# # make predictions
# y_pred = knn.predict(X_test)
# 
# # test the model
# sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='g')
# print(mcc(confusion_matrix(y_test, y_pred)))
# 
# # show the classification report
# print(classification_report(y_test, y_pred))

In [None]:
# # initialize the Classifier
# svc = SVC()
# 
# # fit the model
# svc.fit(X_train, y_train)
# 
# # make predictions
# y_pred = svc.predict(X_test)
# 
# # test the model
# sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='g')
# print(mcc(confusion_matrix(y_test, y_pred)))
# 
# # show the classification report
# print(classification_report(y_test, y_pred))

In [None]:
def sort_reviews(df):
    """Rank the reviews in `df` by the tuned LinearSVC's decision score.

    Each text is preprocessed exactly like the training data (`text_cleaning`,
    tokens re-joined with spaces) before it is vectorized, so the features seen
    at scoring time match those the vectorizer and classifier were fitted on.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column holding the review texts.

    Returns
    -------
    list of tuple
        ``(conf_score, text)`` pairs sorted by descending score, where
        `conf_score` is a one-element array with the decision-function value
        and `text` is the original, uncleaned review. Empty input gives ``[]``.
    """
    texts = df['text'].tolist()
    if not texts:
        # nothing to score; do not hand an empty batch to the vectorizer
        return []

    # Same preprocessing as for the training features
    cleaned = [' '.join(text_cleaning(text)) for text in texts]

    # Vectorize and score everything in one batch instead of row by row
    scores = grid_linsvc.best_estimator_.decision_function(vectorizer.transform(cleaned))

    # Keep each score as a one-element slice so the returned tuples have the
    # same layout as before
    review_list = [(scores[pos:pos + 1], text) for pos, text in enumerate(texts)]
    review_list.sort(key=lambda pair: pair[0][0], reverse=True)
    return review_list

In [None]:
# Rank the reviews of the first business that appears in the data
first_business_id = dfr['business_id'].unique()[0]
dfr_business_0 = dfr.loc[dfr['business_id'] == first_business_id]

for rank, (score, text) in enumerate(sort_reviews(dfr_business_0), start=1):
    print(f"{rank}:\n{text}\n")

In [None]:
# test_texts.json contains a list of strings (each string is one review).
# Read it explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open('../data/test_texts.json', encoding='utf-8') as json_f:
    test_texts = json.load(json_f)

In [None]:
# Wrap the loaded review strings in a one-column frame, as `sort_reviews` expects
df_test = pd.DataFrame(test_texts, columns=['text'])

In [None]:
# Print the hand-written test reviews, best-ranked first
for rank, (score, text) in enumerate(sort_reviews(df_test), start=1):
    print(f"{rank}:\n{text}\n")