In [None]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")

# importing all needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import nltk
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, make_scorer, matthews_corrcoef
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import re
import unicodedata
from gensim.models import Word2Vec

from imblearn.over_sampling import SMOTE

import time
from tqdm import tqdm

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised while fitting the
# models further down are hidden as well — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

# Seed passed as random_state to the train/test splits below so they are reproducible.
RSEED = 42

# import needed functions
from scripts.processing import *
# from scripts.mk_categories_word2vec_addmaincat import select_dataset_by_cat

In [None]:
# Training pool: the first 100,000 rows of the review CSV
# (the whole file is parsed first, then sliced).
dfr = pd.read_csv('../data/yelp_dataset/review_1819.csv').iloc[:100000]

In [None]:
# Run the project helper language_processing over the reviews
# (presumably provided by the star import from scripts.processing — confirm).
# NOTE(review): what the helper changes is not visible in this notebook.
# The cell rebinds `dfr`, so re-running it feeds already-processed data back in.
dfr = language_processing(dfr, verbose=True)

In [None]:
# Domain-specific filler words that are dropped on top of NLTK's English list.
additional_stopwords = ['one', 'go', 'also', 'would', 'get', 'got']

# Combined stop-word list (NLTK English + the extras above); kept as a list
# because it is also handed to the TF-IDF vectorizer further down.
stopwords = nltk.corpus.stopwords.words('english') + additional_stopwords

# WordNet lemmatizer shared by text_cleaning.
wnl = nltk.stem.WordNetLemmatizer()

# Lookup set for stop-word filtering. text_cleaning strips punctuation before it
# compares tokens, so the punctuation-free spelling of every stop word is included
# too; otherwise entries containing an apostrophe (e.g. "don't" -> "dont") could
# never match. Built once here — re-run this cell if `stopwords` is modified.
_stopword_lookup = frozenset(stopwords) | frozenset(
    re.sub(r'[^\w\s]', '', stop_word) for stop_word in stopwords
)


def text_cleaning(txt):
    """Lower-case, de-punctuate, stop-word-filter and lemmatize one review text.

    Parameters
    ----------
    txt : str
        Raw review text. Non-string values (e.g. NaN for a missing text) are
        treated as empty.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order, without stop words.
    """
    # Missing texts arrive as float NaN from read_csv; do not crash the whole apply().
    if not isinstance(txt, str):
        return []
    # Remove everything that is neither a word character nor whitespace, then tokenize.
    words = re.sub(r'[^\w\s]', '', txt.lower()).split()
    # Filter before lemmatizing so stop words are matched in their surface form.
    return [wnl.lemmatize(word) for word in words if word not in _stopword_lookup]

In [None]:
# maincat = select_dataset_by_cat(categories=None, save_to_csv=False)
# dfr_maincat = dfr.set_index('business_id').join(maincat.set_index('business_id'), on='business_id', how='left', rsuffix='_business')
# dfr = dfr_maincat.query('maincat == "restaurants"')

In [None]:
# Initialize the TF-IDF vectorizer; parameters after Susan Li.
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    stop_words=stopwords,   # same stop-word list that text_cleaning uses
    ngram_range=(1, 3),     # unigrams, bigrams and trigrams
    min_df=5,               # drop terms that occur in fewer than 5 documents
    sublinear_tf=True,      # 1 + log(tf) instead of the raw term frequency
    norm='l2',
)

In [None]:
# Keep only the reviews where at least one of the useful / cool / funny counts is positive.
dfr_clicked = dfr.query('useful > 0 or cool > 0 or funny > 0')

In [None]:
# Features: every review text is run through text_cleaning and the resulting
# tokens are re-joined into one space-separated string for the vectorizer.
X_clicked = dfr_clicked['text'].apply(lambda x: ' '.join(text_cleaning(x)))
# Disabled alternative feature (text length only):
# X = np.array(dfr['text'].apply(lambda x: len(x))).reshape(-1, 1)

In [None]:
# Disabled label variants, kept for reference:
# dfr.eval('useful_cool = useful + cool', inplace=True)
# y = dfr['useful_cool'].apply(lambda x: 1 if x > 1 else 0)
# y_clicked = dfr_clicked['useful'].apply(lambda x: 1 if x > 1 else 0)
# Active target: 1 when the review's `useful` count is positive, otherwise 0.
y_clicked = dfr_clicked['useful'].apply(lambda x: 1 if x > 0 else 0)

In [None]:
# Stratified, seeded split with the default test share. Only the training part is
# kept; the held-out part is discarded into `_` because evaluation uses a separate
# slice of the CSV that is loaded further down.
# NOTE(review): assigning to `_` shadows IPython's "last output" variable.
X_train_clicked, _, y_train_clicked, _ = train_test_split(X_clicked, y_clicked, random_state=RSEED, stratify=y_clicked)

In [None]:
# Hold-out set: the 25,000 rows that follow the 100,000 training rows in the same CSV.
# NOTE(review): unlike the training frame, these rows are neither passed through
# language_processing nor restricted to "clicked" reviews — confirm this is intended.
dfr_test = pd.read_csv('../data/yelp_dataset/review_1819.csv').iloc[100000:125000]
X_test_clicked = dfr_test['text'].apply(lambda x: ' '.join(text_cleaning(x)))
# Use the same label definition as the training target y_clicked (useful > 0).
# The previous `> 1` threshold evaluated the model against a different target
# than the one it was trained on.
y_test_clicked = dfr_test['useful'].apply(lambda x: 1 if x > 0 else 0)

In [None]:
def mcc(cm):
    """Matthews correlation coefficient of a binary confusion matrix.

    Parameters
    ----------
    cm : 2x2 array-like
        Confusion matrix laid out as ``[[tn, fp], [fn, tp]]`` (the layout this
        function unpacks; it matches what ``confusion_matrix`` is fed into it
        with in this notebook).

    Returns
    -------
    float
        MCC in [-1, 1]. Returns 0.0 when any row or column of the matrix sums
        to zero (the coefficient is undefined there; 0.0 is the usual convention).
    """
    (tn, fp), (fn, tp) = cm
    # Work in floating point: with numpy int64 counts the four-way product below
    # overflows once the marginals reach roughly 1e5 samples each.
    tn, fp, fn, tp = float(tn), float(fp), float(fn), float(tp)
    denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    if denominator == 0:
        # Degenerate matrix (e.g. only one class predicted): avoid dividing by zero.
        return 0.0
    return (tp * tn - fp * fn) / denominator

In [None]:
# Fit the TF-IDF vectorizer on the training texts only, then apply the fitted
# vectorizer to the hold-out texts.
# Both names are rebound from text Series to the transformed matrices, so
# re-running this cell alone would feed the matrices back into the vectorizer.
X_train_clicked = vectorizer.fit_transform(X_train_clicked)
X_test_clicked = vectorizer.transform(X_test_clicked)

In [None]:
# Quick sanity check of the filtered frame (pandas' standard .info() summary).
dfr_clicked.info()

In [None]:
# Show the filtered reviews whose `useful` count is positive (rendered as the cell output).
dfr_clicked.query('useful > 0')

In [None]:
# Hyper-parameter search for a linear SVM on the "clicked" training set.
#
# LinearSVC only accepts some penalty / loss / dual combinations:
#   * penalty='l1' with loss='hinge' is not supported at all,
#   * penalty='l1' with loss='squared_hinge' needs dual=False,
#   * penalty='l2' with loss='hinge' needs dual=True.
# A plain cross-product grid therefore contains candidates whose fits fail, and
# with warnings suppressed in this notebook those failures go unnoticed. Only the
# valid combinations are listed here.
shared_linsvc = {'C': [0.02, 0.025, 0.03, 0.035, 0.04, 0.05, 0.06],
                 'class_weight': ['balanced', None],
                 'max_iter': [10000]}
param_linsvc = [
    {'penalty': ['l2'], 'loss': ['hinge'], 'dual': [True], **shared_linsvc},
    {'penalty': ['l2'], 'loss': ['squared_hinge'], **shared_linsvc},
    {'penalty': ['l1'], 'loss': ['squared_hinge'], 'dual': [False], **shared_linsvc},
]

# MCC scorer kept available as an alternative to the precision scoring used below.
mcc_scorer = make_scorer(matthews_corrcoef)

# random_state makes the solver's internal shuffling reproducible.
grid_linsvc = GridSearchCV(LinearSVC(random_state=RSEED),
                           param_grid=param_linsvc,
                           cv=5,
                           scoring='precision',  # alternatives: 'recall', mcc_scorer
                           verbose=5,
                           n_jobs=-1)

# fit the model
grid_linsvc.fit(X_train_clicked, y_train_clicked)

# Show best parameters
print('Best score:\n{:.2f}'.format(grid_linsvc.best_score_))
print("Best parameters:\n{}".format(grid_linsvc.best_params_))


In [None]:
# Predict the hold-out set with the best estimator found by the grid search.
y_pred_clicked = grid_linsvc.best_estimator_.predict(X_test_clicked)

# Build the confusion matrix once and reuse it for the plot and the MCC value.
cm_clicked = confusion_matrix(y_test_clicked, y_pred_clicked)
sns.heatmap(cm_clicked, annot=True, fmt='g')
print(mcc(cm_clicked))

# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(y_test_clicked, y_pred_clicked))

In [None]:
# SMOTE oversampler. A float sampling_strategy is the minority/majority ratio to
# reach after resampling. SMOTE generates its synthetic samples randomly, so it is
# seeded with the notebook-wide RSEED to keep runs reproducible.
smote = SMOTE(sampling_strategy=0.45, random_state=RSEED)

In [None]:
# Work on a copy of the processed review frame for the SMOTE experiment.
dfr_smote = dfr.copy()

# Target: 1 when the review's `useful` count is positive, otherwise 0.
y_smote = dfr_smote['useful'].apply(lambda n_useful: 1 if n_useful > 0 else 0)

# Features: cleaned tokens of every review, re-joined into one string.
X_smote = dfr_smote['text'].apply(lambda review: ' '.join(text_cleaning(review)))

# Stratified, seeded split with the default test share.
X_train_smote, X_test, y_train_smote, y_test = train_test_split(
    X_smote,
    y_smote,
    stratify=y_smote,
    random_state=RSEED,
)

# Re-fit the shared vectorizer on this training split, then apply it to the hold-out split.
X_train_smote = vectorizer.fit_transform(X_train_smote)
X_test = vectorizer.transform(X_test)

In [None]:
# Oversample the vectorised training split with SMOTE; the hold-out split
# (X_test, y_test) is not resampled.
# The same names are rebound, so this cell is not idempotent: running it a second
# time resamples data that has already been resampled.
X_train_smote, y_train_smote = smote.fit_resample(X_train_smote, y_train_smote)

In [None]:
# Hyper-parameter search for a linear SVM on the SMOTE-resampled training data,
# scored by F1.
#
# LinearSVC only accepts some penalty / loss / dual combinations:
#   * penalty='l1' with loss='hinge' is not supported at all,
#   * penalty='l1' with loss='squared_hinge' needs dual=False,
#   * penalty='l2' with loss='hinge' needs dual=True.
# Only the valid combinations are listed, so no candidate fails silently
# (warnings are suppressed in this notebook).
#
# NOTE(review): X_train_smote was oversampled before this cross-validation, so
# the validation folds contain synthetic samples and best_score_ is likely
# optimistic. An imblearn Pipeline (SMOTE -> LinearSVC) fitted on the
# un-resampled data would resample inside each fold instead.
shared_linsvc = {'C': [0.5, 1],  # wider alternative: [0.5, 1, 3, 5, 10]
                 'class_weight': ['balanced', None],
                 'max_iter': [1000]}
param_linsvc = [
    {'penalty': ['l2'], 'loss': ['hinge'], 'dual': [True], **shared_linsvc},
    {'penalty': ['l2'], 'loss': ['squared_hinge'], **shared_linsvc},
    {'penalty': ['l1'], 'loss': ['squared_hinge'], 'dual': [False], **shared_linsvc},
]

# random_state makes the solver's internal shuffling reproducible.
grid_linsvc = GridSearchCV(LinearSVC(random_state=RSEED),
                           param_grid=param_linsvc,
                           cv=5,
                           scoring='f1',  # alternatives: 'precision', 'recall', an MCC scorer
                           verbose=5,
                           n_jobs=-1)

# fit the model
grid_linsvc.fit(X_train_smote, y_train_smote)

# Show best parameters
print('Best score:\n{:.2f}'.format(grid_linsvc.best_score_))
print("Best parameters:\n{}".format(grid_linsvc.best_params_))

In [None]:
# Score the best estimator on the (resampled) training data and on the hold-out split.
best_linsvc = grid_linsvc.best_estimator_
y_pred_train = best_linsvc.predict(X_train_smote)
y_pred_smote = best_linsvc.predict(X_test)

# Training-set report (on the SMOTE-resampled data the model was fitted on).
print("train")
print(classification_report(y_train_smote, y_pred_train))

# Hold-out confusion matrix as a heatmap, followed by the hold-out report.
sns.heatmap(confusion_matrix(y_test, y_pred_smote), annot=True, fmt='g')
print("test")
print(classification_report(y_test, y_pred_smote))
