In [None]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")

# importing all needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import nltk
from nltk.corpus import stopwords
# from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils
import fasttext
import string

import time
from tqdm import tqdm

# Silence warnings for the whole session.
# NOTE(review): a blanket "ignore" filter hides every warning category, including
# numeric RuntimeWarnings and library deprecation notices raised by later cells —
# consider narrowing it to the specific categories that are known to be noisy.
import warnings
warnings.filterwarnings("ignore")

# Random seed; passed as random_state to train_test_split further down so the
# train/test split is reproducible.
RSEED = 42

# import needed functions
from scripts.processing import *

In [None]:
# Read the review CSV into the working dataframe `dfr`.
# The path is relative to the directory the notebook is run from.

dfr = pd.read_csv(filepath_or_buffer='../data/yelp_dataset/review_1819.csv')

In [None]:
# filter for only english reviews
# NOTE(review): language_processing is not defined in this notebook; presumably it
# comes from the star import of scripts.processing — confirm. `dfr` is rebound to the
# helper's return value, so re-running this cell feeds the already-processed frame
# back into the helper.

dfr = language_processing(dfr, verbose=True)

In [None]:
# initialize the stopword list with NLTK's English stopwords.
# NOTE: this rebinds the name `stopwords`, which the import cell bound to the
# nltk.corpus.stopwords corpus reader; after this cell `stopwords` is the word list.
# The fully qualified nltk.corpus.stopwords access is what keeps this cell re-runnable
# once the name has been rebound.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded — confirm.
stopwords = nltk.corpus.stopwords.words('english')

# update the stopwords after generating the first few clouds with non decisive words
#additional_stopwords = ['one', 'go', 'also', 'would', 'get', 'got']
#stopwords.extend(additional_stopwords)

In [None]:
# remove punctuation (every character in string.punctuation) from the text in the initial df
# The translation table is identical for all rows, so it is built once here instead
# of being rebuilt for every single review inside the lambda.
# dfr['text'] = dfr['text'].apply(remove_punctuation)
punctuation_table = str.maketrans('', '', string.punctuation)
dfr['text'] = dfr['text'].apply(lambda s: s.translate(punctuation_table))

In [None]:
# Feature: the review text. Target: 1 when the review has a positive "useful"
# count, 0 otherwise.
X = dfr['text']
# y = dfr['useful'].apply(lambda x: 1 if x > 1 else 0)
y = (dfr['useful'] > 0).astype('int64')
# Text and label are split together (as one two-column frame) so they stay aligned;
# stratifying on y keeps the class ratio the same in both parts.
train_set, test_set = train_test_split(
    pd.concat([X, y], axis=1),
    stratify=y,
    random_state=RSEED,
)

In [None]:
# Number of training rows whose label is non-zero (the "useful" class).
train_set.query('useful != 0')['useful'].count()

In [None]:
# Share of training rows in the "useful" class — a quick class-balance check.
train_set.query('useful != 0')['useful'].count() / train_set['useful'].count()

In [None]:
# Export the training split in fastText's supervised format: one example per line,
# written as "__label__<0|1> <text>".
# fastText expects UTF-8 input, so the encoding is set explicitly instead of relying
# on the platform default, which is not UTF-8 everywhere and can fail on (or
# mis-encode) reviews containing non-ASCII characters.
# Tokenization: fastText splits the line on whitespace itself, so the only
# preprocessing needed here is folding embedded newlines into spaces to keep each
# review on a single line.
# Iterating the two columns directly avoids the per-row Series construction of iterrows().
with open('train_set.txt', 'w', encoding='utf-8') as f:
    for text, useful in zip(train_set['text'], train_set['useful']):
        text_one_line = text.replace('\n', ' ')
        f.write(f"__label__{int(useful)} {text_one_line}\n")

In [None]:
# Fit a supervised fastText classifier on the exported training file; no
# hyperparameters are passed, so the library defaults apply.
# documentation: https://fasttext.cc/docs/en/python-module.html
ft_model = fasttext.train_supervised(input='train_set.txt')

In [None]:
# with open('test_set.txt', 'w') as f:
#     for idx, row in test_set.iterrows():
#         text_one_line = row.text.replace('\n', ' ') # TODO check how tokenization is done
#         f.write(f"__label__{int(row.useful)} {text_one_line}\n")

In [None]:
# ft_model.test('test_set.txt')

In [None]:
# Ground-truth labels rendered with the "__label__" prefix, i.e. in the same form
# as the labels written to the training file, so they can be compared with the
# model's predictions directly.
y_test = [f"__label__{label}" for label in test_set.useful]

In [None]:
# make predictions
# Newlines are replaced by spaces first (fastText's predict handles one line per
# input); with k=1 each entry of the returned label list holds exactly one label.
test_texts = [text.replace('\n', ' ') for text in test_set.text]
predicted_labels = ft_model.predict(test_texts, k=1)[0]
y_pred = [labels[0] for labels in predicted_labels]

# test the model: confusion matrix with true labels as rows, predictions as columns
test_confusion = confusion_matrix(y_test, y_pred)
sns.heatmap(test_confusion, annot=True, fmt='g')

# show the classification report
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
def mcc(cm):
    """Compute the Matthews correlation coefficient from a 2x2 confusion matrix.

    Parameters
    ----------
    cm : 2x2 array-like (nested list or numpy array)
        Confusion matrix read as ``[[tn, fp], [fn, tp]]``: first row is unpacked
        into (tn, fp), second row into (fn, tp).

    Returns
    -------
    float
        ``(tp*tn - fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))``, or ``0.0``
        when the denominator is zero (a whole row or column of the matrix is empty).

    Notes
    -----
    The four cells are also printed, one matrix row per line.
    """
    # Work with Python ints: the product of the four marginals easily exceeds the
    # int64 range for large test sets, and fixed-width numpy integers would wrap
    # around silently instead of raising.
    tn, fp = (int(count) for count in cm[0])
    fn, tp = (int(count) for count in cm[1])
    print(tn, fp)
    print(fn, tp)
    denominator = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if denominator == 0:
        # Degenerate matrix: the coefficient is undefined; 0.0 is the usual convention.
        return 0.0
    return (tp * tn - fp * fn) / denominator ** 0.5

In [None]:
# confusion_matrix takes (y_true, y_pred). With that order the rows are the true
# classes, so the tn/fp and fn/tp cells printed by mcc are labelled correctly and
# agree with the confusion matrix plotted for the heatmap. (The coefficient itself
# is unaffected by the order because the formula is symmetric in fp and fn.)
print(mcc(confusion_matrix(y_test, y_pred)))

In [None]:
# Spot-check the first 10 test reviews: top-2 predicted labels with their
# probabilities, the true label, and the (newline-free) text.
# Only these 10 rows are cleaned, instead of building a newline-stripped copy of
# the entire test set just to print its first few entries.
for row in test_set.head(10).itertuples(index=False):
    item = row.text.replace('\n', ' ')
    print(ft_model.predict(item, k=2), row.useful, item)