# Model Training

Prepare dataframe 

In [1]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from gensim.models import Word2Vec
from sklearn.svm import LinearSVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from utils.text_preprocess import LemmaTokenizer
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.experimental import enable_halving_search_cv

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
        
df = pd.read_csv('data/amazon_reviews.txt', sep='\t')
df['VERIFIED_PURCHASE'] = df['VERIFIED_PURCHASE'].map({'Y': 1, 'N': 0})

word2vec_model_file = 'models/word2vec_100_v2.model'
model = Word2Vec.load(word2vec_model_file)

def average_word_embeddings(sentence, model):
    words = LemmaTokenizer()(sentence)
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    
    return np.mean(word_vectors, axis=0)

# Define a function to process each review
def process_review(review_text, model):
    sentences = sent_tokenize(review_text)
    sentence_embeddings = [average_word_embeddings(sentence, model) for sentence in sentences]
    return sentence_embeddings

df['Sentence_Embeddings'] = df['REVIEW_TEXT'].apply(lambda x: process_review(x, model))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  text = BeautifulSoup(text, "html.parser").get_text()


Create pipelines

In [2]:
from utils.transformers import *
    
classifiers = [
    CalibratedClassifierCV(LinearSVC(random_state=42, dual="auto")),
    RandomForestClassifier(random_state=42, n_jobs=-1),
]
vectorizers = [
    TfidfVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None),
    CountVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None),
]

pipes = []
pipes_names = []
for clf in classifiers:
    for vectorizer in vectorizers:
        for useEmbeddings in [True]:
            for useVP in [True, False]:
                transformers = [
                    ('vectorizer', vectorizer, 'REVIEW_TEXT')
                ]
                if useEmbeddings:
                    transformers.append(('embeddings', SentenceEmbeddingTransformer(), 'Sentence_Embeddings'))
                if useVP:
                    transformers.append(('vp', 'passthrough', ['VERIFIED_PURCHASE']))
                
                # No need for DenseTransformer for other classifiers
                pipes.append(Pipeline([
                    ('preprocessor', ColumnTransformer(transformers, n_jobs=-1)),
                    ('clf', clf)
                ]))
                    
                pipes_names.append(f'{clf.__class__.__name__}_'
                                   f'{vectorizer.__class__.__name__}'
                                   f'{"_WE" if useEmbeddings else ""}'
                                   f'{"_VP" if useVP else ""}')

Nested cross-validation

In [3]:
from nltk import sent_tokenize
import pandas as pd
import joblib
from sklearn.model_selection import HalvingGridSearchCV, StratifiedKFold
from utils.text_preprocess import LemmaTokenizer


param_grids = {
    'RandomForestClassifier': {
        'clf__n_estimators': [800, 1000, 1200],
        'clf__criterion': ['entropy', 'log_loss'],
        'clf__max_depth': [None, 20, 40, 60],
        'clf__bootstrap': [False, True],
    },
    'CalibratedClassifierCV': {
        'clf__estimator__C': [0.5, 1, 1.5],
        'clf__estimator__max_iter': [2000, 4000],
        'clf__estimator__loss': ['hinge', 'squared_hinge'],
    },
}

inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results = {}
best_accuracy = 0
best_model = None

for pipe, pipe_name in zip(pipes, pipes_names):
    print(pipe_name)
    clf_name = pipe.steps[-1][1].__class__.__name__
    grid = param_grids[clf_name]
    clf = HalvingGridSearchCV(pipe, grid, cv=inner_cv, factor=3, random_state=42, n_jobs=-1)
    results[pipe_name] = [[], [], [], []]
    
    # Loop instead of cross_val_score to have more control over printing
    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(df, df['LABEL'])):
        print(f"Fold {fold}:")
        
        # Split the data
        X_train, X_test = df.iloc[train_idx], df.iloc[test_idx]
        y_train, y_test = df['LABEL'].iloc[train_idx], df['LABEL'].iloc[test_idx]
        
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results[pipe_name][0].append(accuracy_score(y_test, y_pred))
        results[pipe_name][1].append(f1_score(y_test, y_pred, pos_label='__label1__'))
        results[pipe_name][2].append(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))
        results[pipe_name][3].append(clf.best_params_)
        
        # Update best model 
        if results[pipe_name][0][-1] > best_accuracy:
            best_accuracy = results[pipe_name][0][-1]
            best_model = clf.best_estimator_

        print(f"Best params: {clf.best_params_}\n"
              f"Test accuracy: { results[pipe_name][0][-1]}\n"
              f"Test F1: { results[pipe_name][1][-1]}\n"
              f"Test AUC: { results[pipe_name][2][-1]}\n")
    
    print(f'Mean Accuracy: {np.mean(results[pipe_name][0])}\n'
          f'Mean F1: {np.mean(results[pipe_name][1])}\n'
          f'Mean AUC: {np.mean(results[pipe_name][2])}\n')

# save results to csv file
results_str = {key: [str(value) for value in values] for key, values in results.items()}

df = pd.DataFrame()
for key, value in results_str.items():
    temp_df = pd.DataFrame([value], columns=['accuracy', 'f1_score', 'AUC', 'hyperparameters'])
    temp_df.insert(0, 'model', key)
    df = pd.concat([df, temp_df], ignore_index=True)
df.to_csv('results2.csv', index=False)

# save best model
joblib.dump(best_model, 'models/best_model_2.pkl')

CalibratedClassifierCV_TfidfVectorizer_WE_VP
Fold 0:
Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7985714285714286
Test F1: 0.7971223021582734
Test AUC: 0.8566149659863946

Fold 1:
Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7985714285714286
Test F1: 0.7943607194944093
Test AUC: 0.863936507936508

Fold 2:




Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8080952380952381
Test F1: 0.8055957549445248
Test AUC: 0.8621886621315193

Fold 3:
Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7980952380952381
Test F1: 0.7947725072604066
Test AUC: 0.8575210884353742

Fold 4:




Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8052380952380952
Test F1: 0.8023199613339778
Test AUC: 0.8658049886621315

Fold 5:
Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8061904761904762
Test F1: 0.8005879470847623
Test AUC: 0.8754766439909296

Fold 6:
Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8033333333333333
Test F1: 0.7998061076102763
Test AUC: 0.8659555555555555

Fold 7:




Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8285714285714286
Test F1: 0.8249027237354085
Test AUC: 0.8822721088435374

Fold 8:
Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7995238095238095
Test F1: 0.7957302280446386
Test AUC: 0.8502430839002268

Fold 9:
Best params: {'clf__estimator__C': 1.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8080952380952381
Test F1: 0.8076372315035799
Test AUC: 0.864160544217687

Mean Accuracy: 0.8054285714285714
Mean F1: 0.8022835483170259
Mean AUC: 0.8644174149659865

CalibratedClassifierCV_TfidfVectorizer_WE
Fold 0:
Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6333333333333333
Test F1: 0.6326335877862596
Test AUC: 0.6878938775510204

Fold 1:
Best params: {'clf__estimator__C': 0.5, 'cl



Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8004761904761905
Test F1: 0.7980722891566265
Test AUC: 0.8552108843537416

Fold 1:
Best params: {'clf__estimator__C': 1.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8023809523809524
Test F1: 0.7994200096665055
Test AUC: 0.8631102040816326

Fold 2:
Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8080952380952381
Test F1: 0.8061568061568062
Test AUC: 0.8618875283446712

Fold 3:




Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.799047619047619
Test F1: 0.7957405614714425
Test AUC: 0.8570548752834468

Fold 4:
Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8061904761904762
Test F1: 0.803096274794388
Test AUC: 0.8658394557823129

Fold 5:
Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.81
Test F1: 0.8064046579330422
Test AUC: 0.8730739229024942

Fold 6:
Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8042857142857143
Test F1: 0.7976366322008862
Test AUC: 0.8684861678004534

Fold 7:
Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8266666666666667
Test F1: 0.822265625
Test AUC: 0.88144308390022



Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7880952380952381
Test F1: 0.7849202513291446
Test AUC: 0.8407709750566893

Fold 1:
Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 4000}
Test accuracy: 0.780952380952381
Test F1: 0.777992277992278
Test AUC: 0.8360607709750567

Fold 2:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 4000}
Test accuracy: 0.8085714285714286
Test F1: 0.804093567251462
Test AUC: 0.856671201814059

Fold 3:
Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 4000}
Test accuracy: 0.7671428571428571
Test F1: 0.7687943262411348
Test AUC: 0.8267455782312925

Fold 4:
Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7942857142857143
Test F1: 0.7909002904162633
Test AUC: 0.8480000000000001

Fold 5:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8
Test F1: 0.7967086156824782
Test AUC: 0.8531718820861678

Fold 6:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7761904761904762
Test F1: 0.7764034253092293
Test AUC: 0.8419755102040816

Fold 7:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.800952380952381
Test F1: 0.7964946445959105
Test AUC: 0.8620190476190476

Fold 8:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7747619047619048
Test F1: 0.7729236677868458
Test AUC: 0.8251256235827664

Fold 9:
Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7957142857142857
Test F1: 0.7934520943668752
Test AUC: 0.8527573696145125

Mean Accuracy: 0.7886666666666666
Mean F1: 0.7862683160971622
Mean AUC: 0.8443297959183674

CalibratedClassifierCV_CountVectorizer_WE
Fold 0:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6114285714285714
Test F1: 0.631768953068592
Test AUC: 0.6597124716553286

Fold 1:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 4000}
Test accuracy: 0.6366666666666667
Test F1: 0.6598305840392331
Test AUC: 0.6914993197278911

Fold 2:




Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 4000}
Test accuracy: 0.6161904761904762
Test F1: 0.6306141154903758
Test AUC: 0.6599800453514739

Fold 3:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6190476190476191
Test F1: 0.635036496350365
Test AUC: 0.6660181405895691

Fold 4:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6266666666666667
Test F1: 0.6436363636363637
Test AUC: 0.6671909297052153

Fold 5:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 4000}
Test accuracy: 0.6323809523809524
Test F1: 0.6445672191528545
Test AUC: 0.6942467120181406

Fold 6:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 4000}
Test accuracy: 0.6128571428571429
Test F1: 0.6349348899865289
Test AUC: 0.6614639455782313

Fold 7:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6323809523809524
Test F1: 0.6513098464317977
Test AUC: 0.6704589569160999

Fold 8:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 4000}
Test accuracy: 0.6090476190476191
Test F1: 0.630013519603425
Test AUC: 0.6551981859410431

Fold 9:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6285714285714286
Test F1: 0.6454545454545455
Test AUC: 0.6705877551020408

Mean Accuracy: 0.6225238095238096
Mean F1: 0.6407166533214081
Mean AUC: 0.6696356462585034

CalibratedClassifierCV_CountVectorizer_VP
Fold 0:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7871428571428571
Test F1: 0.7839536007733204
Test AUC: 0.8386639455782313

Fold 1:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.78
Test F1: 0.7757281553398059
Test AUC: 0.8357333333333334

Fold 2:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8080952380952381
Test F1: 0.8046534173533689
Test AUC: 0.8558757369614511

Fold 3:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7980952380952381
Test F1: 0.7963496637848223
Test AUC: 0.8412081632653061

Fold 4:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7961904761904762
Test F1: 0.7922330097087379
Test AUC: 0.8485197278911565

Fold 5:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8038095238095239
Test F1: 0.798828125
Test AUC: 0.8640988662131519

Fold 6:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7823809523809524
Test F1: 0.7826913932477413
Test AUC: 0.8439047619047618

Fold 7:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8042857142857143
Test F1: 0.8003885381253035
Test AUC: 0.8615891156462585

Fold 8:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7819047619047619
Test F1: 0.7802303262955854
Test AUC: 0.8245251700680273

Fold 9:
Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.7828571428571428
Test F1: 0.7824427480916031
Test AUC: 0.8442349206349206

Mean Accuracy: 0.7924761904761904
Mean F1: 0.7897498977720289
Mean AUC: 0.84583537414966

CalibratedClassifierCV_CountVectorizer
Fold 0:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6052380952380952
Test F1: 0.6205949656750572
Test AUC: 0.6534068027210884

Fold 1:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6257142857142857
Test F1: 0.648479427549195
Test AUC: 0.6804571428571429

Fold 2:




Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6047619047619047
Test F1: 0.6213503649635036
Test AUC: 0.6479165532879819

Fold 3:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6204761904761905
Test F1: 0.63756252842201
Test AUC: 0.6648943310657596

Fold 4:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6152380952380953
Test F1: 0.6370170709793351
Test AUC: 0.6532625850340137

Fold 5:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6257142857142857
Test F1: 0.6404391582799634
Test AUC: 0.6839909297052155

Fold 6:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6176190476190476
Test F1: 0.6423162583518931
Test AUC: 0.6670947845804989

Fold 7:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6276190476190476
Test F1: 0.6471119133574007
Test AUC: 0.6655537414965986

Fold 8:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'squared_hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6033333333333334
Test F1: 0.6232473993668024
Test AUC: 0.6470476190476191

Fold 9:




Best params: {'clf__estimator__C': 0.5, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.6252380952380953
Test F1: 0.6434073402809243
Test AUC: 0.6680997732426304

Mean Accuracy: 0.6170952380952381
Mean F1: 0.6361526427226084
Mean AUC: 0.663172426303855

RandomForestClassifier_TfidfVectorizer_WE_VP
Fold 0:
Best params: {'clf__bootstrap': False, 'clf__criterion': 'entropy', 'clf__max_depth': 60, 'clf__n_estimators': 1200}
Test accuracy: 0.8076190476190476
Test F1: 0.8036929057337221
Test AUC: 0.8686757369614513

Fold 1:
Best params: {'clf__bootstrap': True, 'clf__criterion': 'entropy', 'clf__max_depth': 60, 'clf__n_estimators': 1200}
Test accuracy: 0.8033333333333333
Test F1: 0.7956457199406235
Test AUC: 0.8578829931972789

Fold 2:
Best params: {'clf__bootstrap': True, 'clf__criterion': 'log_loss', 'clf__max_depth': 60, 'clf__n_estimators': 1200}
Test accuracy: 0.8128571428571428
Test F1: 0.8072584600294261
Test AUC: 0.8608426303854876

Fold 3:
Best pa



Best params: {'clf__bootstrap': False, 'clf__criterion': 'log_loss', 'clf__max_depth': None, 'clf__n_estimators': 1200}
Test accuracy: 0.6385714285714286
Test F1: 0.650713299585826
Test AUC: 0.688027664399093

Fold 1:
Best params: {'clf__bootstrap': True, 'clf__criterion': 'log_loss', 'clf__max_depth': None, 'clf__n_estimators': 800}
Test accuracy: 0.6347619047619047
Test F1: 0.6540369869192603
Test AUC: 0.6935768707482993

Fold 2:
Best params: {'clf__bootstrap': True, 'clf__criterion': 'log_loss', 'clf__max_depth': 60, 'clf__n_estimators': 800}
Test accuracy: 0.6195238095238095
Test F1: 0.6339899221255153
Test AUC: 0.6713668934240363

Fold 3:
Best params: {'clf__bootstrap': False, 'clf__criterion': 'entropy', 'clf__max_depth': 60, 'clf__n_estimators': 1200}
Test accuracy: 0.6352380952380953
Test F1: 0.6524500907441017
Test AUC: 0.6852879818594104

Fold 4:
Best params: {'clf__bootstrap': False, 'clf__criterion': 'log_loss', 'clf__max_depth': 40, 'clf__n_estimators': 1200}
Test accuracy



Best params: {'clf__bootstrap': False, 'clf__criterion': 'log_loss', 'clf__max_depth': None, 'clf__n_estimators': 800}
Test accuracy: 0.63
Test F1: 0.65015758667267
Test AUC: 0.6764371882086169

Fold 9:
Best params: {'clf__bootstrap': True, 'clf__criterion': 'entropy', 'clf__max_depth': 60, 'clf__n_estimators': 800}
Test accuracy: 0.6157142857142858
Test F1: 0.630325240494732
Test AUC: 0.6646648526077098

Mean Accuracy: 0.6351428571428571
Mean F1: 0.6513258430756702
Mean AUC: 0.6871772335600907

RandomForestClassifier_CountVectorizer_VP
Fold 0:
Best params: {'clf__bootstrap': False, 'clf__criterion': 'log_loss', 'clf__max_depth': 60, 'clf__n_estimators': 1000}
Test accuracy: 0.8042857142857143
Test F1: 0.7925290257445734
Test AUC: 0.8699990929705216

Fold 1:
Best params: {'clf__bootstrap': True, 'clf__criterion': 'log_loss', 'clf__max_depth': None, 'clf__n_estimators': 1000}
Test accuracy: 0.7980952380952381
Test F1: 0.7838939857288482
Test AUC: 0.8599283446712017

Fold 2:
Best params:



Best params: {'clf__bootstrap': False, 'clf__criterion': 'entropy', 'clf__max_depth': None, 'clf__n_estimators': 800}
Test accuracy: 0.8095238095238095
Test F1: 0.8005982053838484
Test AUC: 0.8630893424036281

Mean Accuracy: 0.8053333333333335
Mean F1: 0.7921503485299014
Mean AUC: 0.8710356916099773

RandomForestClassifier_CountVectorizer
Fold 0:
Best params: {'clf__bootstrap': True, 'clf__criterion': 'log_loss', 'clf__max_depth': 40, 'clf__n_estimators': 800}
Test accuracy: 0.6247619047619047
Test F1: 0.6552930883639545
Test AUC: 0.6805124716553288

Fold 1:
Best params: {'clf__bootstrap': True, 'clf__criterion': 'log_loss', 'clf__max_depth': 60, 'clf__n_estimators': 800}
Test accuracy: 0.6466666666666666
Test F1: 0.6734154929577465
Test AUC: 0.7107437641723356

Fold 2:
Best params: {'clf__bootstrap': True, 'clf__criterion': 'entropy', 'clf__max_depth': 40, 'clf__n_estimators': 1200}
Test accuracy: 0.6142857142857143
Test F1: 0.6481320590790617
Test AUC: 0.682477097505669

Fold 3:
Best

['models/best_model.pkl']

## Statistical comparison

Test normality

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

model1 = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('vectorizer', TfidfVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None), 'REVIEW_TEXT'),
        ('vp', 'passthrough', ['VERIFIED_PURCHASE']),
        ('embeddings', SentenceEmbeddingTransformer(), 'Sentence_Embeddings'),
    ], n_jobs=-1)),
    ('clf', RandomForestClassifier(random_state=42, n_jobs=-1)),
])
model2 = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('vectorizer', TfidfVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None), 'REVIEW_TEXT'),
        ('vp', 'passthrough', ['VERIFIED_PURCHASE']),
        ('embeddings', SentenceEmbeddingTransformer(), 'Sentence_Embeddings'),
    ], n_jobs=-1)),
    ('clf', LinearSVC(random_state=42, dual="auto")),
])

rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# Store the differences between model performances
differences = []

for train_index, test_index in rkf.split(df, df['LABEL']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    y_train, y_test = df['LABEL'].iloc[train_index], df['LABEL'].iloc[test_index]

    model1.fit(X_train, y_train)
    model2.fit(X_train, y_train)
    
    y_pred1 = model1.predict(X_test)
    y_pred2 = model2.predict(X_test)
    
    acc1 = accuracy_score(y_test, y_pred1)
    acc2 = accuracy_score(y_test, y_pred2)
    
    differences.append(acc1 - acc2)

differences = np.array(differences)

# Histogram
sns.histplot(differences, kde=True)
plt.show()

# Q-Q plot
from scipy import stats
stats.probplot(differences, dist="norm", plot=plt)
plt.show()

Perform Wilcoxon signed-rank test

In [None]:
from scipy.stats import wilcoxon

w_p_value = wilcoxon(differences)
print('Wilcoxon signed-rank test p-value:', w_p_value)