# Model Training

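Prepare the dataframe: load the reviews, encode the verified-purchase flag, and compute sentence embeddings for each review.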

In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk import sent_tokenize
from scipy.sparse import hstack
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import LinearSVC, SVC

from utils.text_preprocess import LemmaTokenizer

# Review corpus: a tab-separated file. The 'Y'/'N' verified-purchase flag is
# recoded to 1/0 (any other value becomes NaN, as with a plain .map()).
df = pd.read_table('data/amazon_reviews.txt')
df = df.assign(VERIFIED_PURCHASE=df['VERIFIED_PURCHASE'].map({'Y': 1, 'N': 0}))

# Word2Vec model loaded from disk; it is used below to embed review sentences.
word2vec_model_file = 'models/word2vec_200.model'
model = Word2Vec.load(word2vec_model_file)

def average_word_embeddings(sentence, model):
    """Embed one sentence as the mean of its known word vectors.

    Parameters
    ----------
    sentence : str
        Text that is tokenised with the project's ``LemmaTokenizer``.
    model : gensim Word2Vec model
        Provides the vectors (``model.wv``) and the size (``model.vector_size``).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens present in ``model.wv``,
        or a zero vector of length ``model.vector_size`` when no token is known.
    """
    tokens = LemmaTokenizer()(sentence)
    keyed_vectors = model.wv

    vectors = []
    for token in tokens:
        # Out-of-vocabulary tokens are skipped rather than raising KeyError.
        if token in keyed_vectors:
            vectors.append(keyed_vectors[token])

    if not vectors:
        return np.zeros(model.vector_size)

    return np.mean(vectors, axis=0)

# Turn one review into a list of per-sentence embeddings
def process_review(review_text, model):
    """Split a review into sentences and embed each sentence.

    Parameters
    ----------
    review_text : str
        Raw review text, split with NLTK's ``sent_tokenize``.
    model : gensim Word2Vec model
        Forwarded unchanged to ``average_word_embeddings``.

    Returns
    -------
    list of numpy.ndarray
        One averaged embedding per sentence, in sentence order; an empty list
        when ``sent_tokenize`` yields no sentences.
    """
    return [
        average_word_embeddings(single_sentence, model)
        for single_sentence in sent_tokenize(review_text)
    ]

# Store, for every review, the list of its per-sentence embeddings.
df['Sentence_Embeddings'] = df['REVIEW_TEXT'].apply(process_review, args=(model,))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, "html.parser").get_text()


Create pipelines

In [2]:

class DenseTransformer(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that converts sparse input to a dense array.

    Anything exposing a ``toarray`` method is densified through it; every
    other input is passed through untouched.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X.toarray()`` when that method exists, otherwise ``X``."""
        if hasattr(X, 'toarray'):
            return X.toarray()
        return X

class SentenceEmbeddingTransformer(BaseEstimator, TransformerMixin):
    """Collapse each review's sentence embeddings into one averaged vector.

    Every element of the input is the collection of sentence embeddings of a
    single review (a list of 1-D arrays or a 2-D array). The output row for a
    review is the mean over its sentences. A review without any sentence
    embedding is mapped to a zero vector.

    Parameters
    ----------
    vector_size : int or None, default=None
        Length of the embedding vectors. When ``None`` the length is taken
        from the data seen in ``fit`` or, failing that, from the batch given
        to ``transform``. It is only needed to build the zero vector of
        reviews that have no sentence embedding.
    """

    def __init__(self, vector_size=None):
        self.vector_size = vector_size

    @staticmethod
    def _infer_vector_size(arrays):
        """Return the embedding length of the first non-empty entry, or None."""
        for arr in arrays:
            if arr.size > 0:
                return arr.shape[-1]
        return None

    def fit(self, X, y=None):
        """Remember the embedding length so empty reviews can be zero-filled.

        Nothing else is learned; ``transform`` also works on an unfitted
        instance as long as the length can be resolved from the batch.
        """
        if self.vector_size is not None:
            self.vector_size_ = self.vector_size
        else:
            self.vector_size_ = self._infer_vector_size(
                [np.asarray(embeddings) for embeddings in X]
            )
        return self

    def transform(self, X):
        """Average the sentence embeddings of every review.

        Parameters
        ----------
        X : iterable
            One entry per review; each entry is a list of sentence vectors or
            an array of shape (n_sentences, vector_size). Entries may be empty.

        Returns
        -------
        numpy.ndarray
            One averaged vector per review (zero vector for empty entries).

        Raises
        ------
        ValueError
            If the embedding length cannot be determined, i.e. every entry is
            empty, ``vector_size`` was not given and ``fit`` saw no embedding.
        """
        arrays = [np.asarray(embeddings) for embeddings in X]

        # Resolution order: explicit parameter, value learned in fit, batch.
        dim = self.vector_size
        if dim is None:
            dim = getattr(self, 'vector_size_', None)
        if dim is None:
            dim = self._infer_vector_size(arrays)

        if not arrays:
            return np.zeros((0, dim or 0))
        if dim is None:
            raise ValueError(
                "Cannot determine the embedding size: all entries are empty. "
                "Pass vector_size=... or fit on data with at least one embedding."
            )

        transformed = []
        for arr in arrays:
            if arr.size > 0:
                # atleast_2d keeps a lone 1-D sentence vector as a single row.
                transformed.append(np.atleast_2d(arr).mean(axis=0))
            else:
                # np.array([]) has shape (0,), so the length cannot be read
                # from the entry itself; use the resolved size instead.
                transformed.append(np.zeros(dim))

        return np.array(transformed)
    
# Candidate classifiers; every one is combined with every vectorizer below.
classifiers = [
    CalibratedClassifierCV(LinearSVC(random_state=42, dual='auto')),
    RandomForestClassifier(random_state=42, n_jobs=-1),
    MultinomialNB(),
]
# Bag-of-words representations of REVIEW_TEXT; token_pattern=None because a
# custom tokenizer is supplied.
vectorizers = [
    TfidfVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None),
    CountVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None),
]

# Build one pipeline per (classifier, vectorizer, embeddings on/off,
# verified-purchase on/off) combination; pipes[i] is described by pipes_names[i].
pipes = []
pipes_names = []
for clf in classifiers:
    # MultinomialNB refuses negative feature values at fit time, and averaged
    # word vectors contain negative components. Its embeddings are therefore
    # min-max scaled into [0, 1]; clip=True keeps unseen data in that range.
    is_naive_bayes = isinstance(clf, MultinomialNB)

    for vectorizer in vectorizers:
        for useEmbeddings in [True, False]:
            for useVP in [True, False]:
                transformers = [
                    ('vectorizer', vectorizer, 'REVIEW_TEXT')
                ]
                if useEmbeddings:
                    if is_naive_bayes:
                        embedder = Pipeline([
                            ('average', SentenceEmbeddingTransformer()),
                            ('scale', MinMaxScaler(clip=True)),
                        ])
                    else:
                        embedder = SentenceEmbeddingTransformer()
                    transformers.append(('embeddings', embedder, 'Sentence_Embeddings'))
                if useVP:
                    transformers.append(('encoder', OneHotEncoder(), ['VERIFIED_PURCHASE']))

                steps = [('preprocessor', ColumnTransformer(transformers, n_jobs=-1))]
                if is_naive_bayes:
                    # Convert the combined feature matrix to dense before MultinomialNB.
                    steps.append(('to_dense', DenseTransformer()))
                steps.append(('clf', clf))
                pipes.append(Pipeline(steps))

                pipes_names.append(f'{clf.__class__.__name__}_'
                                   f'{vectorizer.__class__.__name__}'
                                   f'_{"WE" if useEmbeddings else ""}'
                                   f'_{"VP" if useVP else ""}')

Nested cross-validation

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from nltk import sent_tokenize
import pandas as pd
import joblib
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_val_score, HalvingGridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from utils.text_preprocess import LemmaTokenizer


# Hyper-parameter grids, keyed by the class name of the pipeline's final step.
param_grids = {
    'CalibratedClassifierCV': {
        'clf__estimator__C': [0.5, 1, 1.5],
        'clf__estimator__max_iter': [2000, 4000],
        'clf__estimator__loss': ['hinge', 'squared_hinge'],
    },
    'RandomForestClassifier': {
        'clf__n_estimators': [100, 200, 400, 600],
        'clf__criterion': ['gini', 'entropy', 'log_loss'],
        'clf__max_depth': [None, 20, 40, 60],
        'clf__bootstrap': [False, True],
    },
    'MultinomialNB': {
        'clf__alpha': [0.0001, 0.001, 0.1, 0.5, 1.0],
    }
}

# Inner folds tune hyper-parameters; outer folds estimate generalisation.
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {}  # pipe_name -> [accuracies, f1 scores, AUCs], one value per outer fold
best_accuracy = 0
best_model = None

for pipe, pipe_name in zip(pipes, pipes_names):
    print(pipe_name)
    clf_name = pipe.steps[-1][1].__class__.__name__
    grid = param_grids[clf_name]
    clf = HalvingGridSearchCV(pipe, grid, cv=inner_cv, factor=2, random_state=42, n_jobs=-1)

    # Created once per pipeline, before the fold loop, so the scores of every
    # outer fold are kept and the means below cover all folds.
    results[pipe_name] = [[], [], []]

    # Loop instead of cross_val_score to have more control over printing
    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(df, df['LABEL'])):
        print(f"\nFold {fold}:")

        # Split the data
        X_train, X_test = df.iloc[train_idx], df.iloc[test_idx]
        y_train, y_test = df['LABEL'].iloc[train_idx], df['LABEL'].iloc[test_idx]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results[pipe_name][0].append(accuracy_score(y_test, y_pred))
        results[pipe_name][1].append(f1_score(y_test, y_pred, pos_label='__label1__'))
        results[pipe_name][2].append(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))

        # Update best model and checkpoint it only when it actually changed.
        # NOTE(review): the model is selected on outer-fold test accuracy, so
        # that accuracy is an optimistic estimate for the saved model.
        if results[pipe_name][0][-1] > best_accuracy:
            best_accuracy = results[pipe_name][0][-1]
            best_model = clf.best_estimator_
            joblib.dump(best_model, 'best_model.pkl')
        print(f"Best params: {clf.best_params_}\n"
              f"Test accuracy: { results[pipe_name][0][-1]}\n"
              f"Test F1: { results[pipe_name][1][-1]}\n"
              f"Test AUC: { results[pipe_name][2][-1]}")

    print(f'Mean Accuracy: {np.mean(results[pipe_name][0])}\n'
          f'Mean F1: {np.mean(results[pipe_name][1])}\n'
          f'Mean AUC: {np.mean(results[pipe_name][2])}')

# save results to csv file (each metric cell holds the stringified per-fold list)
results_str = {key: [str(value) for value in values] for key, values in results.items()}

# Built in one go and stored under its own name: reusing `df` here would
# overwrite the review data and make this cell impossible to re-run.
results_df = pd.DataFrame(
    [[key, *value] for key, value in results_str.items()],
    columns=['model', 'accuracy', 'f1_score', 'AUC'],
)
results_df.to_csv('results.csv', index=False)

# save best model
joblib.dump(best_model, 'best_model.pkl')

CalibratedClassifierCV_TfidfVectorizer_WE_VP

Fold 0:




Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8023809523809524
Test F1: 0.7995169082125603
Test AUC: 0.8620913832199546

Fold 1:
Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 4000}
Test accuracy: 0.8023809523809524
Test F1: 0.8006724303554275
Test AUC: 0.8635798185941044

Fold 2:
Best params: {'clf__estimator__C': 1, 'clf__estimator__loss': 'hinge', 'clf__estimator__max_iter': 2000}
Test accuracy: 0.8071428571428572
Test F1: 0.8034934497816594
Test AUC: 0.8738383219954649

Fold 3:
