# Model Training

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_val_score, HalvingGridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


# Load the tab-separated Amazon review dataset.
df = pd.read_csv('data/amazon_reviews.txt', sep='\t')

# Encode the verified-purchase flag as 1 ('Y') / 0 ('N'). Any value without an
# entry in the mapping becomes NaN.
df['VERIFIED_PURCHASE'] = df['VERIFIED_PURCHASE'].map({'Y': 1, 'N': 0})

# read_csv turns empty fields (and default NA tokens such as "NA" or "null")
# into NaN, and len() raises TypeError on a float NaN. Replace missing review
# text with an empty string so the length feature is defined (0) on every row.
df['REVIEW_TEXT'] = df['REVIEW_TEXT'].fillna('')
df['REVIEW_LENGTH'] = df['REVIEW_TEXT'].str.len()

# Candidate estimators compared in the experiments below. The random forest is
# seeded for reproducibility; n_jobs=-1 requests all available CPU cores.
classifiers = [
    RandomForestClassifier(n_jobs=-1, random_state=42),
    GaussianNB(),
    KNeighborsClassifier(n_jobs=-1),
]

# Text vectorizers to compare.
# NOTE(review): tfidf_vectorizer and bow_vectorizer are not defined in this
# cell; they must already exist in the session (presumably a TF-IDF and a
# bag-of-words vectorizer -- confirm against the cell that creates them).
vectorizers = [tfidf_vectorizer, bow_vectorizer]

class DenseTransformer(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that densifies sparse input.

    Anything exposing a ``toarray`` method is converted by calling it; every
    other input is passed through unchanged.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X.toarray()`` when available, otherwise ``X`` as-is."""
        if not hasattr(X, 'toarray'):
            return X
        return X.toarray()
    
# Build one pipeline per (classifier, vectorizer, use-verified-purchase)
# combination; pipes_names[i] is the human-readable label of pipes[i].
pipes = []
pipes_names = []
for clf in classifiers:
    for vectorizer in vectorizers:
        for useVP in (True, False):
            # The review text is always vectorized; the verified-purchase
            # flag is one-hot encoded only for the "VP" variants.
            transformers = [('vectorizer', vectorizer, 'REVIEW_TEXT')]
            if useVP:
                transformers += [('encoder', OneHotEncoder(), ['VERIFIED_PURCHASE'])]

            steps = [('preprocessor', ColumnTransformer(transformers, n_jobs=-1))]
            if isinstance(clf, GaussianNB):
                # GaussianNB gets an extra step that converts the (possibly
                # sparse) preprocessor output into a dense array; the other
                # classifiers consume the preprocessor output directly.
                steps.append(('to_dense', DenseTransformer()))
            steps.append(('clf', clf))
            pipes.append(Pipeline(steps))

            vp_tag = 'VP' if useVP else 'noVP'
            pipes_names.append(
                f'{type(clf).__name__}_{type(vectorizer).__name__}_{vp_tag}'
            )

# Hyper-parameter search spaces, looked up by the classifier's class name.
# Parameter keys carry the 'clf__' prefix so they address the pipeline step
# named 'clf'.
param_grids = dict(
    RandomForestClassifier={
        'clf__n_estimators': [50, 100, 200],
        'clf__max_depth': [10, 20, None],
        'clf__min_samples_split': [2, 5, 10],
    },
    KNeighborsClassifier={
        'clf__n_neighbors': [3, 5, 7],
        'clf__weights': ['uniform', 'distance'],
        # Minkowski power: 1 is Manhattan distance, 2 is Euclidean distance.
        'clf__p': [1, 2],
    },
    GaussianNB={
        'clf__var_smoothing': [1e-9, 1e-8, 1e-7],
    },
)

# Nested cross-validation: inner_cv drives the hyper-parameter search and
# outer_cv scores the tuned search object. Two separate splitter instances
# with identical settings are created.
inner_cv, outer_cv = (
    StratifiedKFold(n_splits=5, shuffle=True, random_state=42) for _ in range(2)
)

for pipe_name, pipe in zip(pipes_names, pipes):
    print(pipe_name)

    # Pick the grid that matches the pipeline's final ('clf') step.
    clf_name = type(pipe.named_steps['clf']).__name__
    grid = param_grids[clf_name]

    # Successive-halving grid search, itself evaluated by the outer folds.
    clf = HalvingGridSearchCV(
        estimator=pipe,
        param_grid=grid,
        factor=2,
        cv=inner_cv,
        random_state=42,
        n_jobs=-1,
    )
    scores = cross_val_score(
        clf,
        X=df,
        y=df['LABEL'],
        scoring='accuracy',
        cv=outer_cv,
        n_jobs=-1,
    )

    print(scores)
    print(f'Accuracies: {scores!s}\nMean Accuracy: {scores.mean()!s}\n')
