# Model Training

In [None]:
from nltk import sent_tokenize
from gensim.models import Word2Vec
from sklearn.svm import LinearSVC
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_val_score, HalvingGridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from utils.text_preprocess import LemmaTokenizer

# Load the tab-separated review dataset, then derive two columns:
#   VERIFIED_PURCHASE - the 'Y'/'N' flag recoded as 1/0
#   REVIEW_LENGTH     - len() of each review text
df = pd.read_csv('data/amazon_reviews.txt', sep='\t')
df = df.assign(
    VERIFIED_PURCHASE=df['VERIFIED_PURCHASE'].map({'Y': 1, 'N': 0}),
    REVIEW_LENGTH=df['REVIEW_TEXT'].apply(len),
)

# Candidate estimators; every classifier is later paired with every vectorizer.
# Seeds are pinned on the estimators that are given a random_state.
classifiers = [
    LinearSVC(random_state=42),
    MultinomialNB(),
    RandomForestClassifier(n_jobs=-1, random_state=42),
]

# Bag-of-words feature extractors. Both tokenize with the project's
# LemmaTokenizer; token_pattern is set to None because a custom tokenizer
# is supplied instead.
vectorizers = [
    TfidfVectorizer(
        tokenizer=LemmaTokenizer(),
        token_pattern=None,
    ),
    CountVectorizer(
        tokenizer=LemmaTokenizer(),
        token_pattern=None,
        max_features=500,
    ),
]

# Word2Vec model saved on disk; its word vectors are used below to build
# the sentence embeddings.
word2vec_model_file = 'models/word2vec_200.model'
model = Word2Vec.load(word2vec_model_file)

def average_word_embeddings(sentence, model):
    """Return the mean word vector of ``sentence``.

    The sentence is tokenized with ``LemmaTokenizer`` and every token that is
    present in ``model.wv`` contributes its vector to the element-wise mean.

    Args:
        sentence: Text of a single sentence.
        model: Word2Vec-style model exposing ``wv`` and ``vector_size``.

    Returns:
        A 1-D array: the mean of the known tokens' vectors, or a zero vector
        of length ``model.vector_size`` when no token is in the vocabulary.
    """
    tokens = LemmaTokenizer()(sentence)
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]

    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # Nothing in the sentence is known to the model.
    return np.zeros(model.vector_size)

def process_review(review_text, model):
    """Embed every sentence of a review.

    Args:
        review_text: Full text of one review.
        model: Word-embedding model forwarded to ``average_word_embeddings``.

    Returns:
        A list with one averaged word-embedding vector per sentence found by
        ``sent_tokenize``, in sentence order.
    """
    return [
        average_word_embeddings(sentence, model)
        for sentence in sent_tokenize(review_text)
    ]

# Per-review list of sentence embeddings (one vector per sentence).
df['Sentence_Embeddings'] = df['REVIEW_TEXT'].apply(process_review, args=(model,))

class DenseTransformer(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that turns sparse input into a dense array."""

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X.toarray()`` when ``X`` offers it, otherwise ``X`` itself."""
        if hasattr(X, 'toarray'):
            return X.toarray()
        return X

class SentenceEmbeddingTransformer(BaseEstimator, TransformerMixin):
    """Collapse each review's sentence embeddings into one fixed-size vector.

    Every input item holds the sentence embeddings of one review (a list of
    1-D vectors or a 2-D array with one row per sentence). The output row for
    that review is the element-wise mean over its sentences. A review without
    any sentence embeddings is mapped to a zero vector of the same width.
    """

    @staticmethod
    def _as_arrays(X):
        """Convert every item of ``X`` to a numpy array."""
        return [np.asarray(item) for item in X]

    @staticmethod
    def _infer_dim(arrays):
        """Return the embedding width of the first non-empty item, or None."""
        for arr in arrays:
            if arr.size > 0:
                return arr.shape[-1]
        return None

    def fit(self, X, y=None):
        """Record the embedding width seen in ``X`` and return ``self``.

        The width is kept in ``embedding_dim_`` so that ``transform`` can
        still build zero vectors for a batch made only of empty reviews.
        """
        dim = self._infer_dim(self._as_arrays(X))
        if dim is not None:
            self.embedding_dim_ = dim
        return self

    def transform(self, X):
        """Average the sentence embeddings of each review.

        Args:
            X: Iterable with one entry per review; each entry is a list of
                sentence vectors or a 2-D array (possibly empty).

        Returns:
            A 2-D array with one row per review.

        Raises:
            ValueError: If every entry is empty and no embedding width was
                recorded by ``fit``, so the zero-vector size is unknown.
        """
        arrays = self._as_arrays(X)

        # An empty entry has shape (0,), so its width cannot be read from the
        # entry itself; take it from the batch or from what fit() recorded.
        dim = self._infer_dim(arrays)
        if dim is None:
            dim = getattr(self, 'embedding_dim_', None)

        if dim is None:
            if not arrays:
                return np.array([])
            raise ValueError(
                'Cannot determine the embedding size: every item is empty '
                'and the transformer was not fitted on non-empty data.'
            )

        return np.array([
            np.mean(arr, axis=0) if arr.size > 0 else np.zeros(dim)
            for arr in arrays
        ])
# Build one pipeline for every combination of classifier, vectorizer,
# sentence embeddings on/off and verified-purchase flag on/off.
# pipes_names[i] is the label of pipes[i].
# NOTE(review): MultinomialNB is documented to require non-negative features,
# while averaged word embeddings can presumably be negative - confirm that the
# MultinomialNB + embeddings combinations actually fit.
pipes = []
pipes_names = []
for clf in classifiers:
    # Only the MultinomialNB pipelines get a densifying step after the
    # ColumnTransformer; the other classifiers take its output directly.
    needs_dense = isinstance(clf, MultinomialNB)

    for vectorizer in vectorizers:
        for useEmbeddings in [True, False]:
            for useVP in [True, False]:
                # Text features and the raw review length are always present.
                transformers = [
                    ('vectorizer', vectorizer, 'REVIEW_TEXT'),
                    ('length', 'passthrough', ['REVIEW_LENGTH']),
                ]
                if useEmbeddings:
                    transformers.append(
                        ('embeddings', SentenceEmbeddingTransformer(), 'Sentence_Embeddings')
                    )
                if useVP:
                    transformers.append(
                        ('encoder', OneHotEncoder(), ['VERIFIED_PURCHASE'])
                    )

                steps = [('preprocessor', ColumnTransformer(transformers, n_jobs=-1))]
                if needs_dense:
                    steps.append(('to_dense', DenseTransformer()))
                steps.append(('clf', clf))
                pipes.append(Pipeline(steps))

                # Disabled options leave an empty segment in the name.
                name_parts = (
                    clf.__class__.__name__,
                    vectorizer.__class__.__name__,
                    'WE' if useEmbeddings else '',
                    'VP' if useVP else '',
                )
                pipes_names.append('_'.join(name_parts))

# Hyper-parameter search spaces keyed by classifier class name. Each
# parameter carries the 'clf__' prefix so that it is routed to the pipeline
# step named 'clf'.
param_grids = dict(
    LinearSVC=dict(
        clf__C=np.arange(2, 10, 2),
        clf__max_iter=[2000, 4000],
        clf__loss=['hinge', 'squared_hinge'],
    ),
    RandomForestClassifier=dict(
        clf__n_estimators=[100, 200, 400, 800],
        clf__criterion=['gini', 'entropy', 'log_loss'],
        clf__max_depth=[None, 20, 40, 60],
        clf__bootstrap=[True],
    ),
    MultinomialNB=dict(
        clf__alpha=[0.0001, 0.001, 0.1, 0.5, 1.0],
    ),
)

# Nested cross-validation: outer_cv produces the train/test folds used for
# the reported scores, inner_cv is used by the hyper-parameter search that
# runs on each outer training fold.
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for pipe, pipe_name in zip(pipes, pipes_names):
    print(pipe_name)

    # Pick the search space that matches the pipeline's final estimator.
    clf_name = pipe.steps[-1][1].__class__.__name__
    grid = param_grids[clf_name]
    clf = HalvingGridSearchCV(pipe, grid, cv=inner_cv, factor=2, random_state=42, n_jobs=-1)

    scores = []
    # Loop instead of cross_val_score to have more control over printing
    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(df, df['LABEL'])):
        print(f"\nFold {fold}:")

        # Split the data
        X_train, X_test = df.iloc[train_idx], df.iloc[test_idx]
        y_train, y_test = df['LABEL'].iloc[train_idx], df['LABEL'].iloc[test_idx]

        # Debug aid: print the first preprocessed training row so the feature
        # layout can be inspected. This fits the preprocessor an extra time.
        tmp = pipe.named_steps['preprocessor'].fit_transform(X_train)
        print(tmp[0])

        clf.fit(X_train, y_train)
        scores.append(clf.score(X_test, y_test))

        print(f"Best params: {clf.best_params_}")
        print(f"Best score: {clf.best_score_}")
        print(f"Test score: {scores[-1]}")

    # scores is a plain list, which has no .mean(); average it with numpy.
    print(f'Accuracies: {scores}\n'
          f'Mean Accuracy: {np.mean(scores)}\n')


  text = BeautifulSoup(text, "html.parser").get_text()


LinearSVC_TfidfVectorizer_WE_VP

Fold 0:
  (0, 852)	0.3146327143841256
  (0, 2572)	0.33762293932767545
  (0, 4106)	0.2844732352491088
  (0, 9077)	0.29783850851486987
  (0, 9493)	0.39766840274834825
  (0, 11251)	0.2529684210489428
  (0, 13232)	0.230142245150199
  (0, 14727)	0.3888962647599638
  (0, 15785)	0.32145160084596025
  (0, 17210)	0.29616572577813016
  (0, 19507)	116.0
  (0, 19508)	1.0
