# Searching for the best preprocessing params

Since we found that the Random Forest Classifier achieves fairly high results on the feature set, we wanted to check how it performs with different preprocessing parameter configurations.


In [3]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os

# Take the directory three levels above the current working directory as the
# project root and append it to the import search path; presumably this is
# what lets the project-local imports below resolve -- TODO confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))
sys.path.append(project_root)

# Show the resolved path so a wrong working directory is easy to spot.
print(project_root)

import machine_learning.ml_lib as ML
from data_extractor.data_extractor import load_df, extract_X_y_from_df 
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

In [5]:
from param_configs import preprocessing_param_combinations

In [6]:
class RandomForestModel:
    """Random forest classifier combined with EEG preprocessing and scaling.

    Raw inputs are passed through ``ML.preprocess_eeg_data``, standardised
    with a ``StandardScaler`` fitted on the training data, and then fed to a
    ``RandomForestClassifier`` built from fixed hyperparameters.
    """

    def __init__(self, preprocess_params=None) -> None:
        """
        Build the (still untrained) classifier and scaler.

        Args:
            preprocess_params: Optional mapping describing the preprocessing
                configuration this model belongs to. It is stored on the
                instance; an empty dict is used when nothing is given.
        """
        super().__init__()
        # Fixed classifier hyperparameters (presumably the best ones from an
        # earlier model search -- TODO confirm).
        # NOTE(review): no ``random_state`` is set, so repeated runs can build
        # different forests and scores are not exactly reproducible.
        self.best_model_param_grid = {
            'n_estimators': 300,
            'max_depth': None,
            'min_samples_split': 5,
            'max_features': "sqrt",
            'bootstrap': False,
            'class_weight': 'balanced'
        }
        self.model = RandomForestClassifier(**self.best_model_param_grid)
        self.scaler = StandardScaler()
        self.preprocess_params = preprocess_params or {}
        # Set by ``train``; lets ``predict`` reject an unfitted model up front.
        self._is_trained = False

    def train(self, X_train, y_train) -> None:
        """
        Train the model using the provided data.

        The scaler is fitted on the preprocessed training features, so the
        same scaling is reused later in ``predict``.
        """
        features = self._preprocess_data(X_train)
        scaled_features = self.scaler.fit_transform(features)
        self.model.fit(X=scaled_features, y=y_train)
        self._is_trained = True

    def predict(self, X_test) -> np.ndarray:
        """
        Predict the labels for the provided data.

        Raises:
            ValueError: If ``train`` has not been called yet.
        """
        # Check first: otherwise the unfitted scaler fails with its own error
        # before this message could ever be raised.
        if not self._is_trained or self.model is None:
            raise ValueError("Model has not been trained yet. Please call the 'train' method before making predictions.")
        features = self._preprocess_data(X_test)
        scaled_features = self.scaler.transform(features)
        return self.model.predict(scaled_features)

    def _preprocess_data(self, X: np.ndarray) -> np.ndarray:
        """
        Preprocess the data.

        Delegates to ``ML.preprocess_eeg_data``.
        """
        # NOTE(review): ``self.preprocess_params`` is stored but not forwarded
        # here -- confirm whether the helper should receive it.
        return ML.preprocess_eeg_data(X)

    def split_random42(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Split ``df`` 80/20 into train and test frames.

        The split uses ``random_state=42`` and is stratified on the
        ``'label'`` column, which therefore must exist in ``df``.
        """
        df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
        return df_train, df_test

    def _calculate_metrics(self, y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, object]:
        """
        Return accuracy, F1 score and confusion matrix for the predictions.
        """
        return {
            "accuracy": accuracy_score(y_true, y_pred),
            "f1_score": f1_score(y_true, y_pred),
            "confusion_matrix": confusion_matrix(y_true, y_pred)
        }

In [None]:
from datetime import datetime
from tqdm import tqdm
import json
import os

# Evaluate every preprocessing configuration with the same random forest
# setup, store one JSON file per configuration and report the best one.
results = []

# Ensure the directory for saving results exists
results_dir = 'results'

os.makedirs(results_dir, exist_ok=True)

for idx, param_combination in enumerate(tqdm(preprocessing_param_combinations, desc="Preprocessing Configurations")):
    try:
        # NOTE(review): this reads from "../../data/", while the later
        # "best params" cell reads from "../../../data/" -- confirm which
        # relative path is correct for this notebook's location.
        df = load_df("../../data/", **param_combination)

        # Keep only trials answered as desired from the REAL/FAKE data types.
        # Copy so that adding a column does not write into a filtered view.
        df = df.query("desired_answer == answer and data_type in ['REAL', 'FAKE']").copy()
        # Blocks 1 and 3 form the positive class, everything else is class 0.
        df['label'] = df['block_no'].isin([1, 3]).astype(int)

        X, y = extract_X_y_from_df(df)

        # A train/test split is impossible with fewer than two samples.
        if len(y) < 2:
            continue

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)

        model = RandomForestModel(preprocess_params=param_combination)
        model.train(X_train, y_train)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        result = {
            'params': param_combination,
            'split': 42,
            'accuracy': accuracy,
            'f1_score': f1,
            'timestamp': f'RandomForestModel{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}'
        }

        # Persist each configuration right away so partial progress survives
        # an interrupted run.
        filename = os.path.join(results_dir, f'random_forest_config_{idx}.json')
        with open(filename, 'w') as f:
            json.dump(result, f, indent=4)

        results.append(result)

    except Exception as e:
        # Best-effort sweep: report the failing configuration and move on.
        print(f"An error occurred with parameters {param_combination}: {e}")
        continue

# max() raises ValueError on an empty list, which happens when every
# configuration failed or was skipped -- report that case explicitly instead.
if results:
    best_result = max(results, key=lambda x: x['accuracy'])

    print("Best preprocessing parameters:")
    print(best_result['params'])
    print("Accuracy:", best_result['accuracy'])
    print("F1 Score:", best_result['f1_score'])
else:
    print("No preprocessing configuration produced a result.")


In [None]:
# Show the collected result records as the cell output.
results

In [None]:
# Find the result with the highest accuracy; ``default=None`` avoids the
# ValueError that max() raises when no result was collected.
best_result = max(results, key=lambda x: x['accuracy'], default=None)

if best_result is None:
    print("No preprocessing configuration produced a result.")
else:
    print("Best preprocessing parameters:")
    print(best_result['params'])
    print("Accuracy:", best_result['accuracy'])
    print("F1 Score:", best_result['f1_score'])

In [None]:
# Best preprocessing parameters found so far, hard-coded so the matching
# dataset can be reloaded without repeating the search.
param_combination = {'lfreq': 0.3, 'hfreq': 70, 'notch_filter': [60], 'baseline': (None, None), 'tmin': 0, 'tmax': 0.6}

df = load_df("../../../data/", **param_combination)

# Keep only trials answered as desired from the REAL/FAKE data types. Copy so
# that adding a column does not write into a filtered view of the frame.
df = df.query("desired_answer == answer and data_type in ['REAL', 'FAKE']").copy()
# Blocks 1 and 3 form the positive class, everything else is class 0.
df['label'] = df['block_no'].isin([1, 3]).astype(int)

X, y = extract_X_y_from_df(df)


In [None]:
# Print the first rows of the filtered, labelled DataFrame as a sanity check.
print(df.head())