## Load modules and set iterables

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.base import clone

from modules.utils import load_object, load_dataset
from modules.selection import feature_select, create_feature_selected_dataset
from modules.training import fit_model

In [None]:
# Directory layout (all paths are relative to the notebook's working directory)
train_dir = '../data/processed/train'        # processed training splits
test_dir = '../data/processed/test'          # processed test splits
feature_select_dir = '../data/reduced'       # root directory for feature-selected datasets
models_dir = '../models'                     # one sub-directory per dataset
explanations_dir = '../shap-explanations'    # SHAP explanation pickles, grouped per dataset

# Dataset names are derived from the sub-directories of `models_dir`.
# Only real, non-hidden directories are kept so that stray entries
# (.gitkeep, .DS_Store, .ipynb_checkpoints, ...) are not mistaken for datasets.
# Sorting makes the processing order deterministic.
dataset_names = sorted(
    name for name in os.listdir(models_dir)
    if not name.startswith('.') and os.path.isdir(os.path.join(models_dir, name))
)

# Feature selection aggregation strategies (passed as `kind` to feature_select)
strategies = ['sum', 'max']

# Thresholds swept for the 'sum' strategy (passed as `sum_threshold`).
# Per the original notes these are cumulative SHAP importance thresholds;
# the exact semantics live in modules.selection.feature_select.
sum_thresholds = [0.7, 0.8, 0.9, 0.95]

# Thresholds swept for the 'max' strategy (passed as `max_threshold`).
# Per the original notes these are minimum SHAP value cutoffs, presumably
# relative to the largest SHAP value -- confirm in modules.selection.feature_select.
max_thresholds = [0.01, 0.05, 0.1, 0.15]

# Model type tags to iterate over; they appear verbatim in model and
# explanation file names (e.g. '<dataset>-<model_type>-full.pickle.xz')
model_types = ['logreg', 'dt', 'rf', 'xgb', 'svc']

## Generate reduced datasets

In [None]:
# Generate feature-selected datasets for every
# dataset / model type / strategy / threshold combination.

# Thresholds to sweep for each selection strategy. The strategy name doubles as
# the `kind` argument of feature_select, and '<strategy>_threshold' is the name
# of the keyword argument that carries the threshold.
thresholds_by_strategy = {
    'sum': sum_thresholds,
    'max': max_thresholds,
}

for dataset in dataset_names:
    # File names use underscores where the directory name uses hyphens
    set_name_snake = dataset.replace('-', '_')

    print(f"[INFO] Processing dataset: {dataset}")

    # Load processed train/test splits (once per dataset, reused for all models)
    train = load_dataset(os.path.join(train_dir, f'{set_name_snake}-train.csv.gz'))
    test = load_dataset(os.path.join(test_dir, f'{set_name_snake}-test.csv.gz'))
    print(f"[INFO] Loaded training and testing data (train shape: {train.shape}, test shape: {test.shape})")

    # Iterate through model types (e.g., rf, xgb, etc.)
    for model_type in model_types:
        print(f"[INFO] Model type: {model_type}")

        # Build the path outside the `try` so it is always defined when the
        # error message below needs it.
        shap_path = os.path.join(
            explanations_dir,
            dataset,
            f'{set_name_snake}-{model_type}-sampling-global.pickle.xz'
        )
        try:
            # Load SHAP explanations for this dataset-model combination
            shap_values = load_object(shap_path)['mean_shap']
        except Exception as exc:
            # Best effort: a missing or unreadable explanation only skips this
            # model type. `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt / SystemExit still stop the notebook, and the
            # cause is reported instead of being hidden.
            print(f"[ERROR] Unable to load SHAP values from: {shap_path} ({type(exc).__name__}: {exc})")
            continue

        print(f"[INFO] Loaded SHAP values from: {shap_path}")
        print(f"[INFO] Number of SHAP values: {len(shap_values)}")

        # Loop through feature selection strategies
        for strategy in strategies:
            print(f"[INFO] Strategy: {strategy}")

            thresholds = thresholds_by_strategy.get(strategy)
            if thresholds is None:
                print(f"[WARN] Unknown strategy '{strategy}'; skipping.")
                continue

            for threshold in thresholds:
                print(f"[INFO] Applying '{strategy}' strategy with threshold = {threshold:.2f}")

                # Select features; equivalent to
                #   feature_select(shap_values=..., kind='sum', sum_threshold=t)
                #   feature_select(shap_values=..., kind='max', max_threshold=t)
                # NOTE: the original code hinted at an optional `min_strength`
                # argument (e.g. 0.01) for pruning low-value features with the
                # 'sum' strategy; it was disabled and remains unused here.
                idx = feature_select(
                    shap_values=shap_values,
                    kind=strategy,
                    **{f'{strategy}_threshold': threshold}
                )
                # NOTE(review): the "- 1" presumably discounts a non-feature
                # entry in `idx` (e.g. the target column) -- confirm in
                # modules.selection.
                print(f"[INFO] Selected {len(idx) - 1} features.")

                # Save reduced train/test datasets under `feature_select_dir`
                create_feature_selected_dataset(
                    idx=idx,
                    train=train,
                    test=test,
                    root_dir=feature_select_dir,
                    dataset_name=set_name_snake,
                    model_type=model_type,
                    selection_strategy=strategy,
                    selection_threshold=threshold
                )
                print(f"[DONE] Saved reduced datasets for '{strategy}' strategy ({threshold:.2f}).\n")

    print(f"[DONE] Completed processing for dataset: {dataset}\n")

## Retrain models on reduced datasets

In [None]:
# Directory containing the feature-reduced training datasets. Derived from
# `feature_select_dir` (the root the reduced datasets were written under) so the
# two locations cannot drift apart; this resolves to '../data/reduced/train'.
reduced_train_dir = os.path.join(feature_select_dir, 'train')

# List all reduced training dataset files available in the directory.
# os.listdir returns entries in arbitrary order, so sort them to make the
# retraining order (and its log output) reproducible.
reduced_datasets = sorted(os.listdir(reduced_train_dir))

In [None]:
# Retrain models on reduced feature subsets for each dataset and model type
for dataset in dataset_names:
    # File names use underscores where the directory name uses hyphens
    set_name_snake = dataset.replace('-', '_')
    print(f'\n[INFO] Processing dataset: {dataset}')

    # Loop through all model types
    for model_type in model_types:
        # Select only the reduced datasets belonging to this dataset and model
        # type. Matching is done on whole '-'-separated fields of the file name
        # rather than on substrings: a substring test would also pick up files
        # of other datasets whose name merely contains this one (or contains a
        # short model tag such as 'dt' / 'rf'), and their retrained models
        # would then overwrite each other under the same save path.
        # Assumes the '<dataset>-<model_type>-<selection>...' layout used by the
        # other artefacts in this project (the snake_case dataset name itself
        # contains no hyphens).
        wanted_fields = {set_name_snake, model_type}
        filtered_reduced_datasets = [
            ds for ds in reduced_datasets
            if wanted_fields <= set(ds.split('-'))
        ]
        print(f'[INFO] Found {len(filtered_reduced_datasets)} reduced datasets for {model_type}.')

        # Nothing to retrain: skip before loading the full model, which may be
        # large or absent for combinations that were skipped earlier.
        if not filtered_reduced_datasets:
            continue

        print(f'[INFO] Loading full {model_type} model for {dataset}...')
        model = load_object(
            os.path.join(models_dir, dataset, f'{set_name_snake}-{model_type}-full.pickle.xz')
        )

        # Train the model on each reduced dataset
        for reduced_dataset in filtered_reduced_datasets:
            print(f'[INFO] Loading reduced dataset: {reduced_dataset}')
            reduced_train = load_dataset(os.path.join(reduced_train_dir, reduced_dataset))
            # The last column is used as the target, all preceding columns as features
            X_train = reduced_train.iloc[:, :-1]
            y_train = reduced_train.iloc[:, -1]

            # Extract the feature selection tag from the file name: everything
            # after '<model_type>-' with the '.csv.gz' extension stripped
            selection_type = reduced_dataset.split('.csv.gz')[0].split(f'{model_type}-')[-1]

            # Save path for the retrained model, next to the full model.
            # No extension is given here -- presumably fit_model appends one
            # based on `compression`; confirm in modules.training.
            save_path = os.path.join(models_dir, dataset, f'{set_name_snake}-{model_type}-{selection_type}')

            # Retrain and save model without grid search using LZMA compression.
            # clone() yields a fresh, unfitted estimator with the full model's
            # parameters, so the loaded model itself is never refitted.
            print(f'[INFO] Retraining {model_type} with selection method: {selection_type}')
            fit_model(
                X_train=X_train,
                y_train=y_train,
                model_name='',
                model=clone(model),
                grid_search=False,
                save=True,
                save_path=save_path,
                compression='lzma'
            )

            print(f'[DONE] {model_type} ({selection_type}) retrained and saved for {dataset}.\n')

    print(f'[DONE] Completed retraining for all model types on {dataset}.\n')