## Load libraries and define file paths

In [None]:
import os

from modules.selection import run_feature_selection
from modules.training import retrain_on_reduced_features

In [None]:
# Directory paths for processed train/test data, trained models,
# feature-selection outputs, and metrics
train_dir = '../data/processed/train'
test_dir = '../data/processed/test'
models_dir = '../models'
save_dir = '../data/other-feature-selection'
metrics_dir = '../metrics'

# Derive dataset names from the entries of the models directory.
# os.listdir returns every entry in arbitrary order, including hidden ones
# (e.g. .DS_Store, .ipynb_checkpoints, .gitkeep), so skip dot-entries to avoid
# treating them as datasets, and sort for a reproducible processing order.
dataset_names = sorted(
    entry for entry in os.listdir(models_dir) if not entry.startswith('.')
)

## Create reduced datasets

- Mutual Information Gain (scikit-learn library)
- ReliefF (ReliefF library)
- Minimum Redundancy, Maximum Relevance [mRMR] (Feature-engine library)
- Fast Correlation-Based Filter [FCBF] (fcbf library)

In [None]:
# Fractions of the original feature count to retain; only used by the
# selection methods that take a target feature count
k_percentages = [
    0.05,
    0.1,
    0.15,
    0.2,
]

In [None]:
# Apply the feature-selection methods to every dataset at each requested
# feature percentage. Train/test inputs are read from train_dir/test_dir;
# presumably the reduced datasets land under save_dir and any metrics under
# metrics_dir -- confirm against modules.selection.
run_feature_selection(
    dataset_names=dataset_names,
    k_percentages=k_percentages,
    train_dir=train_dir,
    test_dir=test_dir,
    save_dir=save_dir,
    metrics_dir=metrics_dir,
)

## Retrain models on reduced datasets

In [None]:
# Directory containing the feature-reduced training datasets
reduced_train_dir = '../data/other-feature-selection/train'

# Collect the reduced training dataset files found in that directory.
# os.listdir returns entries in arbitrary order and includes hidden files,
# so skip dot-entries and sort to keep the retraining order reproducible
# (dataset_names is built from a sorted listing as well).
reduced_datasets = sorted(
    entry for entry in os.listdir(reduced_train_dir) if not entry.startswith('.')
)

# Model types to iterate over during retraining
model_types = ['logreg', 'dt', 'rf', 'xgb', 'svc']

In [None]:
# Retrain every model type on the feature-reduced training datasets.
# NOTE(review): selection_parse_mode="train" presumably tells the helper how
# to parse the reduced training file names -- confirm in modules.training.
retrain_on_reduced_features(
    dataset_names=dataset_names,
    model_types=model_types,
    reduced_datasets=reduced_datasets,
    models_dir=models_dir,
    reduced_train_dir=reduced_train_dir,
    selection_parse_mode="train",
)