### Imports and Setup

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.decomposition import PCA, KernelPCA
import umap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from utils.models import model_mapping
from IPython.display import display, Markdown
import warnings
import os

# mute OpenMP warnings that aren't necessary for this analysis
os.environ["KMP_WARNINGS"] = "0"

# message patterns of warnings that aren't critical; each one is ignored below
_ignored_warning_patterns = (
    ".*omp_set_nested routine deprecated.*",
    ".*'force_all_finite' was renamed to 'ensure_all_finite'.*",
    ".*n_jobs value 1 overridden to 1 by setting random_state.*",
)
for _pattern in _ignored_warning_patterns:
    warnings.filterwarnings("ignore", message=_pattern)

### Loading Best Model

In [2]:
# read the first line of the file and drop the "Best Model: " label from it
with open("./data/best_model.txt", "r") as file:
    first_line = file.readline()
best_model_name = first_line.strip().replace("Best Model: ", "")

# guard clause: stop right away when the stored name has no entry in model_mapping
if best_model_name not in model_mapping:
    raise ValueError(f"Model '{best_model_name}' not found in model_mapping. Please check best_model.txt.")

# model_mapping values are called with no arguments to build the model object
optimal_model = model_mapping[best_model_name]()
display(Markdown(f"Using **{best_model_name}** as the best model for feature evaluation."))

Using **Non-linear SVM** as the best model for feature evaluation.

### Dummy Data

In [3]:
# synthetic classification data: 500 samples x 5000 features, seeded for repeatability
raw_features, y = make_classification(
    n_samples=500,
    n_features=5000,
    n_informative=100,
    n_redundant=50,
    random_state=42,
)
X = pd.DataFrame(
    raw_features,
    columns=[f'feature_{i}' for i in range(raw_features.shape[1])],
)

# hold out a fraction of the rows (test_size) for evaluation
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

### Encoding, Scaling, and Feature Prep

In [4]:
# the label encoder learns its classes from the training labels only
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

# the scaler is fitted on the training split and then applied to both splits
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# numbers of features / components to evaluate: 1, then 10..200 in steps of 10
feature_counts = [1, *range(10, 201, 10)]  # official
#feature_counts = [1, 100, 200] # for temporary convenience

# AUC scores keyed by feature count, filled in by the evaluation loops
feature_selection_results, feature_extraction_results = {}, {}

best_model = optimal_model

### Feature Selection Methods

In [5]:
def select_features(X_train, y_train, X_test, method, k=100):
    """Reduce both splits to the ``k`` features chosen by ``SelectKBest``.

    The selector is fitted on the training data only and then applied to the
    training and test matrices alike.

    Parameters:
        X_train: training feature matrix.
        y_train: training labels used to score the features.
        X_test: test feature matrix, reduced to the same columns as X_train.
        method: scoring function handed to ``SelectKBest``.
        k: number of features to keep (default 100).

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    fitted_selector = SelectKBest(score_func=method, k=k).fit(X_train, y_train)
    return fitted_selector.transform(X_train), fitted_selector.transform(X_test)

def _seeded_mutual_info(X, y):
    """Mutual-information scorer with a fixed seed.

    mutual_info_classif draws random noise internally, so without a seed the
    selected features (and the resulting AUC) change from run to run.
    """
    return mutual_info_classif(X, y, random_state=42)


print("Evaluating Feature Selection with:")

for k in feature_counts:
    print(f"{k} features")

    # selecting the features (each selector is fitted on the training split only)
    X_train_chi2, X_test_chi2 = select_features(X_train_scaled, y_train, X_test_scaled, chi2, k)
    X_train_mi, X_test_mi = select_features(X_train_scaled, y_train, X_test_scaled, _seeded_mutual_info, k)
    X_train_anova, X_test_anova = select_features(X_train_scaled, y_train, X_test_scaled, f_classif, k)

    # training the model; the same best_model object is re-trained for each
    # feature subset, and column 1 of predict_proba is used as the score
    best_model.train(X_train_chi2, y_train)
    chi2_pred_probs = best_model.model.predict_proba(X_test_chi2)[:, 1]

    best_model.train(X_train_mi, y_train)
    mi_pred_probs = best_model.model.predict_proba(X_test_mi)[:, 1]

    best_model.train(X_train_anova, y_train)
    anova_pred_probs = best_model.model.predict_proba(X_test_anova)[:, 1]

    # storing results: test ROC AUC per selection method for this k
    feature_selection_results[k] = {
        "Chi-Squared": roc_auc_score(y_test, chi2_pred_probs),
        "Mutual Information": roc_auc_score(y_test, mi_pred_probs),
        "ANOVA F-value": roc_auc_score(y_test, anova_pred_probs)
    }

Evaluating Feature Selection with:
1 features
10 features
20 features
30 features
40 features
50 features
60 features
70 features
80 features
90 features
100 features
110 features
120 features
130 features
140 features
150 features
160 features
170 features
180 features
190 features
200 features


### Feature Extraction Methods

In [6]:
def apply_pca(X_train, X_test, n_components, random_state=42):
    """Project both splits onto principal components fitted on the training split.

    Parameters:
        X_train: training feature matrix; the PCA is fitted on this data only.
        X_test: test feature matrix, projected with the already-fitted PCA.
        n_components: number of principal components to keep.
        random_state: seed passed to ``PCA`` so that solvers which use
            randomness give the same components on every run (default 42).

    Returns:
        Tuple ``(X_train_pca, X_test_pca)``.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca
    
print("Evaluating Feature Extraction with:")

# first attribute of best_model (in dir() order) that exposes predict_proba
# NOTE(review): model_predictor is not used by any visible cell - the loop below
# calls best_model.model.predict_proba directly. Kept in case a later cell relies
# on it; consider deleting it otherwise.
model_predictor = next(
    getattr(best_model, attr) for attr in dir(best_model)
    if hasattr(getattr(best_model, attr), 'predict_proba')
)

for n in feature_counts:
    print(f"{n} components")

    # application of extraction models (each one is fitted on the training split only)
    X_train_pca, X_test_pca = apply_pca(X_train_scaled, X_test_scaled, n)

    # seeded so that solver paths which rely on random initialisation are repeatable
    kpca = KernelPCA(n_components=n, kernel='rbf', random_state=42)
    X_train_kpca = kpca.fit_transform(X_train_scaled)
    X_test_kpca = kpca.transform(X_test_scaled)

    umap_reducer = umap.UMAP(n_components=n, random_state=42)
    X_train_umap = umap_reducer.fit_transform(X_train_scaled)
    X_test_umap = umap_reducer.transform(X_test_scaled)

    # training the model; the same best_model object is re-trained for each
    # representation, and column 1 of predict_proba is used as the score
    best_model.train(X_train_pca, y_train)
    pca_pred_probs = best_model.model.predict_proba(X_test_pca)[:, 1]

    best_model.train(X_train_kpca, y_train)
    kpca_pred_probs = best_model.model.predict_proba(X_test_kpca)[:, 1]

    best_model.train(X_train_umap, y_train)
    umap_pred_probs = best_model.model.predict_proba(X_test_umap)[:, 1]

    # storing results: test ROC AUC per extraction method for this n
    feature_extraction_results[n] = {
        "PCA": roc_auc_score(y_test, pca_pred_probs),
        "Kernel PCA": roc_auc_score(y_test, kpca_pred_probs),
        "UMAP": roc_auc_score(y_test, umap_pred_probs)
    }

Evaluating Feature Extraction with:
1 components
10 components
20 components
30 components
40 components
50 components
60 components
70 components
80 components
90 components
100 components
110 components
120 components
130 components
140 components
150 components
160 components
170 components
180 components
190 components
200 components


### Printing Results

In [7]:
# one row per feature count, one column per method; the index is labelled for display
df_feature_selection = pd.DataFrame.from_dict(
    feature_selection_results, orient='index'
).rename_axis("# of Selected Features")
df_feature_extraction = pd.DataFrame.from_dict(
    feature_extraction_results, orient='index'
).rename_axis("# of Components")

# each table is shown under its own markdown heading
display(Markdown("#### Feature Selection Results:"), df_feature_selection)
display(Markdown("#### Feature Extraction Results:"), df_feature_extraction)

#### Feature Selection Results:

Unnamed: 0_level_0,Chi-Squared,Mutual Information,ANOVA F-value
# of Selected Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.642424,0.475152,0.641818
10,0.760808,0.486263,0.727273
20,0.699798,0.563636,0.841616
30,0.690707,0.55596,0.821616
40,0.702222,0.629899,0.818182
50,0.688081,0.668687,0.779798
60,0.745051,0.684848,0.758788
70,0.724444,0.675152,0.758788
80,0.72202,0.650505,0.741414
90,0.72,0.605455,0.753131


#### Feature Extraction Results:

Unnamed: 0_level_0,PCA,Kernel PCA,UMAP
# of Components,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.571717,0.444444,0.499394
10,0.665253,0.60202,0.525051
20,0.552323,0.589899,0.560404
30,0.58101,0.622626,0.516768
40,0.569697,0.587879,0.472121
50,0.572121,0.617374,0.557374
60,0.53697,0.585455,0.552929
70,0.573535,0.612929,0.528889
80,0.631919,0.614545,0.570101
90,0.623838,0.601616,0.529697
