### Imports and Setup

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.decomposition import PCA, KernelPCA
import umap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from models.rf_model import rf_model
import warnings
import os

# suppressing OpenMP warnings that aren't necessary
os.environ["KMP_WARNINGS"] = "0"
warnings.filterwarnings("ignore", message=".*omp_set_nested routine deprecated.*")

# suppressing warnings that aren't critical   
warnings.filterwarnings("ignore", message=".*'force_all_finite' was renamed to 'ensure_all_finite'.*")
warnings.filterwarnings("ignore", message=".*n_jobs value 1 overridden to 1 by setting random_state.*")

### Dummy Data

In [2]:
# Synthetic classification data: far more features (5000) than samples (500),
# of which only 100 are informative and 50 are redundant.
X, y = make_classification(
    n_samples=500,
    n_features=5000,
    n_informative=100,
    n_redundant=50,
    random_state=42,
)

# Wrap the raw array in a DataFrame with one named column per feature.
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
X = pd.DataFrame(X, columns=feature_names)

# split (seeded so the train/test partition is the same on every run)
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

### Encoding, Scaling, and Feature Prep

In [3]:
# Encode the labels: the encoder is fit on the training labels and then
# reused, unchanged, for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Scale the features: the scaler is fit on the training split only, then
# applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature / component counts to sweep: 1, then 10..200 in steps of 10 (official).
feature_counts = [1, *range(10, 201, 10)]
# For a quick run, temporarily use: feature_counts = [1, 100, 200]

# Result stores, keyed by feature count and filled by the evaluation loops.
feature_selection_results = {}
feature_extraction_results = {}

# just chose a random model for now
rf = rf_model()

### Feature Selection Methods

In [4]:
def select_features(X_train, y_train, X_test, method, k=100):
    """Keep the k best-scoring features according to a univariate scorer.

    The selector is fit on the training data only; the same selection is
    then applied to both splits.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels used to score the features.
        X_test: Test feature matrix, reduced with the fitted selector.
        method: Scoring function passed to SelectKBest (e.g. chi2, f_classif).
        k: Number of features to keep.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    fitted = SelectKBest(score_func=method, k=k).fit(X_train, y_train)
    return fitted.transform(X_train), fitted.transform(X_test)

print("Evaluating Feature Selection with:")


def _seeded_mutual_info(X, y):
    """Score features with mutual_info_classif using a fixed seed.

    Unseeded, the estimator's scores (and so the selected features) can
    differ from one run to the next, which makes the AUC values below
    irreproducible.
    """
    return mutual_info_classif(X, y, random_state=42)


for k in feature_counts:
    print(f"{k} features")

    # selecting the features
    X_train_chi2, X_test_chi2 = select_features(X_train_scaled, y_train, X_test_scaled, chi2, k)
    X_train_mi, X_test_mi = select_features(X_train_scaled, y_train, X_test_scaled, _seeded_mutual_info, k)
    X_train_anova, X_test_anova = select_features(X_train_scaled, y_train, X_test_scaled, f_classif, k)

    # display name -> (train, test) matrices reduced by that method
    selected = {
        "Chi-Squared": (X_train_chi2, X_test_chi2),
        "Mutual Information": (X_train_mi, X_test_mi),
        "ANOVA F-value": (X_train_anova, X_test_anova),
    }

    # training the model on each reduced training set and scoring ROC AUC
    # from the positive-class probabilities on the matching test set
    auc_by_method = {}
    for method_name, (X_train_k, X_test_k) in selected.items():
        rf.train(X_train_k, y_train)
        pred_probs = rf.rf.predict_proba(X_test_k)[:, 1]
        auc_by_method[method_name] = roc_auc_score(y_test, pred_probs)

    # storing results
    feature_selection_results[k] = auc_by_method

Evaluating Feature Selection with:
1 features
10 features
20 features
30 features
40 features
50 features
60 features
70 features
80 features
90 features
100 features
110 features
120 features
130 features
140 features
150 features
160 features
170 features
180 features
190 features
200 features


### Feature Extraction Methods

In [5]:
def apply_pca(X_train, X_test, n_components, random_state=42):
    """Project both splits onto principal components fit on the training split.

    Args:
        X_train: Training feature matrix; the PCA is fit on this data only.
        X_test: Test feature matrix, projected with the fitted PCA.
        n_components: Number of principal components to keep.
        random_state: Seed forwarded to PCA. scikit-learn's automatic solver
            choice can use a randomized SVD on large inputs, in which case
            an unseeded fit is not reproducible between runs.

    Returns:
        Tuple ``(X_train_pca, X_test_pca)``.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca
    
print("Evaluating Feature Extraction with:")

for n in feature_counts:
    print(f"{n} components")

    # application of extraction models (each is fit on the training split only)
    X_train_pca, X_test_pca = apply_pca(X_train_scaled, X_test_scaled, n)

    # Seeded like UMAP below: KernelPCA documents random_state as used by its
    # ARPACK / randomized eigen-solvers, so an unseeded fit may vary per run.
    kpca = KernelPCA(n_components=n, kernel='rbf', random_state=42)
    X_train_kpca = kpca.fit_transform(X_train_scaled)
    X_test_kpca = kpca.transform(X_test_scaled)

    umap_reducer = umap.UMAP(n_components=n, random_state=42)
    X_train_umap = umap_reducer.fit_transform(X_train_scaled)
    X_test_umap = umap_reducer.transform(X_test_scaled)

    # display name -> (train, test) embeddings produced by that method
    embeddings = {
        "PCA": (X_train_pca, X_test_pca),
        "Kernel PCA": (X_train_kpca, X_test_kpca),
        "UMAP": (X_train_umap, X_test_umap),
    }

    # training the model on each embedding and scoring ROC AUC from the
    # positive-class probabilities on the matching test embedding
    auc_by_method = {}
    for method_name, (X_train_n, X_test_n) in embeddings.items():
        rf.train(X_train_n, y_train)
        pred_probs = rf.rf.predict_proba(X_test_n)[:, 1]
        auc_by_method[method_name] = roc_auc_score(y_test, pred_probs)

    # storing results
    feature_extraction_results[n] = auc_by_method

Evaluating Feature Extraction with:
1 components
10 components
20 components
30 components
40 components
50 components
60 components
70 components
80 components
90 components
100 components
110 components
120 components
130 components
140 components
150 components
160 components
170 components
180 components
190 components
200 components


### Printing Results

In [6]:
# Report the stored scores: one group per feature count, a line per method,
# and a blank line after each group.
print("Feature Selection Results:")
for k, results in feature_selection_results.items():
    group_lines = [f"k={k}"]
    group_lines.extend(f" {method}: {num:.5f}" for method, num in results.items())
    print("\n".join(group_lines), end="\n\n")

print("\nFeature Extraction Results:")
for n, results in feature_extraction_results.items():
    group_lines = [f"n={n}"]
    group_lines.extend(f" {method}: {num:.5f}" for method, num in results.items())
    print("\n".join(group_lines), end="\n\n")

Feature Selection Results:
k=1
 Chi-Squared: 0.62848
 Mutual Information: 0.49111
 ANOVA F-value: 0.59434

k=10
 Chi-Squared: 0.68121
 Mutual Information: 0.55293
 ANOVA F-value: 0.59596

k=20
 Chi-Squared: 0.61616
 Mutual Information: 0.53232
 ANOVA F-value: 0.78242

k=30
 Chi-Squared: 0.62788
 Mutual Information: 0.55960
 ANOVA F-value: 0.72505

k=40
 Chi-Squared: 0.65838
 Mutual Information: 0.49596
 ANOVA F-value: 0.76485

k=50
 Chi-Squared: 0.52202
 Mutual Information: 0.60566
 ANOVA F-value: 0.62242

k=60
 Chi-Squared: 0.64040
 Mutual Information: 0.65071
 ANOVA F-value: 0.63051

k=70
 Chi-Squared: 0.69071
 Mutual Information: 0.57475
 ANOVA F-value: 0.60020

k=80
 Chi-Squared: 0.64869
 Mutual Information: 0.51677
 ANOVA F-value: 0.59455

k=90
 Chi-Squared: 0.66263
 Mutual Information: 0.64283
 ANOVA F-value: 0.64424

k=100
 Chi-Squared: 0.70101
 Mutual Information: 0.51212
 ANOVA F-value: 0.71616

k=110
 Chi-Squared: 0.69414
 Mutual Information: 0.52000
 ANOVA F-value: 0.53414

