In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, f_oneway, ttest_ind
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

def compute_p_value(feature, X, y, categorical_features):
    """Return the p-value of a univariate test between one feature and the target.

    Features named in ``categorical_features`` are tested with a chi-squared
    test on the feature-by-target contingency table. Every other feature is
    treated as continuous: Welch's t-test when the target has exactly two
    observed classes, one-way ANOVA across all classes when it has more.

    Parameters
    ----------
    feature : str
        Column of ``X`` to test.
    X : pandas.DataFrame
        Feature matrix; must share its index with ``y``.
    y : pandas.Series
        Target labels.
    categorical_features : collection of str
        Names of the columns to treat as categorical.

    Returns
    -------
    float
        The p-value, or ``np.nan`` when the test cannot be computed (fewer
        than two observed classes, degenerate contingency table, or any error
        raised while running the continuous test).
    """
    if feature in categorical_features:
        # Chi-squared test for categorical features
        contingency_table = pd.crosstab(X[feature], y)
        try:
            _, p, _, _ = chi2_contingency(contingency_table)
        except ValueError:
            p = np.nan
        return p

    # Missing labels are not a class: `y == nan` never matches any row, so
    # keeping them would only produce an empty group.
    classes = y.dropna().unique()
    if len(classes) < 2:
        return np.nan

    try:
        groups = [X.loc[y == cls, feature] for cls in classes]
        if len(groups) == 2:
            # Welch's t-test for a binary target (unequal variances allowed).
            _, p = ttest_ind(groups[0], groups[1], equal_var=False, nan_policy='omit')
        else:
            # More than two classes: compare all of them at once instead of
            # only the first two labels that happen to appear in `y`.
            groups = [g.dropna() for g in groups]
            groups = [g for g in groups if len(g) > 0]
            if len(groups) < 2:
                return np.nan
            _, p = f_oneway(*groups)
    except Exception:
        # Best effort: a feature that cannot be tested gets NaN rather than
        # aborting the whole importance table.
        p = np.nan
    return p


def _rfecv_importances(estimator, n_selected):
    """Return one non-negative importance score per selected feature."""
    try:
        return np.asarray(estimator.feature_importances_)
    except AttributeError:
        pass
    try:
        coef = estimator.coef_
    except AttributeError:
        # Estimator exposes neither attribute: report zeros.
        return np.zeros(n_selected)
    if hasattr(coef, "toarray"):
        coef = coef.toarray()
    # coef_ may be (n_features,) or (n_classes, n_features); summing |coef|
    # over classes always leaves exactly one value per feature.
    return np.abs(np.atleast_2d(coef)).sum(axis=0)


def run_rfecv(df, target_col, model, X, y, scoring='accuracy', categorical_candidates=None):
    """Run RFECV for one estimator and save its importances and CV curve.

    Fits ``RFECV`` (step 1, 5-fold ``StratifiedKFold``) on ``X``/``y``, then
    writes two files into the directory ``"{target_col}_RFECV"`` (created if
    missing):

    * ``{ModelName}_rfecv_importances.csv`` - selected features with their
      importance and a univariate p-value from ``compute_p_value``.
    * ``{ModelName}_rfecv_plot.png`` - mean CV score per number of features.

    Parameters
    ----------
    df : object
        Not used by this function; kept so existing calls keep working.
    target_col : str
        Used to name the output directory.
    model : estimator
        Estimator handed to ``RFECV``; its class name prefixes the output files.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target labels.
    scoring : str, default 'accuracy'
        Passed to ``RFECV`` and shown on the plot's y-axis label.
    categorical_candidates : list of str, optional
        Column names to treat as categorical when computing p-values. Only
        those present in ``X`` are used. Defaults to the built-in list below.

    Returns
    -------
    None
    """
    results_dir = f"{target_col}_RFECV"
    os.makedirs(results_dir, exist_ok=True)

    model_name = type(model).__name__

    if categorical_candidates is None:
        categorical_candidates = [
            "FHLBankID", "FIPSStateCode", "FIPSCountyCode", "MSA", "Purpose", "NumBor",
            "CoRace", "BoGender", "CoGender", "Borrower Credit Score",
            "Co-Borrower Credit Score", "Self"
        ]
    categorical_features = [col for col in categorical_candidates if col in X.columns]

    selector = RFECV(
        estimator=model,
        step=1,
        cv=StratifiedKFold(5),
        scoring=scoring,
        n_jobs=-1
    )
    selector.fit(X, y)

    selected_features = X.columns[selector.support_]
    importances = _rfecv_importances(selector.estimator_, len(selected_features))

    importance_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # Slice once; the lambda below runs once per selected feature.
    X_selected = X[selected_features]
    importance_df['p-value'] = importance_df['Feature'].apply(
        lambda f: compute_p_value(f, X_selected, y, categorical_features))

    importance_csv_path = os.path.join(results_dir, f"{model_name}_rfecv_importances.csv")
    importance_df.to_csv(importance_csv_path, index=False)

    mean_scores = selector.cv_results_['mean_test_score']
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(range(1, len(mean_scores) + 1), mean_scores, marker='o')
    ax.set_xlabel("Number of Selected Features")
    ax.set_ylabel(f"CV Score ({scoring})")
    ax.set_title(f"RFECV - {model_name}")
    ax.grid(True)
    fig.tight_layout()
    plot_path = os.path.join(results_dir, f"{model_name}_rfecv_plot.png")
    fig.savefig(plot_path)
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)

    print(f"{model_name}: Best {selector.n_features_} features selected. Results saved to {importance_csv_path}")


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier

# Estimators passed one at a time to run_rfecv; each is seeded with 42.
models = [
    RandomForestClassifier(random_state=42, n_estimators=100),
    GradientBoostingClassifier(random_state=42, n_estimators=100),
    CatBoostClassifier(random_state=42, verbose=0),
]

In [None]:
# Target: BoGender. Both gender columns are removed from the feature matrix.
df = pd.read_csv("Samples/BoGender.csv")
y = df["BoGender"]
X = df.drop(["BoGender", "CoGender"], axis=1)
for model in models:
    run_rfecv(df, "BoGender", model, X=X, y=y)

RandomForestClassifier: Best 19 features selected. Results saved to BoGender_RFECV\RandomForestClassifier_rfecv_importances.csv
GradientBoostingClassifier: Best 8 features selected. Results saved to BoGender_RFECV\GradientBoostingClassifier_rfecv_importances.csv
CatBoostClassifier: Best 28 features selected. Results saved to BoGender_RFECV\CatBoostClassifier_rfecv_importances.csv
RandomForestClassifier: Best 27 features selected. Results saved to BoRace_RFECV\RandomForestClassifier_rfecv_importances.csv
GradientBoostingClassifier: Best 26 features selected. Results saved to BoRace_RFECV\GradientBoostingClassifier_rfecv_importances.csv


KeyboardInterrupt: 

In [None]:

# ==========================================
# Target: BoRace. Both race columns are removed from the feature matrix.
df = pd.read_csv("Samples/BoRace.csv")
y = df["BoRace"]
X = df.drop(["BoRace", "CoRace"], axis=1)
for model in models:
    run_rfecv(df, "BoRace", model, X=X, y=y)
# ==========================================

# ==========================================

RandomForestClassifier: Best 27 features selected. Results saved to BoRace_RFECV\RandomForestClassifier_rfecv_importances.csv
GradientBoostingClassifier: Best 26 features selected. Results saved to BoRace_RFECV\GradientBoostingClassifier_rfecv_importances.csv
CatBoostClassifier: Best 24 features selected. Results saved to BoRace_RFECV\CatBoostClassifier_rfecv_importances.csv


In [None]:
# ==========================================
# Target: CoGender. Both gender columns are removed from the feature matrix.
df = pd.read_csv("Samples/CoGender.csv")
y = df["CoGender"]
X = df.drop(["BoGender", "CoGender"], axis=1)
for model in models:
    run_rfecv(df, "CoGender", model, X=X, y=y)
# ==========================================

# ==========================================

RandomForestClassifier: Best 9 features selected. Results saved to CoGender_RFECV\RandomForestClassifier_rfecv_importances.csv
GradientBoostingClassifier: Best 16 features selected. Results saved to CoGender_RFECV\GradientBoostingClassifier_rfecv_importances.csv
CatBoostClassifier: Best 24 features selected. Results saved to CoGender_RFECV\CatBoostClassifier_rfecv_importances.csv


In [None]:

# ==========================================
# Target: CoRace. Both race columns are removed from the feature matrix.
df = pd.read_csv("Samples/CoRace.csv")
y = df["CoRace"]
X = df.drop(["BoRace", "CoRace"], axis=1)
for model in models:
    run_rfecv(df, "CoRace", model, X=X, y=y)
# ==========================================

RandomForestClassifier: Best 28 features selected. Results saved to CoRace_RFECV\RandomForestClassifier_rfecv_importances.csv
GradientBoostingClassifier: Best 27 features selected. Results saved to CoRace_RFECV\GradientBoostingClassifier_rfecv_importances.csv
CatBoostClassifier: Best 27 features selected. Results saved to CoRace_RFECV\CatBoostClassifier_rfecv_importances.csv
