In [8]:
import pandas as pd
import numpy as np
import time
from sklearn.feature_selection import mutual_info_classif, RFECV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from scipy.stats import chi2_contingency
from scipy.stats import pearsonr
# import warnings
# warnings.filterwarnings("ignore")

# Read the sampled dataset, then split it into the target column and the
# remaining columns, which are all treated as candidate features.
df = pd.read_csv('Data/Sampled_data.csv')
y = df["Borrower Credit Score"]
X = df.drop(columns=["Borrower Credit Score"])

# NOTE(review): a guard that raised on non-numeric feature columns used to sit
# here and is currently disabled. The steps below presumably need numeric
# input -- confirm the CSV is fully encoded before relying on them.

# ---------------- Pearson Correlation ---------------- #
# One (r, p-value) pair per feature, kept in the same order as X.columns.
pearson_stats = [tuple(pearsonr(X[feature], y)) for feature in X.columns]
pearson_r = [r_value for r_value, _ in pearson_stats]
pearson_p = [p_value for _, p_value in pearson_stats]

# ---------------- Filter Method ---------------- #
# Mutual information between each feature and the target (classification
# variant), reported side by side with the Pearson statistics.
mi_scores = mutual_info_classif(X, y, random_state=42)
mi_table = pd.DataFrame({
    'Feature': X.columns,
    'Mutual Information Score': mi_scores,
    'Pearson Correlation (r)': pearson_r,
    'Pearson p-value': pearson_p,
})

# Highest MI first; keep at most the first 100 rows with a fresh 0..n-1 index.
mi_sorted = mi_table.sort_values(by='Mutual Information Score', ascending=False)
top_100_mi_df = mi_sorted.reset_index(drop=True).head(100)

# Persist the ranking and echo it to the cell output.
top_100_mi_df.to_csv("top_100_mi_features.csv", index=False)
print("Top 100 features based on Mutual Information:")
print(top_100_mi_df)

Top 100 features based on Mutual Information:
                     Feature  Mutual Information Score  \
0   Co-Borrower Credit Score                  0.143935   
1                     Income                  0.095773   
2                    TraMedY                  0.088397   
3                        UPB                  0.072888   
4                        LTV                  0.068286   
5                     Amount                  0.066558   
6                       Back                  0.064199   
7                    LocMedY                  0.051771   
8                     NumBor                  0.043966   
9                     MinPer                  0.041722   
10                      Term                  0.036615   
11                       MSA                  0.035803   
12                    BoRace                  0.031529   
13                     Front                  0.031256   
14                      Self                  0.030333   
15                     CoA

In [2]:
# Prepare function to evaluate and save results
def _weighted_specificity(cm):
    """Return the support-weighted one-vs-rest specificity of `cm`, in percent.

    `cm` is a square confusion matrix whose rows are true labels and whose
    columns are predicted labels. For each class, specificity is
    TN / (TN + FP) with every other class treated as the negative. Classes are
    averaged with weights equal to their row totals, mirroring the
    ``average='weighted'`` setting used for precision/recall/F1. Returns 0.0
    for an empty matrix, and a class with no negatives contributes 0.
    """
    cm = np.asarray(cm, dtype=float)
    total = cm.sum()
    if total == 0:
        return 0.0

    true_pos = np.diag(cm)
    support = cm.sum(axis=1)                 # samples that truly belong to each class
    false_pos = cm.sum(axis=0) - true_pos    # predicted as the class but actually another
    negatives = total - support              # TN + FP for each class
    true_neg = negatives - false_pos

    per_class = np.divide(
        true_neg, negatives, out=np.zeros_like(true_neg), where=negatives > 0
    )
    return float(np.sum(per_class * support) / total * 100)


def evaluate_and_save_results(X_selected, y_true, method_name):
    """Train a random forest on a feature subset and write its test metrics to CSV.

    Parameters
    ----------
    X_selected : feature matrix restricted to the columns under evaluation.
    y_true : target labels aligned with `X_selected`.
    method_name : str
        Tag inserted into the output file names.

    Side effects
    ------------
    Writes ``confusion_matrix_{method_name}.csv`` and
    ``metrics_{method_name}.csv`` to the current working directory.
    Returns None.
    """
    # The reported runtime covers everything from here to the end of the
    # metric computation, including the confusion-matrix CSV write.
    start_time = time.time()

    # Hold out 20% of the rows for testing (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y_true, test_size=0.2, random_state=42)

    # Train classifier
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Predict
    y_pred = clf.predict(X_test)

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm)
    cm_df.to_csv(f"confusion_matrix_{method_name}.csv", index=False)

    # Metrics (all expressed as percentages)
    accuracy = accuracy_score(y_test, y_pred) * 100
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0) * 100
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0) * 100
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0) * 100
    # Specificity was previously trace(cm) / sum(cm), which is just accuracy
    # under another name; compute the real TN / (TN + FP) instead.
    specificity = _weighted_specificity(cm)
    runtime = time.time() - start_time

    metrics_df = pd.DataFrame({
        'Metric': ['Accuracy (%)', 'Precision (%)', 'Recall (%)', 'F1-Score (%)', 'Specificity (%)', 'Runtime (s)'],
        'Value': [accuracy, precision, recall, f1, specificity, runtime]
    })
    metrics_df.to_csv(f"metrics_{method_name}.csv", index=False)

# Narrow the feature matrix to the columns kept by the mutual-information ranking
mi_feature_names = top_100_mi_df['Feature'].tolist()
X_top100 = X[mi_feature_names]

# Score the MI-filtered feature set and write its metrics/confusion matrix
evaluate_and_save_results(X_top100, y, method_name="mutual_information")

In [3]:
# ---------------- Wrapper Method (RFECV) ---------------- #

def run_rfecv(model, model_name):
    """Run RFECV with `model`, save the selected features, and evaluate them.

    Relies on the module-level `X_top100` (candidate features) and `y`
    (target) defined earlier in the notebook.

    Parameters
    ----------
    model : estimator exposing ``feature_importances_`` after fitting.
        Used as the RFECV base estimator. It is cloned internally and is
        not fitted or otherwise modified by this function.
    model_name : str
        Tag used in printed messages and in the output file names.

    Side effects
    ------------
    Prints progress and the selected-feature table, writes
    ``selected_features_{model_name}.csv``, and calls
    `evaluate_and_save_results` on the selected columns. Returns None.
    """
    print(f"\nRunning RFECV for {model_name}...")
    # NOTE(review): selection is fitted on every row, including rows that
    # evaluate_and_save_results later holds out as its test split, so the
    # saved metrics may be optimistic -- confirm this is acceptable.
    rfecv = RFECV(estimator=model, step=1, cv=5, scoring='accuracy', n_jobs=-1)
    rfecv.fit(X_top100, y)

    features_selected = X_top100.columns[rfecv.support_]

    # RFECV already fits a clone of `model` on exactly the selected columns
    # and exposes it as `estimator_`; reading the importances from it avoids
    # a redundant second fit and leaves the caller's estimator untouched.
    importance_scores = rfecv.estimator_.feature_importances_

    selected_df = pd.DataFrame({
        'Feature': features_selected,
        'Importance Score': importance_scores
    }).sort_values(by='Importance Score', ascending=False).reset_index(drop=True)

    selected_df.to_csv(f"selected_features_{model_name}.csv", index=False)
    print(f"Selected features using {model_name}:")
    print(selected_df)

    # Evaluate and save results (same frame the selector was fitted on)
    evaluate_and_save_results(X_top100[features_selected], y, model_name)

In [4]:
# Wrapper selection driven by a random forest (100 trees, fixed seed)
run_rfecv(
    model=RandomForestClassifier(n_estimators=100, random_state=42),
    model_name="random_forest",
)


Running RFECV for random_forest...
Selected features using random_forest:
                      Feature  Importance Score
0                      Income          0.142315
1                     TraMedY          0.133827
2                       BoAge          0.111802
3                         UPB          0.111777
4                      Amount          0.110275
5                     LocMedY          0.104171
6                     CurAreY          0.102058
7                      MinPer          0.091897
8  Co-Borrower Credit Score_5          0.091877


In [5]:
# Wrapper selection driven by a single decision tree (fixed seed)
run_rfecv(
    model=DecisionTreeClassifier(random_state=42),
    model_name="decision_tree",
)


Running RFECV for decision_tree...
Selected features using decision_tree:
   Feature  Importance Score
0  TraMedY               1.0


In [6]:

# Wrapper selection driven by extremely randomized trees (100 trees, fixed seed)
run_rfecv(
    model=ExtraTreesClassifier(n_estimators=100, random_state=42),
    model_name="extra_trees",
)


Running RFECV for extra_trees...
Selected features using extra_trees:
                      Feature  Importance Score
0                     TraMedY          0.109598
1                      Income          0.109206
2                       BoAge          0.103824
3                         UPB          0.103169
4                      Amount          0.102642
5                     LocMedY          0.102399
6                     CurAreY          0.102374
7  Co-Borrower Credit Score_5          0.097716
8                      MinPer          0.096309
9                       CoAge          0.072765
