In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


# Load the training and validation splits from CSV.
train = pd.read_csv("/content/HW4-Train.csv")
valid = pd.read_csv("/content/HW4-Validation.csv")

# Example preprocessing: drop every row that contains at least one missing value.
# NOTE(review): this also removes rows from the validation set, so the metrics
# computed later describe only the complete validation rows.
train = train.dropna()
valid = valid.dropna()

# Split each frame into the target column and the predictor columns.
y_train = train['Default_ind']
X_train = train.drop(['Default_ind'], axis=1)

y_valid = valid['Default_ind']
X_valid = valid.drop(['Default_ind'], axis=1)

# One-hot encode the training predictors; the first level of every categorical
# column is dropped and acts as the reference level. The training frame alone
# defines the final feature space.
X_train_dum = pd.get_dummies(X_train, drop_first=True)

# Encode validation WITHOUT drop_first, then project onto the training columns.
# drop_first removes the first level *observed in the frame being encoded*; if
# validation happened to lack the training reference level, a different level
# would be dropped there and, after zero-filling, its rows would be silently
# encoded as the reference level. Reindexing to the training columns drops the
# reference-level and unseen-level dummies and zero-fills any column that is
# absent from validation.
X_valid_dum = pd.get_dummies(X_valid).reindex(columns=X_train_dum.columns, fill_value=0)

# Standardize using statistics fitted on the training features only, then apply
# the same transform to validation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_dum)
X_valid_scaled = scaler.transform(X_valid_dum)

# Sanity check: both matrices should report the same number of columns.
print("X_train_dum shape:", X_train_dum.shape)
print("X_valid_dum shape:", X_valid_dum.shape)


X_train_dum shape: (16559, 25)
X_valid_dum shape: (2473, 25)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

# Fit a logistic-regression model on the standardized training features
# (fit() returns the estimator itself, so construction and fitting are chained).
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Validation outputs: column 1 of predict_proba (the second entry of
# lr.classes_; presumably the default class -- confirm the label coding)
# and the hard class predictions.
y_valid_prob = lr.predict_proba(X_valid_scaled)[:, 1]
y_valid_pred = lr.predict(X_valid_scaled)

# Label-based metrics are computed from the hard predictions; ROC-AUC is
# computed from the probability scores.
cm        = confusion_matrix(y_valid, y_valid_pred)
accuracy  = accuracy_score(y_valid, y_valid_pred)
precision = precision_score(y_valid, y_valid_pred)
recall    = recall_score(y_valid, y_valid_pred)
f1        = f1_score(y_valid, y_valid_pred)
roc_auc   = roc_auc_score(y_valid, y_valid_prob)

# Report every metric to four decimals, followed by the confusion matrix.
print(
    "==== Logistic Regression Performance ====\n"
    f"Accuracy:  {accuracy:.4f}\n"
    f"Precision: {precision:.4f}\n"
    f"Recall:    {recall:.4f}\n"
    f"F1-score:  {f1:.4f}\n"
    f"ROC-AUC:   {roc_auc:.4f}"
)
print("\nConfusion Matrix:", cm, sep="\n")


==== Logistic Regression Performance ====
Accuracy:  0.9393
Precision: 0.7260
Recall:    0.2896
F1-score:  0.4141
ROC-AUC:   0.8213

Confusion Matrix:
[[2270   20]
 [ 130   53]]


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

# Step 1 -- build the random forest and fit it on the unscaled dummy-encoded
# training features (fit() returns the estimator, so the two steps are chained).
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    random_state=42,  # fixed seed so the fitted forest is reproducible
    n_jobs=-1,
).fit(X_train_dum, y_train)

# Step 2 -- score the validation set: column 1 of predict_proba (the second
# entry of rf.classes_) and the hard class predictions.
y_valid_prob_rf = rf.predict_proba(X_valid_dum)[:, 1]
y_valid_pred_rf = rf.predict(X_valid_dum)

# Step 3 -- evaluation metrics; only ROC-AUC uses the probability scores.
cm_rf   = confusion_matrix(y_valid, y_valid_pred_rf)
acc_rf  = accuracy_score(y_valid, y_valid_pred_rf)
pre_rf  = precision_score(y_valid, y_valid_pred_rf)
rec_rf  = recall_score(y_valid, y_valid_pred_rf)
f1_rf   = f1_score(y_valid, y_valid_pred_rf)
auc_rf  = roc_auc_score(y_valid, y_valid_prob_rf)

# Report every metric to four decimals, followed by the confusion matrix.
print(
    "==== Random Forest Performance ====\n"
    f"Accuracy:  {acc_rf:.4f}\n"
    f"Precision: {pre_rf:.4f}\n"
    f"Recall:    {rec_rf:.4f}\n"
    f"F1-score:  {f1_rf:.4f}\n"
    f"ROC-AUC:   {auc_rf:.4f}"
)
print("\nConfusion Matrix:", cm_rf, sep="\n")


==== Random Forest Performance ====
Accuracy:  0.9406
Precision: 0.7647
Recall:    0.2842
F1-score:  0.4143
ROC-AUC:   0.8530

Confusion Matrix:
[[2274   16]
 [ 131   52]]
