In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve
import numpy as np
import catboost as cb

In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Load the dataset
try:
    df = pd.read_csv('home_credit_synthetic_positive.csv')
except FileNotFoundError:
    print("Error: 'home_credit_synthetic_positive.csv' not found. Please ensure the file is in the 'MultipleFiles' directory.")
    exit()

In [3]:
# Drop irrelevant columns
df = df.drop(['SK_ID_CURR'], axis=1)

In [4]:
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"Categorical columns identified: {list(categorical_cols)}")

Categorical columns identified: ['NAME_FAMILY_STATUS', 'NAME_EDUCATION_TYPE', 'NAME_INCOME_TYPE']


In [5]:
# Apply one-hot encoding
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [6]:
# Handle missing values (if any remain, fill with median for numerical columns)
# For simplicity, we'll fill any remaining NaNs with the median of their respective columns.
# In a real-world scenario, more sophisticated imputation might be needed.
for col in df.columns:
    if df[col].isnull().any():
        if pd.api.types.is_numeric_dtype(df[col]):
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
            print(f"Filled missing values in column '{col}' with median: {median_val}")
        else:
            # For any remaining non-numeric NaNs after one-hot encoding, which shouldn't happen with drop_first=True
            # but as a safeguard, fill with mode or a placeholder.
            mode_val = df[col].mode()[0]
            df[col] = df[col].fillna(mode_val)
            print(f"Filled missing values in non-numeric column '{col}' with mode: {mode_val}")


In [7]:
 #Define features (X) and target (y)
X = df.drop('TARGET', axis=1)
y = df['TARGET']

In [8]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
print(f"Dataset split into training (X_train shape: {X_train.shape}) and testing (X_test shape: {X_test.shape}) sets.")


Dataset split into training (X_train shape: (1400, 20)) and testing (X_test shape: (600, 20)) sets.


In [9]:
# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Numerical features scaled.")

Numerical features scaled.


In [10]:
 #Convert scaled arrays back to DataFrames, preserving column names
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)


In [11]:
# Logistic Regression
log_reg_model = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
log_reg_model.fit(X_train_scaled, y_train)
print("Logistic Regression model trained.")

Logistic Regression model trained.


In [12]:
# CatBoost Classifier
cat_features_indices = [X.columns.get_loc(col) for col in categorical_cols if col in X.columns]
cat_model = cb.CatBoostClassifier(
    iterations=100,
    random_seed=42,
    verbose=0,
    early_stopping_rounds=10,
    eval_metric='F1',
    class_weights=[1, (len(y_train[y_train == 0]) / len(y_train[y_train == 1]))]  # Handle class imbalance
)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10)
print("CatBoost Classifier model trained.")


CatBoost Classifier model trained.


In [13]:
# --- Define Business Cost Values ---
# Cost of False Positive (FP): Loan approved but defaults
avg_credit_fp = df['AMT_CREDIT'].mean() * 0.5  # Assuming 50% of credit is lost on default
COST_FP = avg_credit_fp
print(f"\nCost of False Positive (FP): {COST_FP:.2f} (e.g., 50% of average credit amount)")



Cost of False Positive (FP): 372575.43 (e.g., 50% of average credit amount)


In [14]:
# Cost of False Negative (FN): Loan rejected but would not default
avg_credit_fn = df['AMT_CREDIT'].mean() * 0.1  # Assuming 10% potential profit
COST_FN = avg_credit_fn
print(f"Cost of False Negative (FN): {COST_FN:.2f} (e.g., 10% of average credit amount as lost profit)")


Cost of False Negative (FN): 74515.09 (e.g., 10% of average credit amount as lost profit)


In [15]:
# Get probabilities for the positive class (default)
y_pred_proba_lr = log_reg_model.predict_proba(X_test_scaled)[:, 1]

In [16]:
# Calculate costs for various thresholds
thresholds = np.linspace(0, 1, 100)
costs = []
for t in thresholds:
    y_pred_lr = (y_pred_proba_lr >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lr).ravel()
    total_cost = (fp * COST_FP) + (fn * COST_FN)
    costs.append(total_cost)

In [18]:
# Find the threshold that minimizes the total cost
optimal_threshold_lr = thresholds[np.argmin(costs)]
min_cost_lr = np.min(costs)
print(f"Optimal Threshold for Logistic Regression: {optimal_threshold_lr:.4f}")
print(f"Minimum Total Business Cost for Logistic Regression: {min_cost_lr:.2f}")

Optimal Threshold for Logistic Regression: 0.6566
Minimum Total Business Cost for Logistic Regression: 14083351.37


In [21]:
# Evaluate Logistic Regression with optimal threshold
y_pred_optimal_lr = (y_pred_proba_lr >= optimal_threshold_lr).astype(int)
tn_lr, fp_lr, fn_lr, tp_lr = confusion_matrix(y_test, y_pred_optimal_lr).ravel()
accuracy_lr = accuracy_score(y_test, y_pred_optimal_lr)
print("\nLogistic Regression Performance with Optimal Threshold:")
print(f"Accuracy: {accuracy_lr:.4f}")
print(f"True Negatives (TN): {tn_lr}")
print(f"False Positives (FP): {fp_lr}")
print(f"False Negatives (FN): {fn_lr}")
print(f"True Positives (TP): {tp_lr}")
print(f"Total Cost: {(fp_lr * COST_FP) + (fn_lr * COST_FN):.2f}")
print("\nOptimizing decision threshold for CatBoost Classifier...")


Logistic Regression Performance with Optimal Threshold:
Accuracy: 0.6850
True Negatives (TN): 409
False Positives (FP): 0
False Negatives (FN): 189
True Positives (TP): 2
Total Cost: 14083351.37

Optimizing decision threshold for CatBoost Classifier...


In [22]:
# Get probabilities for the positive class (default)
y_pred_proba_cat = cat_model.predict_proba(X_test)[:, 1]

In [23]:
# Calculate costs for various thresholds
costs_cat = []
for t in thresholds:
    y_pred_cat = (y_pred_proba_cat >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_cat).ravel()
    total_cost = (fp * COST_FP) + (fn * COST_FN)
    costs_cat.append(total_cost)

In [24]:
# Find the threshold that minimizes the total cost
optimal_threshold_cat = thresholds[np.argmin(costs_cat)]
min_cost_cat = np.min(costs_cat)
print(f"Optimal Threshold for CatBoost Classifier: {optimal_threshold_cat:.4f}")
print(f"Minimum Total Business Cost for CatBoost Classifier: {min_cost_cat:.2f}")

Optimal Threshold for CatBoost Classifier: 0.5556
Minimum Total Business Cost for CatBoost Classifier: 14232381.54


In [25]:
# Evaluate CatBoost with optimal threshold
y_pred_optimal_cat = (y_pred_proba_cat >= optimal_threshold_cat).astype(int)
tn_cat, fp_cat, fn_cat, tp_cat = confusion_matrix(y_test, y_pred_optimal_cat).ravel()
accuracy_cat = accuracy_score(y_test, y_pred_optimal_cat)
print("\nCatBoost Classifier Performance with Optimal Threshold:")
print(f"Accuracy: {accuracy_cat:.4f}")
print(f"True Negatives (TN): {tn_cat}")
print(f"False Positives (FP): {fp_cat}")
print(f"False Negatives (FN): {fn_cat}")
print(f"True Positives (TP): {tp_cat}")
print(f"Total Cost: {(fp_cat * COST_FP) + (fn_cat * COST_FN):.2f}")
print("\n--- Summary ---")
print(f"Logistic Regression Min Cost: {min_cost_lr:.2f} at threshold {optimal_threshold_lr:.4f} with accuracy {accuracy_lr:.4f}")
print(f"CatBoost Classifier Min Cost: {min_cost_cat:.2f} at threshold {optimal_threshold_cat:.4f} with accuracy {accuracy_cat:.4f}")
if min_cost_lr < min_cost_cat:
    print("\nLogistic Regression model with its optimal threshold results in lower total business cost.")
else:
    print("\nCatBoost Classifier model with its optimal threshold results in lower total business cost.")



CatBoost Classifier Performance with Optimal Threshold:
Accuracy: 0.6817
True Negatives (TN): 409
False Positives (FP): 0
False Negatives (FN): 191
True Positives (TP): 0
Total Cost: 14232381.54

--- Summary ---
Logistic Regression Min Cost: 14083351.37 at threshold 0.6566 with accuracy 0.6850
CatBoost Classifier Min Cost: 14232381.54 at threshold 0.5556 with accuracy 0.6817

Logistic Regression model with its optimal threshold results in lower total business cost.
