In [3]:
import pandas as pd
import dataprep
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('cleaned_loan_data.csv')

In [45]:
# Separate features and target
X = df.drop(columns=['loan_status'])
y = df['loan_status']

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize StandardScaler
scaler = StandardScaler()

# Fit on training data and transform both sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for readability
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns)

In [30]:
import optuna
import sklearn

In [47]:
import optuna
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import warnings
import torch
from torch import nn, optim
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# Simulate a synthetic dataset for demonstration (replace with your actual dataset)
np.random.seed(42)
X = np.random.rand(1000, 20)
y = np.random.choice([0, 1], size=1000, p=[0.95, 0.05])  # Imbalanced binary target

# Stratified split to ensure both classes are present in train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Handling class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Define the objective function for Optuna optimization
def objective(trial):
    model_name = trial.suggest_categorical('model',
                                           ['Decision Tree', 'Logistic Regression', 'Random Forest', 'XGBoost', 'CatBoost', 'MLP'])

    # **Decision Tree (CPU Only)**
    if model_name == 'Decision Tree':
        model = DecisionTreeClassifier(
            max_depth=trial.suggest_int('dt_max_depth', 3, 10),
            min_samples_split=trial.suggest_int('dt_min_samples_split', 2, 10)
        )

    # **Logistic Regression (CPU Only)**
    elif model_name == 'Logistic Regression':
        model = LogisticRegression(
            C=trial.suggest_loguniform('lr_C', 0.01, 10),
            solver='liblinear'
        )

    # **Random Forest (CPU Only)**
    elif model_name == 'Random Forest':
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int('rf_n_estimators', 50, 200),
            max_depth=trial.suggest_int('rf_max_depth', 3, 10)
        )

    # **XGBoost with GPU**
    elif model_name == 'XGBoost':
        model = XGBClassifier(
            n_estimators=trial.suggest_int('xgb_n_estimators', 50, 200),
            max_depth=trial.suggest_int('xgb_max_depth', 3, 10),
            learning_rate=trial.suggest_float('xgb_learning_rate', 0.01, 0.2),
            use_label_encoder=False,
            eval_metric="logloss",
            tree_method="gpu_hist"
        )

    # **CatBoost with GPU**
    elif model_name == 'CatBoost':
        # Check if multiple classes exist
        if len(set(y_train_resampled)) < 2:
            raise ValueError("Target variable contains only one class. Check data balancing.")
        
        model = CatBoostClassifier(
            iterations=trial.suggest_int('cb_iterations', 50, 200),
            depth=trial.suggest_int('cb_depth', 3, 10),
            learning_rate=trial.suggest_float('cb_learning_rate', 0.01, 0.2),
            task_type="GPU",
            verbose=0
        )

    # **MLP with PyTorch (GPU)**
    elif model_name == 'MLP':
        model = nn.Sequential(
            nn.Linear(X_train_scaled.shape[1], trial.suggest_int('mlp_hidden_1', 50, 200)),
            nn.ReLU(),
            nn.Linear(trial.suggest_int('mlp_hidden_1', 50, 200), 1),
            nn.Sigmoid()
        ).to(device)

        optimizer = optim.Adam(model.parameters(), lr=0.001)
        loss_fn = nn.BCELoss()

        # Move data to GPU
        X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
        y_train_tensor = torch.tensor(y_train_resampled, dtype=torch.float32).to(device)

        # Simple training loop with pruning
        for epoch in range(3):
            model.train()
            optimizer.zero_grad()
            outputs = model(X_train_tensor).flatten()
            loss = loss_fn(outputs, y_train_tensor)
            loss.backward()
            optimizer.step()

            # Report progress for pruning
            trial.report(loss.item(), step=epoch)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

        # Move test data to GPU for evaluation
        X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
        y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)

        # Predict using PyTorch model
        model.eval()
        with torch.no_grad():
            y_pred = (model(X_test_tensor).flatten() > 0.5).cpu().numpy()
        accuracy = accuracy_score(y_test, y_pred)
        return accuracy

    #For all other models (Scikit-Learn)
    model.fit(X_train_scaled, y_train_resampled)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

#Run Optuna study with pruning enabled
study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=50)

#Display the best model and hyperparameters
print("\n🎯 Best Model and Hyperparameters:")
print(f"Best Model: {study.best_params['model']}")
print(f"Best Accuracy: {study.best_value:.4f}")
print(f"Best Hyperparameters: {study.best_params}")


[I 2025-01-15 16:59:38,312] A new study created in memory with name: no-name-3dd3c1fa-fd06-4957-b00a-fd1e039fa04d


Using device: cpu


[I 2025-01-15 16:59:45,935] Trial 0 finished with value: 0.93 and parameters: {'model': 'CatBoost', 'cb_iterations': 186, 'cb_depth': 9, 'cb_learning_rate': 0.1607536366145697}. Best is trial 0 with value: 0.93.
[I 2025-01-15 16:59:48,901] Trial 1 finished with value: 0.91 and parameters: {'model': 'CatBoost', 'cb_iterations': 170, 'cb_depth': 4, 'cb_learning_rate': 0.08067182156572957}. Best is trial 0 with value: 0.93.
[I 2025-01-15 16:59:50,065] Trial 2 finished with value: 0.86 and parameters: {'model': 'Random Forest', 'rf_n_estimators': 138, 'rf_max_depth': 9}. Best is trial 0 with value: 0.93.
[I 2025-01-15 16:59:51,422] Trial 3 finished with value: 0.88 and parameters: {'model': 'CatBoost', 'cb_iterations': 55, 'cb_depth': 6, 'cb_learning_rate': 0.044106940517464194}. Best is trial 0 with value: 0.93.
[I 2025-01-15 16:59:51,442] Trial 4 finished with value: 0.21 and parameters: {'model': 'MLP', 'mlp_hidden_1': 131}. Best is trial 0 with value: 0.93.
[I 2025-01-15 16:59:54,485] 


🎯 Best Model and Hyperparameters:
Best Model: CatBoost
Best Accuracy: 0.9300
Best Hyperparameters: {'model': 'CatBoost', 'cb_iterations': 186, 'cb_depth': 9, 'cb_learning_rate': 0.1607536366145697}


In [48]:
import joblib

# Retrain the best model with the entire dataset
best_model = CatBoostClassifier(
    iterations=study.best_params['cb_iterations'],
    depth=study.best_params['cb_depth'],
    learning_rate=study.best_params['cb_learning_rate'],
    task_type="GPU",
    verbose=0
)

best_model.fit(X_train_scaled, y_train_resampled)

<catboost.core.CatBoostClassifier at 0x1c219d42680>

In [52]:
# Save the trained model to a file
joblib.dump(best_model, "best_catboost_model.pkl")
print("Best model saved as 'best_catboost_model.pkl'")

Best model saved as 'best_catboost_model.pkl'


In [53]:
# Load the saved model and test it on unseen data
loaded_model = joblib.load("best_catboost_model.pkl")

# Test the loaded model
y_pred_loaded = loaded_model.predict(X_test_scaled)
loaded_accuracy = accuracy_score(y_test, y_pred_loaded)
print(f"Loaded Model Accuracy: {loaded_accuracy:.4f}")

Loaded Model Accuracy: 0.9300


In [70]:
print(type(df))         # Should be <class 'pandas.core.frame.DataFrame'>
print(type(X_train_scaled))  # Should be <class 'numpy.ndarray'>


<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


In [None]:
import matplotlib.pyplot as plt

# Extract feature names from the original DataFrame
feature_names = X_train.columns.tolist()

# Get feature importance from the CatBoost model
feature_importance = best_model.get_feature_importance()

# Sort the features by importance for better visualization
sorted_idx = feature_importance.argsort()
sorted_feature_names = [feature_names[i] for i in sorted_idx]

# Plot feature importance using actual column names
plt.figure(figsize=(12, 6))
plt.barh(sorted_feature_names, feature_importance[sorted_idx])
plt.xlabel("Importance Score")
plt.ylabel("Feature Name")
plt.title("CatBoost Feature Importance with Actual Feature Names")
plt.show()


AttributeError: 'numpy.ndarray' object has no attribute 'columns'