In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [57]:
# Step 1: Load the dataset
data = pd.read_csv("C:/Users/Hemalatha/Downloads/Loan_Dataset.csv")

In [58]:
# Inspect the data
print(data.head())        # View the first few rows
print(data.info())        # Check data types and missing values
print(data.describe())    # Get summary statistics for numerical columns

  customer_id          name  age gender owns_car owns_house  no_of_children  \
0  CST_142525          Siva   52      F        Y          N             0.0   
1  CST_129215         Scott   48      F        N          N             1.0   
2  CST_138443      Victoria   50      F        N          N             1.0   
3  CST_123812  John McCrank   30      F        N          N             1.0   
4  CST_144450      Martinne   52      M        N          Y             0.0   

   net_yearly_income  no_of_days_employed occupation_type  \
0          232640.53                998.0         Unknown   
1          284396.79               1338.0         Unknown   
2          149419.28               1210.0         Unknown   
3          160437.54                503.0        Laborers   
4          233480.37                157.0        Laborers   

   total_family_members  yearly_debt_payments  credit_limit  \
0                   2.0              14406.73      26524.40   
1                   3.0         

In [59]:
# Check for missing values
print(data.isnull().sum())

# Handle missing values (example: fill missing numerical data with mean and categorical with mode)
data.fillna(data.mean(numeric_only=True), inplace=True)  # For numerical columns
data.fillna(data.mode().iloc[0], inplace=True)          # For categorical columns

# Encode categorical variables (if any)
data_encoded = pd.get_dummies(data, drop_first=True)

# Scale numerical features (optional, useful for distance-based models like SVM, KNN)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_encoded)

# Convert back to DataFrame
data_encoded = pd.DataFrame(scaled_data, columns=data_encoded.columns)


customer_id                    0
name                           0
age                            0
gender                         0
owns_car                     132
owns_house                     0
no_of_children               190
net_yearly_income              0
no_of_days_employed          105
occupation_type                0
total_family_members          31
yearly_debt_payments       11208
credit_limit                  22
credit_limit_used(%)           0
credit_score                   0
prev_defaults                  3
default_in_last_6months        0
Unnamed: 17                  175
dtype: int64


In [60]:
# Step 2: Preprocess the Data
# Handle missing values (e.g., fill with median or drop rows with many missing values)
data = data.fillna(data.median(numeric_only=True))

In [61]:
# Convert categorical columns to numerical using LabelEncoder
categorical_columns = data.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_columns:
    data[col] = le.fit_transform(data[col].astype(str))

# Define features (X) and target (y)
X = data.drop(columns=['default_in_last_6months'])  # Replace with actual target column name
y = data['default_in_last_6months']

In [62]:
# Step 3: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
# Step 4: Train Gradient Boosting Model
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

GradientBoostingClassifier()

In [64]:
# Step 5: Evaluate the Model
# Predict on the test set
y_pred = model.predict(X_test)

In [68]:
from sklearn.metrics import roc_auc_score

# Correct calculation of ROC-AUC for multi-class
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
print("ROC-AUC Score:", roc_auc)


ROC-AUC Score: 0.9971594890243461


In [69]:
# Print results
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("ROC-AUC Score:", roc_auc)

Accuracy: 0.9846288976723759

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      2145
           1       0.92      0.89      0.91       113
           2       0.33      0.37      0.35        19

    accuracy                           0.98      2277
   macro avg       0.75      0.75      0.75      2277
weighted avg       0.99      0.98      0.98      2277

ROC-AUC Score: 0.9971594890243461


In [70]:
# Feature importance
importances = model.feature_importances_
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
print("\nFeature Importances:\n", feature_importance.sort_values(by='Importance', ascending=False))


Feature Importances:
                  Feature  Importance
16           Unnamed: 17    0.736158
15         prev_defaults    0.170162
13  credit_limit_used(%)    0.023897
14          credit_score    0.022639
0            customer_id    0.008843
6         no_of_children    0.006492
1                   name    0.006363
10  total_family_members    0.005596
8    no_of_days_employed    0.004992
7      net_yearly_income    0.004597
12          credit_limit    0.004334
9        occupation_type    0.003138
11  yearly_debt_payments    0.001255
2                    age    0.001205
4               owns_car    0.000155
5             owns_house    0.000150
3                 gender    0.000025
