In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Location of the raw loan dataset. Set the LOAN_DATASET_PATH environment
# variable to run the notebook on another machine; when it is not set, the
# original local path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "LOAN_DATASET_PATH",
    r"C:\Users\Ire\Desktop\Blossom Academy\LOAN_PROJECT\loan_dataset.csv",
)
df = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,age,monthly_income,employment_type,loan_amount,loan_duration_months,previous_loans,previous_defaults,account_age_months,num_dependents,education_level,residential_status,state,has_bank_account,credit_score,loan_default
0,33,100000,Self_Employed,68527,2,5,2,47,5,OND,Renting,Port_Harcourt,1,379,1
1,39,150000,Self_Employed,380577,3,5,0,44,0,MSc,Own_House,Enugu,1,705,0
2,46,100000,Salary_Earner,80542,3,5,1,34,3,BSc,Renting,Lagos,1,479,0
3,37,30000,Business_Owner,300898,6,0,2,9,1,MSc,Renting,Ibadan,1,821,1
4,44,80000,Self_Employed,286532,6,1,2,6,5,Secondary,Renting,Enugu,1,576,1


In [None]:
# Summarise the frame: column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing entries in every column of the raw data.
null_vals = df.isnull().sum()
null_vals

In [None]:
# Class balance of the target: one bar per `loan_default` value.
plt.figure(figsize=(7, 5))
ax = sns.countplot(x='loan_default', data=df, palette='viridis')

# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f"{bar_height:.0f}",
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(2, 5),
        textcoords='offset points',
        fontsize=11,
        fontweight='bold',
    )

# Title and axis labels; grid lines are switched off.
ax.set_title('Count of Defaults vs Non-Defaults (0=No, 1=Yes)', fontsize=14)
ax.set_ylabel('Number of Applicants', fontsize=12)
ax.set_xlabel('Loan Default Status', fontsize=12)
ax.grid(False)

# Tighten the layout, then write the figure to disk.
ax.figure.tight_layout()
ax.figure.savefig('target_distribution_labeled.png')

In [None]:
# C. Correlation heatmap
# Only numeric columns can be correlated, so select those first.
numerical_df = df.select_dtypes(include=[np.number])
correlation_matrix = numerical_df.corr()

plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='RdYlGn',
    linewidths=0.5,
)
heatmap_ax.set_title('Feature Correlation Heatmap')
heatmap_ax.figure.savefig('correlation_heatmap.png')

In [None]:
# D. Default rate by employment type
# Mean of `loan_default` per employment type, highest first.
emp_default = (
    df.groupby('employment_type')['loan_default']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 6))
emp_ax = sns.barplot(
    x=emp_default.values,
    y=emp_default.index,
    hue=emp_default.index,
    palette='viridis',
    legend=False,
)
emp_ax.set_title('Probability of Default by Employment Type')
emp_ax.set_xlabel('Probability of Default')
emp_ax.figure.savefig('employment_default_rate.png')

In [None]:
# F. Default rate by residential status
# Mean of `loan_default` per residential status, highest first.
res_default = (
    df.groupby('residential_status')['loan_default']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 6))
res_ax = sns.barplot(
    x=res_default.values,
    y=res_default.index,
    hue=res_default.index,
    palette='viridis',
    legend=False,
)
res_ax.set_title('Probability of Default by Residential Status')
res_ax.set_xlabel('Probability of Default')
res_ax.figure.savefig('residence_default_rate.png')

In [None]:
# 1. Outlier detection using the IQR rule
num_cols = ['age', 'monthly_income', 'loan_amount', 'credit_score', 'account_age_months']


def _iqr_outlier_count(values):
    """Count entries lying more than 1.5 * IQR below Q1 or above Q3."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    outside = (values < q1 - 1.5 * spread) | (values > q3 + 1.5 * spread)
    # Cast to a plain int so the printed dict shows bare numbers.
    return int(outside.sum())


outlier_summary = {col: _iqr_outlier_count(df[col]) for col in num_cols}

print("Outlier counts per column:", outlier_summary)

In [None]:
# E. Boxplots for outlier visualisation
# One panel per numeric column, laid out on a 2 x 3 grid.
box_fig = plt.figure(figsize=(15, 8))
for position, column in enumerate(num_cols, start=1):
    panel = box_fig.add_subplot(2, 3, position)
    sns.boxplot(y=df[column], color='skyblue', ax=panel)
    panel.set_title(f'Boxplot of {column}')
box_fig.tight_layout()
box_fig.savefig('outlier_boxplots.png')

In [7]:
# Work on a copy: the engineered columns below are added to df2, leaving the
# raw frame `df` as loaded.
df2 = df.copy()

### Feature Engineering and Data Preprocessing

In [None]:
# 1. Print the smallest and largest credit score, to check the range before binning.
credit_scores = df2['credit_score']
print(credit_scores.min(), credit_scores.max())

In [10]:
# FEATURE 1: Debt-to-Income Ratio (DTI) -- loan amount relative to monthly income.
df2["debt_to_income_ratio"] = df2["loan_amount"] / df2["monthly_income"]

# FEATURE 2: Payment-to-Income Ratio -- the loan amount split evenly across
# its duration (no interest term), relative to monthly income.
# The duration is read from df2 like every other input, so the feature stays
# correct even if df2 is later filtered or re-indexed independently of df.
df2["estimated_monthly_payment"] = df2["loan_amount"] / df2["loan_duration_months"]
df2["payment_to_income_ratio"] = df2["estimated_monthly_payment"] / df2["monthly_income"]

# FEATURE 3: Default History Ratio -- previous defaults per previous loan.
# The +1 keeps the denominator non-zero when previous_loans is 0; as a result
# the value can exceed 1 (e.g. 2 defaults with 0 recorded loans gives 2.0).
df2["default_history_ratio"] = df2["previous_defaults"] / (df2["previous_loans"] + 1)

# FEATURE 4: Income per Dependent -- the +1 keeps the denominator non-zero
# when num_dependents is 0.
df2["income_per_dependent"] = df2["monthly_income"] / (df2["num_dependents"] + 1)

# FEATURE 5: Credit Score Bands -- right-closed bins over 300-850;
# include_lowest=True puts a score of exactly 300 in the first band.
# Scores outside 300-850 fall in no bin and become NaN.
df2["credit_score_band"] = pd.cut(
    df2["credit_score"],
    bins=[300, 500, 650, 750, 850],
    labels=["Very_Poor", "Poor", "Good", "Excellent"],
    include_lowest=True,
)

In [12]:
# Preview the frame with the engineered ratio and band columns appended.
df2.head()

Unnamed: 0,age,monthly_income,employment_type,loan_amount,loan_duration_months,previous_loans,previous_defaults,account_age_months,num_dependents,education_level,...,state,has_bank_account,credit_score,loan_default,debt_to_income_ratio,estimated_monthly_payment,payment_to_income_ratio,default_history_ratio,income_per_dependent,credit_score_band
0,33,100000,Self_Employed,68527,2,5,2,47,5,OND,...,Port_Harcourt,1,379,1,0.68527,34263.5,0.342635,0.333333,16666.666667,Very_Poor
1,39,150000,Self_Employed,380577,3,5,0,44,0,MSc,...,Enugu,1,705,0,2.53718,126859.0,0.845727,0.0,150000.0,Good
2,46,100000,Salary_Earner,80542,3,5,1,34,3,BSc,...,Lagos,1,479,0,0.80542,26847.333333,0.268473,0.166667,25000.0,Very_Poor
3,37,30000,Business_Owner,300898,6,0,2,9,1,MSc,...,Ibadan,1,821,1,10.029933,50149.666667,1.671656,2.0,15000.0,Excellent
4,44,80000,Self_Employed,286532,6,1,2,6,5,Secondary,...,Enugu,1,576,1,3.58165,47755.333333,0.596942,1.0,13333.333333,Poor


In [14]:
# 1. Collect the names of all columns with 'object' or 'category' dtype.
categorical_cols = list(df2.select_dtypes(include=['object', 'category']).columns)

print(f"Categorical Columns found: {categorical_cols}\n")

# 2. Print the distinct values of each categorical column, followed by a dotted rule.
separator = "." * 30
for col in categorical_cols:
    print(f"Column: {col}")
    print(f"Unique Values: {df2[col].unique()}")
    print(separator)

Categorical Columns found: ['employment_type', 'education_level', 'residential_status', 'state', 'credit_score_band']

Column: employment_type
Unique Values: ['Self_Employed' 'Salary_Earner' 'Business_Owner' 'Freelancer']
..............................
Column: education_level
Unique Values: ['OND' 'MSc' 'BSc' 'Secondary' 'HND']
..............................
Column: residential_status
Unique Values: ['Renting' 'Own_House' 'Living_with_Parents']
..............................
Column: state
Unique Values: ['Port_Harcourt' 'Enugu' 'Lagos' 'Ibadan' 'Kano' 'Abuja']
..............................
Column: credit_score_band
Unique Values: ['Very_Poor', 'Good', 'Excellent', 'Poor']
Categories (4, object): ['Very_Poor' < 'Poor' < 'Good' < 'Excellent']
..............................


In [16]:
# Count missing values per column and show only the columns that have any.
missing_values = df2.isnull().sum()
print("Missing values per column:")
print(missing_values[missing_values > 0])

# The ratio features divide one column by another; a non-zero value divided
# by 0 gives +/-inf rather than NaN, so infinities are counted separately.
# (numpy is already imported as `np` in the notebook's import cell.)
inf_values = df2.isin([np.inf, -np.inf]).sum()
print("\nInfinite values per column:")
print(inf_values[inf_values > 0])

Missing values per column:
Series([], dtype: int64)

Infinite values per column:
Series([], dtype: int64)


In [18]:
# 1. Ordinal encoding for education (the levels have a natural order).
education_map = {'Secondary': 1, 'OND': 2, 'HND': 3, 'BSc': 4, 'MSc': 5}

# Guard against re-running the cell: once encoded the column is numeric, and
# mapping it through `education_map` again would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df2['education_level']):
    education_encoded = df2['education_level'].map(education_map)
    # `.map` yields NaN for any label absent from the dict; fail loudly
    # instead of passing silent NaNs on to the models.
    unmapped = df2.loc[education_encoded.isna(), 'education_level'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"education_level values missing from education_map: {list(unmapped)}"
        )
    df2['education_level'] = education_encoded

# 2. One-hot encoding for the nominal columns: employment_type,
# residential_status, state and credit_score_band. drop_first=True drops one
# dummy per column; dtype=int gives 0/1 columns.
df_final = pd.get_dummies(
    df2,
    columns=['employment_type', 'residential_status', 'state', 'credit_score_band'],
    drop_first=True,
    dtype=int,
)

### Data Modelling, Machine Learning, and Evaluation

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

In [23]:
# Target is the default flag; every other column of df_final is a feature.
y = df_final['loan_default']
X = df_final.drop(columns=['loan_default'])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits,
# so no test-set statistics enter the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
# 4. Define the six candidate models, keyed by the display name used in the
# results table. Every model gets random_state=42 for repeatable fits.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=4,
        random_state=42,
    ),
    # `use_label_encoder` is no longer passed: XGBoost reports it as an unused
    # parameter, and the tuned XGBoost pipeline later in the notebook omits it.
    "XGBoost": XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        eval_metric='logloss',
    ),
    # verbose=-1 silences LightGBM's training log.
    "LightGBM": LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        num_leaves=31,
        random_state=42,
        verbose=-1,
    ),
}

In [27]:
# Fit every candidate on the scaled training split and score it on the
# scaled test split.
scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}

model_results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # One table row per model: its name followed by the four metrics.
    row = {"Model": name}
    row.update({label: scorer(y_test, y_pred) for label, scorer in scorers.items()})
    model_results.append(row)

# Comparison table, best recall first.
results_df = pd.DataFrame(model_results).sort_values(by='Recall', ascending=False)
print("--- Model Performance Comparison ---")
print(results_df)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Model Performance Comparison ---
                 Model  Accuracy  Precision    Recall  F1-Score
4              XGBoost     0.929   0.904858  0.949045  0.926425
3    Gradient Boosting     0.927   0.904472  0.944798  0.924195
5             LightGBM     0.925   0.902439  0.942675  0.922118
1        Decision Tree     0.892   0.855186  0.927813  0.890020
2        Random Forest     0.911   0.902954  0.908705  0.905820
0  Logistic Regression     0.852   0.822355  0.874735  0.847737


In [29]:
# Display the comparison table (sorted by recall, descending) as a rich table.
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
4,XGBoost,0.929,0.904858,0.949045,0.926425
3,Gradient Boosting,0.927,0.904472,0.944798,0.924195
5,LightGBM,0.925,0.902439,0.942675,0.922118
1,Decision Tree,0.892,0.855186,0.927813,0.89002
2,Random Forest,0.911,0.902954,0.908705,0.90582
0,Logistic Regression,0.852,0.822355,0.874735,0.847737


In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# 1. Pipeline: scale the features, then fit XGBoost. The scaler sits inside
# the pipeline, so it is re-fitted on each cross-validation training fold.
xgb_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', XGBClassifier(eval_metric='logloss', random_state=42)),
])

# 2. Search space. The 'classifier__' prefix routes each parameter to the
# pipeline step named 'classifier'. 2 x 2 x 3 x 2 = 24 candidates.
xgb_param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__learning_rate=[0.01, 0.1],
    classifier__max_depth=[3, 5, 7],
    classifier__subsample=[0.8, 1.0],
)

# 3. Exhaustive search with 5-fold cross-validation, ranked by recall.
# n_jobs=-1 uses all available cores; verbose=1 prints progress.
grid_xgb = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# 4. Run the search on the unscaled training split (the pipeline scales it).
grid_xgb.fit(X_train, y_train)

# 5. Report the winning configuration and its mean cross-validated recall.
print(f"\nBest Parameters: {grid_xgb.best_params_}")
print(f"Best CV Recall Score: {grid_xgb.best_score_:.4f}")

# Keep the best pipeline found by the search.
best_xgb_final2 = grid_xgb.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__n_estimators': 200, 'classifier__subsample': 0.8}
Best CV Recall Score: 0.9378


In [34]:
# 1. Predict on the held-out test set with the tuned pipeline; the pipeline
# applies its own scaling, so the unscaled X_test is passed in.
# The predictions are stored as `y_pred_tuned`, the name the confusion-matrix
# cells read. `y_pred` is kept as an alias so code using the old name still works.
y_pred_tuned = best_xgb_final2.predict(X_test)
y_pred = y_pred_tuned

# 2. Final test-set metrics of the tuned model.
metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_tuned),
    "Precision": precision_score(y_test, y_pred_tuned),
    "Recall": recall_score(y_test, y_pred_tuned),
    "F1-Score": f1_score(y_test, y_pred_tuned)
}

print("--- FINAL TUNED XGBOOST: TEST SET PERFORMANCE ---")
for metric, value in metrics.items():
    print(f"{metric:10}: {value:.4f}")

--- FINAL TUNED XGBOOST: TEST SET PERFORMANCE ---
Accuracy  : 0.9240
Precision : 0.9006
Recall    : 0.9427
F1-Score  : 0.9212


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Compute the tuned pipeline's test-set predictions here, so this cell does
# not rely on a prediction variable created in another cell.
y_pred_tuned = best_xgb_final2.predict(X_test)

# Confusion matrix: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred_tuned)

# Plot the raw counts.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Approved (No Default)', 'Flagged (Default)'],
            yticklabels=['Actual No Default', 'Actual Default'])

plt.title('Final Decision Impact: Tuned XGBoost', fontsize=15)
plt.xlabel('Model Prediction', fontsize=12)
plt.ylabel('Actual Outcome', fontsize=12)
plt.show()

In [None]:
# Compute the tuned pipeline's test-set predictions here, so this cell does
# not rely on a prediction variable created in another cell.
y_pred_tuned = best_xgb_final2.predict(X_test)

# Confusion matrix: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred_tuned)

# Divide each row by its total so every row sums to 1: the "Actual Default"
# row then shows recall, and the "Actual No Default" row shows specificity.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

plt.figure(figsize=(8, 6))
sns.heatmap(cm_normalized, annot=True, fmt=".2%", cmap='Greens',
            xticklabels=['Predicted No Default', 'Predicted Default'],
            yticklabels=['Actual No Default', 'Actual Default'])

plt.title('Normalized Confusion Matrix (Recall focus)')
plt.show()

In [36]:
import joblib

# Persist the tuned pipeline (scaler + classifier) to a single file.
# compress=3 shrinks the file; the compression is lossless.
model_filename = 'xgb_loan_model.joblib'
joblib.dump(best_xgb_final2, model_filename, compress=3)

print("✅ Model saved successfully as 'xgb_loan_model.joblib'")

✅ Model saved successfully as 'xgb_loan_model.joblib'


In [38]:
# Round-trip check: reload the saved pipeline from disk.
# NOTE: joblib files are pickle-based; only load files you created or trust.
test_loader = joblib.load('xgb_loan_model.joblib')

# Predict on the first test row; the slice keeps it as a one-row DataFrame.
first_test_row = X_test.iloc[:1]
sample_prediction = test_loader.predict(first_test_row)
print(f"Sanity Check Prediction: {sample_prediction}")

Sanity Check Prediction: [1]


In [40]:
import os

# Size of the saved model on disk, converted from bytes to megabytes.
size_in_bytes = os.path.getsize('xgb_loan_model.joblib')
file_size = size_in_bytes / (1024 * 1024)
print(f"Model File Size: {file_size:.2f} MB")

Model File Size: 0.14 MB
