# 0. Setup and Configuration

In [1]:
!pip install -r ../requirements.txt

DEPRECATION: Loading egg at c:\users\krazy\anaconda3\envs\datascience\lib\site-packages\ibapi-10.37.2-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [2]:
%cd ../

d:\Studying\Self-Studying\Data Science\Credit-risk-scoring


# 1. Data Generator

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path

import src.pipelines.data_generator as dg

np.random.seed(42)

N_APPLICATIONS = 50000

In [4]:
def generate_applications(n: int = N_APPLICATIONS):
    """Generate loan application dataset."""
    print(f"ðŸ”„ Generating {n:,} loan applications...")
    
    applications = []
    
    for i in range(n):
        profile = dg.generate_applicant_profile()
        
        # Calculate default probability
        default_prob = dg.calculate_default_probability(profile)
        
        # Generate number of delinquencies (correlated with default risk)
        if default_prob > 0.3:
            num_delinquencies = np.random.choice([0, 1, 2, 3, 4], p=[0.3, 0.3, 0.2, 0.15, 0.05])
        elif default_prob > 0.15:
            num_delinquencies = np.random.choice([0, 1, 2, 3], p=[0.5, 0.3, 0.15, 0.05])
        else:
            num_delinquencies = np.random.choice([0, 1, 2], p=[0.8, 0.15, 0.05])
        
        # Delinquencies increase default probability
        default_prob += num_delinquencies * 0.08
        default_prob = np.clip(default_prob, 0.01, 0.95)
        
        # Determine default
        default = int(np.random.random() < default_prob)
        
        application = {
            'application_id': f'APP{i+1:07d}',
            **profile,
            'num_delinquencies': num_delinquencies,
            'default': default
        }
        
        applications.append(application)
    
    df = pd.DataFrame(applications)
    
    # Adjust to target default rate
    actual_rate = df['default'].mean()
    print(f"   Initial default rate: {actual_rate*100:.1f}%")
    
    print(f"\nðŸ“Š Dataset Summary:")
    print(f"   Total applications: {len(df):,}")
    print(f"   Default rate: {df['default'].mean()*100:.1f}%")
    print(f"   Avg income: ${df['income'].mean():,.0f}")
    print(f"   Avg loan amount: ${df['loan_amount'].mean():,.0f}")
    print(f"   Avg DTI: {df['debt_to_income'].mean()*100:.1f}%")
    
    return df

In [5]:
df = generate_applications()
print("\nâœ… Data generation complete!")
df

ðŸ”„ Generating 50,000 loan applications...
   Initial default rate: 18.8%

ðŸ“Š Dataset Summary:
   Total applications: 50,000
   Default rate: 18.8%
   Avg income: $61,630
   Avg loan amount: $20,960
   Avg DTI: 21.1%

âœ… Data generation complete!


Unnamed: 0,application_id,age,income,employment_length,loan_amount,loan_purpose,debt_to_income,credit_history_length,num_credit_lines,utilization_rate,home_ownership,num_delinquencies,default
0,APP0000001,43,53980,10,12112,major_purchase,0.131,7,3,22.2,RENT,0,1
1,APP0000002,21,52586,2,25335,vacation,0.142,0,1,19.8,RENT,0,1
2,APP0000003,32,112395,5,44713,medical,0.202,9,4,26.0,MORTGAGE,0,0
3,APP0000004,57,77387,34,40232,debt_consolidation,0.201,34,16,59.5,MORTGAGE,0,0
4,APP0000005,43,83581,22,22466,vacation,0.257,13,4,30.6,RENT,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,APP0049996,43,54288,10,25423,car,0.233,0,3,20.8,RENT,1,1
49996,APP0049997,21,42709,1,24475,other,0.223,1,2,33.2,MORTGAGE,0,1
49997,APP0049998,71,55137,16,15291,home_improvement,0.398,0,3,27.4,OWN,0,0
49998,APP0049999,21,44044,1,10792,major_purchase,0.181,2,4,15.3,MORTGAGE,0,0


# 2. Training

In [10]:
import src.pipelines.training_pipeline as train
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score,
    precision_recall_curve, average_precision_score, roc_curve
)

In [7]:
df, encoders = train.engineer_features(df, verbose = True)

ðŸ”§ Engineering features...
ðŸ”§ Feature Compute: 
- loan_to_income = % of loaning amount to income 
- payment_to_income = % payment to income per month 
- income_employment = % payment to income per month 
- credit_per_line = income * log(1 + employment_length)
- age_group = bin(age_group)
- income_bracket = bin(income_group)


In [16]:
X, y, feature_names = train.prepare_features(df)
feature_names

['age',
 'income',
 'employment_length',
 'loan_amount',
 'debt_to_income',
 'credit_history_length',
 'num_credit_lines',
 'num_delinquencies',
 'utilization_rate',
 'loan_to_income',
 'payment_to_income',
 'income_employment',
 'credit_per_line',
 'loan_purpose_encoded',
 'home_ownership_encoded',
 'age_group_encoded',
 'income_bracket_encoded']

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [12]:
# Further split for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42, stratify=y_train
)

In [13]:
print(f"\nðŸ“Š Data Split:")
print(f"   Train: {len(X_train):,}")
print(f"   Validation: {len(X_val):,}")
print(f"   Test: {len(X_test):,}")


ðŸ“Š Data Split:
   Train: 34,000
   Validation: 6,000
   Test: 10,000


In [14]:
model, scaler = train.train_model(X_train, y_train, X_val, y_val)

ðŸ”§ Training model...
   Validation AUC: 0.727


In [17]:
# Evaluate
metrics = train.evaluate_model(model, scaler, X_test, y_test, feature_names)

ðŸ“Š Evaluating model...


In [18]:
    # Print results
train.print_results(metrics)

# Save model
# train.save_model(model, scaler, encoders, metrics, feature_names)

print("\nâœ… Training complete!")


CREDIT RISK MODEL RESULTS

ðŸ“Š Model Performance:
   Accuracy: 81.6%
   Precision: 55.2%
   Recall: 10.5%
   F1 Score: 17.6%
   ROC-AUC: 0.740
   PR-AUC: 0.384

ðŸ“Š Confusion Matrix:
   True Negatives:  7,960
   False Positives: 160
   False Negatives: 1,683
   True Positives:  197

ðŸ“Š Top 5 Features:
   1. num_delinquencies: 0.286
   2. income_employment: 0.127
   3. age: 0.114
   4. income: 0.077
   5. utilization_rate: 0.067

ðŸ“Š Credit Score Distribution:
   Mean: 748
   Std: 76
   Range: 406 - 840

âœ… Training complete!
