# Libraries

In [53]:
# imports
import pandas as pd
import numpy as np
# import warnings

# warnings.filterwarnings('ignore')

# Loading Data Set

In [55]:
# Read the loan book from the CSV file and preview its first five rows.
loan_df = pd.read_csv(filepath_or_buffer='Loan Data.csv')
loan_df.head(n=5)

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


# Extracting Useful Data

In [56]:
# Pull out the two columns the bucketing works on: the FICO score and the
# `default` column of each loan record.
fico_scores, defaults = loan_df['fico_score'], loan_df['default']

# Calculate Log Likelihood 

In [57]:
def calculate_ll(fico_scores, defaults, bucket_boundaries):
    """Return the binomial log-likelihood of the defaults under a bucketing.

    Bucket ``i`` covers ``[bucket_boundaries[i], bucket_boundaries[i + 1])``.
    The final bucket is also closed on the right, so records whose score
    equals the last boundary are counted instead of being silently dropped.

    For a bucket holding ``n_i`` records of which ``k_i`` defaulted, the
    probability of default is estimated as ``p_i = k_i / n_i`` and the bucket
    contributes ``k_i * log(p_i) + (n_i - k_i) * log(1 - p_i)``.

    Parameters
    ----------
    fico_scores : array-like
        One score per record.
    defaults : array-like
        One value per record, positionally matching ``fico_scores``; the sum
        over a bucket is taken as its number of defaults.
    bucket_boundaries : sequence of float
        Bucket edges in ascending order.

    Returns
    -------
    float
        The summed log-likelihood over all buckets (``0.0`` when fewer than
        two boundaries are given). The result is always finite: degenerate
        buckets contribute ``0`` rather than NaN.
    """
    scores = np.asarray(fico_scores)
    outcomes = np.asarray(defaults)
    last_bucket = len(bucket_boundaries) - 2

    log_likelihood = 0.0
    for i in range(len(bucket_boundaries) - 1):
        lower, upper = bucket_boundaries[i], bucket_boundaries[i + 1]

        # Build the membership mask once and reuse it for both counts.
        if i == last_bucket:
            in_bucket = (scores >= lower) & (scores <= upper)
        else:
            in_bucket = (scores >= lower) & (scores < upper)

        n_i = int(in_bucket.sum())      # number of records in the ith bucket
        k_i = outcomes[in_bucket].sum() # number of defaults in the ith bucket

        # When p_i is 0 or 1 (or the bucket is empty) the term is 0 in the
        # limit, but evaluating 0 * log(0) numerically yields NaN, so such
        # buckets are skipped.
        if 0 < k_i < n_i:
            p_i = k_i / n_i  # pd of the ith bucket
            log_likelihood += k_i * np.log(p_i) + (n_i - k_i) * np.log(1 - p_i)

    return log_likelihood


# Optimise Buckets 

In [58]:
def optimize(fico_scores, defaults, num_buckets, num_iterations=100, random_state=None):
    """Random-search for bucket boundaries that maximise the log-likelihood.

    The search starts from ``num_buckets`` equal-width buckets spanning the
    observed score range, then repeatedly draws random interior boundaries and
    keeps whichever candidate scores highest under ``calculate_ll``.

    Parameters
    ----------
    fico_scores : pandas.Series or numpy.ndarray
        One score per record; must provide ``.min()`` and ``.max()``.
    defaults : pandas.Series or numpy.ndarray
        One value per record, passed through to ``calculate_ll``.
    num_buckets : int
        Number of buckets to produce (at least 1).
    num_iterations : int, optional
        Number of random candidates to evaluate. Defaults to 100.
    random_state : int or numpy.random.Generator, optional
        Seed or generator for reproducible candidates. When ``None`` the
        candidates are drawn from NumPy's global random state.

    Returns
    -------
    numpy.ndarray
        ``num_buckets + 1`` ascending boundaries; the first and last are the
        minimum and maximum observed score.

    Raises
    ------
    ValueError
        If ``num_buckets`` is smaller than 1.
    """
    if num_buckets < 1:
        raise ValueError("num_buckets must be at least 1")

    # With no seed, fall back to the module-level generator so that callers
    # relying on np.random.seed(...) keep getting the same draws.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # The score range does not change during the search; compute it once.
    lowest, highest = fico_scores.min(), fico_scores.max()

    # Starting point: num_buckets + 1 equidistant boundaries.
    best_boundaries = np.linspace(lowest, highest, num_buckets + 1)
    best_log_likelihood = calculate_ll(fico_scores, defaults, best_boundaries)

    for _ in range(num_iterations):
        # Random interior boundaries, sorted, with the range end points added
        # so the candidate covers every score.
        interior = np.sort(rng.uniform(lowest, highest, num_buckets - 1))
        new_boundaries = np.concatenate(([lowest], interior, [highest]))

        new_log_likelihood = calculate_ll(fico_scores, defaults, new_boundaries)

        # A NaN score carries no information: never accept a NaN candidate.
        if np.isnan(new_log_likelihood):
            continue

        # `x > NaN` is always False, so a NaN incumbent would otherwise block
        # every later candidate; replace it with the first finite score.
        if np.isnan(best_log_likelihood) or new_log_likelihood > best_log_likelihood:
            best_log_likelihood = new_log_likelihood
            best_boundaries = new_boundaries

    return best_boundaries


# Parameters and Results

In [59]:
# Number of FICO rating buckets to fit.
num_buckets = 5
# NOTE: optimize() draws random candidate boundaries, so the result changes
# between runs unless NumPy's global random state is seeded beforehand.
optimal_boundaries = optimize(fico_scores, defaults, num_buckets)
optimal_boundaries = np.round(optimal_boundaries, 2)

# pd.cut builds (a, b] intervals by default, which would leave records at the
# minimum score (the first bin edge) without a bucket (NaN).
# include_lowest=True closes the first interval on the left to keep them.
loan_df['fico_bucket'] = pd.cut(
    fico_scores,
    bins=optimal_boundaries,
    labels=range(1, num_buckets + 1),
    include_lowest=True,
)

print("Optimal Bucket Boundaries:", optimal_boundaries)
# The summary below names exactly five ratings, i.e. it assumes num_buckets == 5.
print(f'\nThis can be interpreted as:\nVery Poor: {optimal_boundaries[0]} - {optimal_boundaries[1]}\nPoor: {optimal_boundaries[1]} - {optimal_boundaries[2]}\nAverage: {optimal_boundaries[2]} - {optimal_boundaries[3]}\nGood: {optimal_boundaries[3]} - {optimal_boundaries[4]}\nVery Good: {optimal_boundaries[4]} - {optimal_boundaries[5]}')

Optimal Bucket Boundaries: [408.   521.65 571.   636.77 688.89 850.  ]

This can be interpreted as:
Very Poor: 408.0 - 521.65
Poor: 521.65 - 571.0
Average: 571.0 - 636.77
Good: 636.77 - 688.89
Very Good: 688.89 - 850.0
