In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv('Task 3 and 4_Loan_Data.csv')

# Extract FICO scores and default status
fico_scores = data['fico_score'].values.reshape(-1, 1)
defaults = data['default'].values

# Define the number of buckets
num_buckets = 5

# Use KMeans to find bucket boundaries
kmeans = KMeans(n_clusters=num_buckets, random_state=0)
data['fico_bucket'] = kmeans.fit_predict(fico_scores)

# Calculate MSE for each bucket
bucket_centers = kmeans.cluster_centers_.flatten()
mse = mean_squared_error(fico_scores, [bucket_centers[b] for b in data['fico_bucket']])

print("Bucket Centers:", bucket_centers)
print("Mean Squared Error:", mse)

# Calculate log-likelihood
log_likelihood = 0
for i in range(num_buckets):
    bucket_data = data[data['fico_bucket'] == i]
    ni = len(bucket_data)
    ki = bucket_data['default'].sum()
    pi = ki / ni if ni > 0 else 0
    
    if ni > 0 and pi > 0 and pi < 1:
        log_likelihood += ki * np.log(pi) + (ni - ki) * np.log(1 - pi)

print("Log-Likelihood:", log_likelihood)

# Map FICO scores to ratings (lower rating signifies better credit score)
data['fico_rating'] = data['fico_bucket'].apply(lambda x: num_buckets - x)

# Display sample mapping
print(data[['fico_score', 'fico_bucket', 'fico_rating']].head())

Bucket Centers: [628.66990291 674.98504741 732.81713463 520.55672823 581.19868173]
Mean Squared Error: 298.5909680406373
Log-Likelihood: -4274.599983972711
   fico_score  fico_bucket  fico_rating
0         605            0            5
1         572            4            1
2         602            4            1
3         612            0            5
4         631            0            5
