In [2]:
import pandas as pd
from math import log
import numpy as np

# Load the loan records from a CSV file into a DataFrame
df = pd.read_csv('Task3n4_Loan_Data.csv')

# FICO scores are stored at index (score - MIN_FICO): index 0 is a score of
# 300 and index NUM_SCORES - 1 is a score of 850.
MIN_FICO = 300
MAX_FICO = 850
NUM_SCORES = MAX_FICO - MIN_FICO + 1

# Extract the 'default' and 'fico_score' columns as lists
x = df['default'].to_list()  # 1 if defaulted, 0 otherwise
y = df['fico_score'].to_list()  # FICO scores
n = len(x)  # Number of records

# Print lengths of the lists to ensure they are of the same length
print(len(x), len(y))

# Work with integer scores so they can be used as list indices
y = [int(score) for score in y]

# Per-score counts of defaults and of all records (made cumulative below)
default = [0] * NUM_SCORES
total = [0] * NUM_SCORES

for defaulted, score in zip(x, y):
    # Reject out-of-range scores explicitly: a score below MIN_FICO would
    # produce a negative index, which Python silently wraps to the end of
    # the list instead of failing.
    if not MIN_FICO <= score <= MAX_FICO:
        raise ValueError(
            f"FICO score {score} is outside the supported range "
            f"{MIN_FICO}-{MAX_FICO}"
        )
    default[score - MIN_FICO] += defaulted
    total[score - MIN_FICO] += 1

# Turn the per-score counts into running totals, so that the counts for any
# score range can be obtained as the difference of two entries.
for i in range(1, NUM_SCORES):
    default[i] += default[i - 1]  # Defaults with score <= MIN_FICO + i
    total[i] += total[i - 1]  # Records with score <= MIN_FICO + i
    
def log_likelihood(n, k):
    """Return the log-likelihood of k defaults among n observations.

    The default probability is taken to be the observed rate p = k / n, and
    the value returned is k * log(p) + (n - k) * log(1 - p).

    Args:
        n: Number of observations in the bucket.
        k: Number of defaults among those observations.

    Returns:
        The log-likelihood, or 0 when the bucket is empty or when p is
        exactly 0 or 1 (those cases are returned directly so that log(0)
        is never evaluated).
    """
    # An empty bucket has no observations to explain; returning 0 here also
    # avoids dividing by zero below.
    if n == 0:
        return 0
    p = k / n
    if p == 0 or p == 1:
        return 0
    return k * np.log(p) + (n - k) * np.log(1 - p)

# Number of buckets to split the FICO score range into
r = 10

# Scores are addressed by index (score - 300); indices 0..550 cover 300..850.
lowest_score = 300
num_scores = 551

# Sentinel log-likelihood marking a state that cannot be reached
unreachable = -10**18

# dp[i][j] = [best total log-likelihood when indices 0..j are split into i
#             buckets, index at which the previous bucket ends]
dp = [[[unreachable, 0] for _ in range(num_scores)] for _ in range(r + 1)]

# Row 0 (no buckets) is set to 0 to keep the table layout; the recurrence
# below starts from the one-bucket row and never reads it.
for j in range(num_scores):
    dp[0][j][0] = 0

# One bucket: indices 0..j form a single bucket. It is only a valid state
# when that bucket actually contains records.
if r >= 1:
    for j in range(num_scores):
        if total[j] > 0:
            dp[1][j][0] = log_likelihood(total[j], default[j])

# bucket_ll[j][k] is the log-likelihood of the bucket holding indices
# k+1..j. It does not depend on the number of buckets, so compute it once
# and reuse it for every row. None marks a bucket with no records.
bucket_ll = [[None] * j for j in range(num_scores)]
for j in range(num_scores):
    for k in range(j):
        if total[j] != total[k]:
            bucket_ll[j][k] = log_likelihood(total[j] - total[k],
                                             default[j] - default[k])

# Two or more buckets: the last bucket is k+1..j and the first i-1 buckets
# cover 0..k.
for i in range(2, r + 1):
    for j in range(num_scores):
        best = dp[i][j]
        for k in range(j):
            last_bucket = bucket_ll[j][k]
            previous = dp[i - 1][k][0]
            # Skip empty last buckets and prefixes that cannot be split
            # into i-1 non-empty buckets.
            if last_bucket is None or previous == unreachable:
                continue
            potential_value = last_bucket + previous
            if best[0] < potential_value:
                best[0] = potential_value
                best[1] = k  # Record where the previous bucket ends

# Print the best log-likelihood value found
print(round(dp[r][num_scores - 1][0], 4))

# Trace back from the highest score, collecting the upper edge of each
# bucket, then the lower end of the lowest bucket. `r` is left untouched.
k = num_scores - 1
l = []
for i in range(r, 0, -1):
    l.append(k + lowest_score)  # Store the FICO score value
    k = dp[i][k][1]  # Move to where the previous bucket ends
l.append(k + lowest_score)

# Print the list of partition points
print(l)

10000 10000
-4217.8245
[850, 753, 752, 732, 696, 649, 611, 580, 552, 520, 300]
