In [36]:
import numpy as np
import pandas as pd
# Seed the legacy global RNG so the synthetic portfolio is reproducible.
np.random.seed(13)

N_LOANS = 400
RATING_LABELS = ["A", "B", "C", "D", "E"]

# Risk scores drawn uniformly from [0, 1) and sorted ascending.
risk_score = np.sort(np.random.uniform(0, 1, N_LOANS))

# One Bernoulli draw per loan: a loan defaults (1) when its risk score exceeds
# a fresh uniform draw, i.e. with probability equal to its risk score.
# The batched draw consumes the seeded stream in the same order as drawing
# one value per loan.
p_default = (risk_score > np.random.rand(N_LOANS)).astype(int)

# Five equal-width score buckets labelled A (lowest scores) to E (highest).
ratings = pd.cut(risk_score, bins=len(RATING_LABELS), labels=RATING_LABELS)

# Build the frame column-wise so each column keeps its natural dtype.
# (Stacking the columns as rows and transposing would coerce every column
# to ``object``.) Ratings are stored as plain labels.
data = pd.DataFrame(
    {
        "risk_score": risk_score,
        "p_default": p_default,
        "ratings": ratings.astype(object),
    }
)

data

Unnamed: 0,risk_score,p_default,ratings
0,0.000012,0,A
1,0.001573,0,A
2,0.005122,0,A
3,0.008716,0,A
4,0.009413,0,A
...,...,...,...
395,0.984938,1,E
396,0.98918,1,E
397,0.991371,1,E
398,0.994132,1,E


In [37]:
# Per-rating summary: number of loans, mean risk score and observed default rate.
(
    data
    .groupby("ratings")
    .agg(
        count_loans=pd.NamedAgg(column="ratings", aggfunc="count"),
        avg_risk_score=pd.NamedAgg(column="risk_score", aggfunc="mean"),
        avg_p_default=pd.NamedAgg(column="p_default", aggfunc="mean"),
    )
)

Unnamed: 0_level_0,count_loans,avg_risk_score,avg_p_default
ratings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,96,0.102731,0.114583
B,94,0.286594,0.276596
C,58,0.505447,0.482759
D,73,0.699032,0.780822
E,79,0.896711,0.949367


In [50]:
def bootstrap_confidence_interval(data, num_bootstrap_samples=50, confidence_level=0.95):
    """
    Estimate a percentile-bootstrap confidence interval for the mean of the data.

    The data are resampled with replacement ``num_bootstrap_samples`` times
    (each resample has the same length as ``data``), the mean of every
    resample is recorded, and the interval is read off the empirical
    percentiles of those means.

    Randomness comes from NumPy's global legacy RNG, so results are
    reproducible after ``np.random.seed(...)``.

    Parameters:
    data (array-like): One-dimensional data to bootstrap.
    num_bootstrap_samples (int): The number of bootstrap resamples to draw (>= 1).
    confidence_level (float): The confidence level for the interval,
        strictly between 0 and 1 (e.g. 0.95 for a 95% interval).

    Returns:
    tuple: Lower and upper bounds of the confidence interval. ``(nan, nan)``
        is returned when ``data`` is empty, since no mean can be estimated.

    Raises:
    ValueError: If ``confidence_level`` is not in the open interval (0, 1)
        or ``num_bootstrap_samples`` is smaller than 1.
    """
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1.")
    if num_bootstrap_samples < 1:
        raise ValueError("num_bootstrap_samples must be at least 1.")

    values = np.asarray(data)
    n = len(values)

    # Nothing to resample: report an undefined interval explicitly instead of
    # letting NumPy emit empty-slice warnings on the way to the same answer.
    if n == 0:
        return np.nan, np.nan

    # One resample per iteration (sampling with replacement); the draw order is
    # kept sequential so seeded runs stay reproducible.
    bootstrap_means = [
        np.mean(np.random.choice(values, size=n, replace=True))
        for _ in range(num_bootstrap_samples)
    ]

    # Symmetric tails: e.g. the 2.5th and 97.5th percentiles for a 95% interval.
    lower_bound = np.percentile(bootstrap_means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(bootstrap_means, (1 + confidence_level) / 2 * 100)

    return lower_bound, upper_bound


# For every rating bucket, compare two 95% intervals for the observed default
# rate: the percentile bootstrap and the normal approximation for a Bernoulli
# proportion (mean +/- 1.96 standard errors).
for rating in ("A", "B", "C", "D", "E"):
    in_bucket = data["ratings"] == rating
    defaults = data.loc[in_bucket, "p_default"].astype(int)

    boot_low, boot_high = bootstrap_confidence_interval(defaults, num_bootstrap_samples=2500)

    # Standard error of a Bernoulli mean is sqrt(p * (1 - p) / n).
    default_rate = np.mean(defaults)
    half_width = 1.96 * np.sqrt((default_rate * (1 - default_rate)) / len(defaults))
    normal_low = default_rate - half_width
    normal_high = default_rate + half_width

    print(f"Bootstrap - 95% confidence interval for the mean of rating {rating}: ({boot_low:.4f}, {boot_high:.4f})")
    print(f"Bernoulli - 95% confidence interval for the mean of rating {rating}: ({normal_low:.4f}, {normal_high:.4f})")
    print("=" * 50)


Bootstrap - 95% confidence interval for the mean of rating A: (0.0521, 0.1875)
Bernoulli - 95% confidence interval for the mean of rating A: (0.0509, 0.1783)
Bootstrap - 95% confidence interval for the mean of rating B: (0.1915, 0.3723)
Bernoulli - 95% confidence interval for the mean of rating B: (0.1862, 0.3670)
Bootstrap - 95% confidence interval for the mean of rating C: (0.3621, 0.6034)
Bernoulli - 95% confidence interval for the mean of rating C: (0.3542, 0.6114)
Bootstrap - 95% confidence interval for the mean of rating D: (0.6849, 0.8767)
Bernoulli - 95% confidence interval for the mean of rating D: (0.6859, 0.8757)
Bootstrap - 95% confidence interval for the mean of rating E: (0.8987, 0.9873)
Bernoulli - 95% confidence interval for the mean of rating E: (0.9010, 0.9977)
