# A08 Bootstrapping

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Read Default.csv from the working directory into a DataFrame
# (columns shown in the preview: default, student, balance, income).
data = pd.read_csv('Default.csv')

In [5]:
# Bare expression: the notebook renders a truncated head/tail preview of the frame.
data

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.134700
2,No,No,1073.549164,31767.138950
3,No,No,529.250605,35704.493940
4,No,No,785.655883,38463.495880
...,...,...,...,...
9995,No,No,711.555020,52992.378910
9996,No,No,757.962918,19660.721770
9997,No,No,845.411989,58636.156980
9998,No,No,1569.009053,36669.112360


In [6]:
# Store the 'default' label column as a pandas categorical.
# Note: the bootstrap cell further down re-reads the CSV, so this cast
# does not carry over to it.
data['default'] = data['default'].astype("category")

In [7]:
# Store the 'student' flag column as a pandas categorical.
# Note: the bootstrap cell further down re-reads the CSV, so this cast
# does not carry over to it.
data['student'] = data['student'].astype("category")

In [14]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.base import clone


# Re-read the CSV; this replaces the `data` frame built by the earlier cells
# (including their category casts).
data = pd.read_csv('Default.csv')


# Independent copy that serves as the resampling source for the bootstrap.
df_boot = data.copy(deep=True)

# Binary target (1 where default == 'Yes') and a one-column design matrix.
y = (data['default'] == 'Yes').astype(int).values
X = data[['balance']].values

# Reference fit on the full sample; bootstrap replicates are compared to it.
base_model = LogisticRegression(max_iter=1000)
base_model.fit(X, y)

# intercept_ is a length-1 array here, so index it before converting:
# float() on an array with ndim > 0 is deprecated since NumPy 1.25.
# This also matches how the bootstrap loop reads m.intercept_[0].
b0_orig = float(base_model.intercept_[0])
b1_orig = float(base_model.coef_[0, 0])

print("Modelo original")
print(f"  β0 (intercepto): {b0_orig:.6f}")
print(f"  β1 (balance)  : {b1_orig:.8f}")


# Bootstrap settings: number of replicates and rows drawn per replicate.
n_boot = 1000
n = len(df_boot)

# One slot per replicate for the refitted intercept and slope.
b0s = np.empty(n_boot)
b1s = np.empty(n_boot)

for seed in range(n_boot):
    # Draw n rows with replacement; seeding with the replicate index makes
    # every draw repeatable.
    boot_df = resample(df_boot, replace=True, n_samples=n, random_state=seed)

    # Refit an unfitted copy of the base estimator on the resampled rows.
    boot_model = clone(base_model)
    boot_model.fit(
        boot_df[['balance']].values,
        (boot_df['default'] == 'Yes').astype(int).values,
    )

    b0s[seed] = boot_model.intercept_[0]
    b1s[seed] = boot_model.coef_[0, 0]

def resumen(arr, alpha=0.05):
    """Summarise a set of bootstrap estimates.

    Parameters
    ----------
    arr : numpy.ndarray
        Estimates to summarise (one value per replicate).
    alpha : float, default 0.05
        Total tail mass left outside the percentile interval.

    Returns
    -------
    pandas.Series
        Entries 'media' (mean), 'desv_std' (sample standard deviation,
        ddof=1), 'IC95_inf' and 'IC95_sup' (lower/upper percentile bounds).
        The 'IC95' labels are fixed even when ``alpha`` is not 0.05.
    """
    lower_pct = 100*(alpha/2)
    upper_pct = 100*(1 - alpha/2)

    labels = ['media', 'desv_std', 'IC95_inf', 'IC95_sup']
    values = [
        arr.mean(),
        arr.std(ddof=1),
        np.percentile(arr, lower_pct),
        np.percentile(arr, upper_pct),
    ]
    return pd.Series(dict(zip(labels, values)))

# Per-coefficient bootstrap statistics.
res_b0 = resumen(b0s)
res_b1 = resumen(b1s)

# Maps each statistic returned by resumen() to its column name in the table.
stat_columns = {
    'media': 'media_boot',
    'desv_std': 'desv_std_boot',
    'IC95_inf': 'IC95_inf',
    'IC95_sup': 'IC95_sup',
}

# One row per coefficient: bootstrap statistics next to the full-sample fit.
summary = pd.DataFrame({
    'parametro': ['β0_intercepto', 'β1_balance'],
    **{
        column: [res_b0[stat], res_b1[stat]]
        for stat, column in stat_columns.items()
    },
    'valor_original': [b0_orig, b1_orig],
})

# Flag whether the full-sample estimate lies inside the percentile interval
# (bounds inclusive).
above_lower = summary['valor_original'] >= summary['IC95_inf']
below_upper = summary['valor_original'] <= summary['IC95_sup']
summary['original_en_IC95'] = above_lower & below_upper

print("\nBootstrap (1000) de coeficientes logísticos usando la COPIA del DF:")
print(summary.round(6))


Modelo original
  β0 (intercepto): -10.651328
  β1 (balance)  : 0.00549892

Bootstrap (1000) de coeficientes logísticos usando la COPIA del DF:
       parametro  media_boot  desv_std_boot   IC95_inf  IC95_sup  \
0  β0_intercepto  -10.681448       0.363339 -11.418449 -9.957325   
1     β1_balance    0.005518       0.000222   0.005067  0.005960   

   valor_original  original_en_IC95  
0      -10.651328              True  
1        0.005499              True  


In [15]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.utils import resample

# Load Advertising.csv; 'TV' is the only predictor and 'sales' the response.
data = pd.read_csv("Advertising.csv")

y = data['sales'].values
X = data[['TV']].values

# Reference least-squares fit on the full sample.
modelo = LinearRegression()
modelo.fit(X, y)

b0_orig, b1_orig = float(modelo.intercept_), float(modelo.coef_[0])

print(
    "Modelo original:",
    f"  β₀ (intercepto): {b0_orig:.6f}",
    f"  β₁ (pendiente): {b1_orig:.6f}",
    sep="\n",
)


# Number of bootstrap replicates.
n_boot = 1000

# Fit one model per replicate; each replicate resamples (X, y) pairs with
# replacement, seeded by its own index so the draws are repeatable.
fits = [
    LinearRegression().fit(*resample(X, y, replace=True, random_state=seed))
    for seed in range(n_boot)
]

# Collect the intercept and the single slope from every refit.
b0_list = [fit.intercept_ for fit in fits]
b1_list = [fit.coef_[0] for fit in fits]

b0 = np.array(b0_list)
b1 = np.array(b1_list)


def resumen(param):
    """Summarise bootstrap estimates of one parameter.

    Note: this redefines the ``resumen`` from the earlier logistic cell with a
    different signature and return type.

    Parameters
    ----------
    param : array-like
        Estimates to summarise (one value per replicate).

    Returns
    -------
    dict
        Keys 'media' (mean), 'desviacion_std' (sample standard deviation,
        ddof=1), 'IC95_inf' (2.5th percentile) and 'IC95_sup' (97.5th
        percentile).
    """
    stats = {}
    stats['media'] = np.mean(param)
    stats['desviacion_std'] = np.std(param, ddof=1)
    # Percentile bounds of the central 95% interval.
    for key, pct in (('IC95_inf', 2.5), ('IC95_sup', 97.5)):
        stats[key] = np.percentile(param, pct)
    return stats

# Summaries for the intercept and the slope replicates.
resumen_b0 = resumen(b0)
resumen_b1 = resumen(b1)

# Print each heading followed by its summary dict.
for heading, stats in (
    ("\nResultados Bootstrap β₀ (intercepto):", resumen_b0),
    ("\nResultados Bootstrap β₁ (pendiente):", resumen_b1),
):
    print(heading)
    print(stats)


Modelo original:
  β₀ (intercepto): 7.032594
  β₁ (pendiente): 0.047537

Resultados Bootstrap β₀ (intercepto):
{'media': 7.048023249032241, 'desviacion_std': 0.3311766638642434, 'IC95_inf': 6.386966094034086, 'IC95_sup': 7.663651505592669}

Resultados Bootstrap β₁ (pendiente):
{'media': 0.04746536224946573, 'desviacion_std': 0.002855424922601339, 'IC95_inf': 0.042062135563057185, 'IC95_sup': 0.05303737811885239}


In [16]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV


# Load Advertising.csv; 'TV' is the only predictor and 'sales' the response.
data = pd.read_csv("Advertising.csv")


X = data[['TV']].values
y = data['sales'].values


# Candidate penalties: 100 log-spaced values from 1e-4 to 1e4, compared by
# 5-fold cross-validation using the estimator's default score.
alphas = np.logspace(-4, 4, 100)
ridge = Ridge()
grid = GridSearchCV(ridge, param_grid={'alpha': alphas}, cv=5)
grid.fit(X, y)

best_alpha = grid.best_params_['alpha']
print(f"Mejor valor de alpha encontrado: {best_alpha:.6f}")

# A winner sitting on the first or last grid point means the search never
# bracketed the optimum, so the chosen alpha is dictated by the grid limits
# rather than by the data. Surface that instead of accepting it silently.
if best_alpha in (alphas[0], alphas[-1]):
    import warnings

    warnings.warn(
        f"El mejor alpha ({best_alpha:g}) está en el borde de la rejilla "
        f"[{alphas[0]:g}, {alphas[-1]:g}]; amplíe el rango de búsqueda o "
        "estandarice los predictores antes de interpretar el resultado."
    )


# Number of bootstrap replicates.
n_boot = 1000

b0_list, b1_list = [], []

for seed in range(n_boot):
    # Resample (X, y) pairs with replacement; the seed is the replicate index.
    X_boot, y_boot = resample(X, y, replace=True, random_state=seed)

    # The penalty stays fixed at the value chosen on the full sample; it is
    # not re-tuned inside each replicate.
    fitted = Ridge(alpha=best_alpha).fit(X_boot, y_boot)

    b0_list.append(fitted.intercept_)
    b1_list.append(fitted.coef_[0])


b0 = np.array(b0_list)
b1 = np.array(b1_list)


# Sample standard deviation (ddof=1) of each coefficient across replicates,
# i.e. the bootstrap standard error.
std_b0, std_b1 = (np.std(draws, ddof=1) for draws in (b0, b1))

print(f"\nDesviación estándar de β₀ (intercepto): {std_b0:.6f}")
print(f"Desviación estándar de β₁ (pendiente):  {std_b1:.6f}")


Mejor valor de alpha encontrado: 10000.000000

Desviación estándar de β₀ (intercepto): 0.329753
Desviación estándar de β₁ (pendiente):  0.002838
