In [None]:
!pip install ISLP

Collecting ISLP
  Downloading ISLP-0.4.0-py3-none-any.whl.metadata (7.0 kB)
Collecting lifelines (from ISLP)
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Collecting pygam (from ISLP)
  Downloading pygam-0.9.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pytorch-lightning (from ISLP)
  Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics (from ISLP)
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting autograd-gamma>=0.3 (from lifelines->ISLP)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines->ISLP)
  Downloading formulaic-1.1.1-py3-none-any.whl.metadata (6.9 kB)
Collecting scipy>=0.9 (from ISLP)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m3.5 MB/s[0m eta

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize)
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
     (LinearDiscriminantAnalysis as LDA,
     QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from functools import partial
from sklearn.model_selection import cross_validate, KFold, ShuffleSplit
from sklearn.base import clone
from ISLP.models import sklearn_sm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
df = load_data('Default') #Page 29 Name of datasets
df.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   default  10000 non-null  category
 1   student  10000 non-null  category
 2   balance  10000 non-null  float64 
 3   income   10000 non-null  float64 
dtypes: category(2), float64(2)
memory usage: 176.1 KB


In [None]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
balance,10000.0,835.374886,483.714985,0.0,481.731105,823.636973,1166.308386,2654.322576
income,10000.0,33516.981876,13336.639563,771.967729,21340.462903,34552.644802,43807.729272,73554.233495


In [None]:
df['default'].value_counts()

Unnamed: 0_level_0,count
default,Unnamed: 1_level_1
No,9667
Yes,333


In [None]:
df.isna().sum()

Unnamed: 0,0
default,0
student,0
balance,0
income,0


In [None]:
df.duplicated().sum()

0

In [None]:
df['student'] = np.where(df['student'] == 'Yes', 1, 0)
df['default'].head()

Unnamed: 0,default
0,No
1,No
2,No
3,No
4,No


In [None]:

X = df[['income', 'balance']]
y = (df['default'] == 'Yes').astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2025, test_size = 0.2)

print(f'The shape of the train is {X_train.shape}')
print(f'The shape of the test is {X_test.shape}')


The shape of the train is (8000, 2)
The shape of the test is (2000, 2)


In [None]:
def model_train(X_train, y_train):
    model = sm.GLM(y_train, X_train, family = sm.families.Binomial()).fit()
    print(summarize(model))
    return model

logistic = model_train(X_train, y_train)


           coef   std err       z  P>|z|
income  -0.0001  0.000004 -30.234    0.0
balance  0.0004  0.000079   4.813    0.0


# Validation Approach

In [None]:
#1
X_train2, X_val , y_train2, y_val = train_test_split(X_train, y_train, random_state = 2025, test_size = 0.25)
print(f'The shape of the train is {X_train.shape}')
print(f'The shape of the validation is {X_val.shape}')

#2
model2 = model_train(X_train2, y_train2)

#3
model2_proba = model2.predict(X_val)
model2_preds = (model2_proba > 0.5).astype(int)

conf_m = confusion_matrix(y_val, model2_preds)
print(conf_m)

#4
validation_error = (1 - accuracy_score(y_val, model2_preds)) * 100
print(f'the validation score is: {validation_error:.2f} %')

The shape of the train is (8000, 2)
The shape of the validation is (2000, 2)
           coef   std err       z  P>|z|
income  -0.0001  0.000005 -25.798    0.0
balance  0.0004  0.000093   3.831    0.0
[[1928    0]
 [  72    0]]
the validation score is: 3.60 %


# Validation 3 times

In [None]:
from sklearn.model_selection import KFold

accuracy = []
for i in range(1,4):
    X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=i)

    model = sm.GLM(y_train2, X_train2, family = sm.families.Binomial()).fit()
    model_proba = model.predict(X_val)
    model_preds = (model_proba > 0.5).astype(int)

    accuracy.append(accuracy_score(y_val, model_preds))

accuracy_mean = np.mean(accuracy)
print(f'The accuracy obtained in {i} iteration was: {accuracy_mean:.4f} ')
print(f'The error rate was: {((1 - accuracy_mean) * 100):.2f}%')

The accuracy obtained in 3 iteration was: 0.9662 
The error rate was: 3.38%


In [None]:
dummies = pd.get_dummies(df['student'], drop_first=True)
dummies = dummies.astype(int)
dummies.rename(columns = {1: 'isStudent'}, inplace = True)



In [None]:
dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   isStudent  10000 non-null  int64
dtypes: int64(1)
memory usage: 78.2 KB


In [None]:
X = df[['balance', 'income']]
X = X.join(dummies)
y = (df['default'] == 'Yes').astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2025, test_size = 0.2)
print(f'Train shape: {X_train.shape}, {y_train.shape}')

Train shape: (8000, 3), (8000,)


In [None]:
X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, random_state = 2025, test_size = 0.25)
print(f'Train shape: {X_train2.shape}, {y_train.shape}')
model = model_train(X_train2, y_train2)

Train shape: (6000, 3), (8000,)
             coef   std err       z  P>|z|
balance    0.0028  0.000000  16.839    0.0
income    -0.0002  0.000007 -25.460    0.0
isStudent -3.8411  0.204000 -18.863    0.0


In [None]:
model_proba = model.predict(X_val)
model_preds = (model_proba > 0.5).astype(int)
validation_error = (1 - accuracy_score(y_val, model_preds)) * 100
print(f'the validation score is: {validation_error:.2f} %')


the validation score is: 4.40 %


In [None]:
accuracy = []
for i in range(1,4):
    X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, random_state = i, test_size = 0.25)
    model = model_train(X_train2, y_train2)
    model_proba = model.predict(X_val)
    model_preds = (model_proba > 0.5).astype(int)
    accuracy.append(accuracy_score(y_val, model_preds))

accuracy_mean = np.mean(accuracy)
print(f'The accuracy obtained in {i} iteration was: {accuracy_mean:.4f} ')
print(f'The error rate was: {((1 - accuracy_mean) * 100):.2f}%')


             coef   std err       z  P>|z|
balance    0.0027  0.000000  16.366    0.0
income    -0.0002  0.000007 -25.474    0.0
isStudent -3.6848  0.201000 -18.319    0.0
             coef   std err       z  P>|z|
balance    0.0028  0.000000  17.022    0.0
income    -0.0002  0.000007 -25.687    0.0
isStudent -3.8396  0.203000 -18.901    0.0
             coef   std err       z  P>|z|
balance    0.0027  0.000000  16.526    0.0
income    -0.0002  0.000007 -25.816    0.0
isStudent -3.6796  0.199000 -18.534    0.0
The accuracy obtained in 3 iteration was: 0.9600 
The error rate was: 4.00%


# 6

In [None]:
from sklearn.utils import resample

X = df[['income', 'balance']]
y = (df['default'] == 'Yes').astype(int)
X = sm.add_constant(X)

model = sm.Logit(y, X).fit()
print(model.summary())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2025)
model = model_train(X_train, y_train)



Optimization terminated successfully.
         Current function value: 0.078948
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:                default   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Sat, 11 Jan 2025   Pseudo R-squ.:                  0.4594
Time:                        01:43:58   Log-Likelihood:                -789.48
converged:                       True   LL-Null:                       -1460.3
Covariance Type:            nonrobust   LLR p-value:                4.541e-292
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -11.5405      0.435    -26.544      0.000     -12.393     -10.688
income      2.081e-05   4.99

The standard errors are really small, which mean the coefficients are really stable.


In [None]:
#Manual Bootstrap
def boot_fn(X_train, y_train, indices):
    sample_X = X_train.iloc[indices]
    sample_y = y_train.iloc[indices]
    X_train1 = sm.add_constant(sample_X[['income', 'balance']])
    model = sm.Logit(sample_y, X_train1).fit(disp=False)
    return model

n_boostrap = 1000
coefficients = []

for _ in range(n_boostrap):
    indices = resample(range(len(X_train)), replace = True) #replace = False == Bagging
    model = boot_fn(X_train, y_train, indices)
    coef = model.params
    coefficients.append(coef)

coefficients = np.array(coefficients)
bootstrap_se = coefficients.std(axis=0)

glm_se = model.bse
comparison = pd.DataFrame({
    'Method': ['glm', 'bootstrap'],
    'Intercept_SE': [glm_se['const'], bootstrap_se[0]],
    'Income_SE': [glm_se['income'], bootstrap_se[1]],
    'Balance_SE': [glm_se['balance'], bootstrap_se[2]]
})

print(comparison)



      Method  Intercept_SE  Income_SE  Balance_SE
0        glm      0.465391   0.000006    0.000250
1  bootstrap      0.486918   0.000006    0.000252


Our parameters are super stable since with 1000 iterations we got 1 down perfect and the other deviated by 0.002, this means our model parameters are robust.