In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTENC
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings('ignore')


df = pd.read_csv('BANK LOAN.csv')
# Assume df already exists
print("Dataset shape:", df.shape)
print(df.head())

Dataset shape: (700, 8)
   SN  AGE  EMPLOY  ADDRESS  DEBTINC  CREDDEBT  OTHDEBT  DEFAULTER
0   1    3      17       12      9.3     11.36     5.01          1
1   2    1      10        6     17.3      1.36     4.00          0
2   3    2      15       14      5.5      0.86     2.17          0
3   4    3      15       14      2.9      2.66     0.82          0
4   5    1       2        0     17.3      1.79     3.06          1


In [2]:
# Simple random sample - 20% of dataset
srs_sample = df.sample(frac=0.20, random_state=42)
print("Sample size:", len(srs_sample))
srs_sample.head()

Sample size: 140


Unnamed: 0,SN,AGE,EMPLOY,ADDRESS,DEBTINC,CREDDEBT,OTHDEBT,DEFAULTER
158,159,2,9,4,13.8,1.35,2.65,1
500,501,1,0,1,7.7,0.48,0.91,0
396,397,2,14,8,17.0,1.14,3.62,0
155,156,2,13,3,19.2,2.8,4.69,0
321,322,1,3,4,11.0,0.29,4.66,1


In [3]:
# Stratify by DEFAULTER
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['DEFAULTER'], random_state=42)

print("Overall DEFAULTER ratio:\n", df['DEFAULTER'].value_counts(normalize=True))
print("Train ratio:\n", train_df['DEFAULTER'].value_counts(normalize=True))
print("Test ratio:\n", test_df['DEFAULTER'].value_counts(normalize=True))

Overall DEFAULTER ratio:
 DEFAULTER
0    0.738571
1    0.261429
Name: proportion, dtype: float64
Train ratio:
 DEFAULTER
0    0.739286
1    0.260714
Name: proportion, dtype: float64
Test ratio:
 DEFAULTER
0    0.735714
1    0.264286
Name: proportion, dtype: float64


In [4]:
# Stratify by AGE
train_by_age, test_by_age = train_test_split(df, test_size=0.2, stratify=df['AGE'], random_state=42)

print("Overall AGE distribution:\n", df['AGE'].value_counts(normalize=True))
print("Train AGE distribution:\n", train_by_age['AGE'].value_counts(normalize=True))

print("Test AGE distribution:\n", test_by_age['AGE'].value_counts(normalize=True))

Overall AGE distribution:
 AGE
2    0.405714
1    0.345714
3    0.248571
Name: proportion, dtype: float64
Train AGE distribution:
 AGE
2    0.405357
1    0.346429
3    0.248214
Name: proportion, dtype: float64
Test AGE distribution:
 AGE
2    0.407143
1    0.342857
3    0.250000
Name: proportion, dtype: float64


In [5]:
# Create the STRATUM column on the original dataframe
df['STRATUM'] = df['DEFAULTER'].astype(str) + '_' + df['AGE'].astype(str)

print("Combined STRATUM distribution in original data:")
print(df['STRATUM'].value_counts(normalize=True).head())

# Stratify by both DEFAULTER and AGE
train_both, test_both = train_test_split(df, test_size=0.2, stratify=df['STRATUM'], random_state=42)

print("Combined STRATUM distribution in train:")
print(train_both['STRATUM'].value_counts(normalize=True).head())

print("Combined STRATUM distribution in test:")
print(test_both['STRATUM'].value_counts(normalize=True).head())

Combined STRATUM distribution in original data:
STRATUM
0_2    0.318571
0_1    0.222857
0_3    0.197143
1_1    0.122857
1_2    0.087143
Name: proportion, dtype: float64
Combined STRATUM distribution in train:
STRATUM
0_2    0.317857
0_1    0.223214
0_3    0.196429
1_1    0.123214
1_2    0.087500
Name: proportion, dtype: float64
Combined STRATUM distribution in test:
STRATUM
0_2    0.321429
0_1    0.221429
0_3    0.200000
1_1    0.121429
1_2    0.085714
Name: proportion, dtype: float64


In [6]:
X = df.drop(columns=['SN', 'DEFAULTER', 'STRATUM'], errors='ignore')
y = df['DEFAULTER']

# Random Over-Sampling
ros = RandomOverSampler(random_state=42)
X_over, y_over = ros.fit_resample(X, y)

# Random Under-Sampling
rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X, y)

print("Before:\n", y.value_counts())
print("After Over-Sampling:\n", pd.Series(y_over).value_counts())
print("After Under-Sampling:\n", pd.Series(y_under).value_counts())

Before:
 DEFAULTER
0    517
1    183
Name: count, dtype: int64
After Over-Sampling:
 DEFAULTER
1    517
0    517
Name: count, dtype: int64
After Under-Sampling:
 DEFAULTER
0    183
1    183
Name: count, dtype: int64


In [12]:
#Creates 10 samples of size 700 each with replacement
n_boot = 10
boot_means = []

for i in range(n_boot):
    sample = df.sample(frac=1.0, replace=True)
    print(sample.shape)
    boot_means.append(sample['DEBTINC'].mean())
boot_means

(700, 9)
(700, 9)
(700, 9)
(700, 9)
(700, 9)
(700, 9)
(700, 9)
(700, 9)
(700, 9)
(700, 9)


[np.float64(10.988999999999999),
 np.float64(10.12557142857143),
 np.float64(10.197285714285714),
 np.float64(10.249142857142857),
 np.float64(10.134142857142859),
 np.float64(10.198571428571428),
 np.float64(10.270142857142858),
 np.float64(10.232857142857142),
 np.float64(10.541714285714285),
 np.float64(10.706857142857142)]