In [7]:
import pandas as pd
import numpy as np

-----------Importing dataset--------------

In [9]:
data = pd.read_csv("training_dataset.csv")

-----------Check the Shape of the Dataset------------

In [57]:
print("Shape of dataset:", data.shape)

Shape of dataset: (18267, 23)


----Display Dataset Information-----------------

In [59]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18267 entries, 0 to 18266
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Cust_id                   18267 non-null  int64  
 1   Age                       18267 non-null  int64  
 2   Income_group              18267 non-null  object 
 3   Existing_Customer         18267 non-null  int64  
 4   Occupation_type           18267 non-null  object 
 5   Education_Qualification   18267 non-null  int64  
 6   Dependents                18267 non-null  int64  
 7   Marital_Status            18267 non-null  object 
 8   Gender                    18267 non-null  object 
 9   Debt_Burden_Ratio         18267 non-null  object 
 10  Years_at_current_address  18267 non-null  int64  
 11  Credit_Amount             18267 non-null  int64  
 12  Housing_1_Own_2_Rent      18267 non-null  int64  
 13  Employed_since_yrs        18267 non-null  int64  
 14  Locati

-------------Summary Statistics of the Dataset---------------

In [61]:
print(data.describe())

           Cust_id           Age  Existing_Customer  Education_Qualification  \
count  18267.00000  18267.000000       18267.000000             18267.000000   
mean   49866.81847     40.719823           0.027755                 2.497400   
std    28915.87710      7.824026           0.164274                 1.115292   
min        3.00000     22.000000           0.000000                 1.000000   
25%    24753.00000     36.000000           0.000000                 2.000000   
50%    50057.00000     42.000000           0.000000                 2.000000   
75%    74816.50000     45.000000           0.000000                 3.000000   
max    99994.00000     55.000000           1.000000                 4.000000   

         Dependents  Years_at_current_address  Credit_Amount  \
count  18267.000000              18267.000000   18267.000000   
mean       0.821098                  8.696338  176105.240105   
std        0.675604                  4.315100   72030.759217   
min        0.000000    

-----------Checking for Missing Values-----------------

In [63]:
print(data.isnull().sum())

Cust_id                     0
Age                         0
Income_group                0
Existing_Customer           0
Occupation_type             0
Education_Qualification     0
Dependents                  0
Marital_Status              0
Gender                      0
Debt_Burden_Ratio           0
Years_at_current_address    0
Credit_Amount               0
Housing_1_Own_2_Rent        0
Employed_since_yrs          0
Location                    0
DPD_30_6M                   0
DPD_60_6M                   0
Months_on_book              0
Default_Flag                0
Occupation_Partner          0
Job_Seniority               0
Assets                      0
Savings                     0
dtype: int64


------------Target Variable distribution--------------

In [65]:
print(data['Default_Flag'].value_counts(normalize=True) * 100)

Default_Flag
0    84.425467
1    15.574533
Name: proportion, dtype: float64


-------Explore categorical variable distributions-------------

In [67]:
categorical_vars = data.select_dtypes(include=['object']).columns
for col in categorical_vars:
    print(f"\nVariable: {col}")
    print(data[col].value_counts(normalize=True) * 100)


Variable: Income_group
Income_group
<=100000     39.448185
> 100000     30.191055
< = 50000    13.012536
< = 35000     7.527235
< = 25000     6.574698
< = 15000     3.246291
Name: proportion, dtype: float64

Variable: Occupation_type
Occupation_type
Private          57.612087
Govt             36.634368
Self-Employed     5.288225
Self-employed     0.443423
Unemployed        0.021897
Name: proportion, dtype: float64

Variable: Marital_Status
Marital_Status
Married      51.858543
Single       48.048393
Seperated     0.093064
Name: proportion, dtype: float64

Variable: Gender
Gender
F    50.588493
M    49.411507
Name: proportion, dtype: float64

Variable: Debt_Burden_Ratio
Debt_Burden_Ratio
< = 0.10    59.172278
< = 0.50    24.579843
> 0.50      16.247879
Name: proportion, dtype: float64

Variable: Location
Location
North      35.024908
South      32.840642
Central    32.134450
Name: proportion, dtype: float64

Variable: DPD_30_6M
DPD_30_6M
0    70.953085
1    28.723928
2     0.218974
.  

In [11]:
for col in data.columns:
    # Count how many '.' values are in the column
    dot_count = (data[col] == ".").sum()
    if dot_count > 0:
        print(f"Column: {col} --> {dot_count} occurrences of '.'")

Column: DPD_30_6M --> 19 occurrences of '.'
Column: DPD_60_6M --> 19 occurrences of '.'


In [13]:
for col in data.columns:
    if (data[col] == ".").any():  # only check columns that contain "."
        print(f"\nUnique values in {col}:")
        print(data[col].value_counts(dropna=False))


Unique values in DPD_30_6M:
DPD_30_6M
0    12961
1     5247
2       40
.       19
Name: count, dtype: int64

Unique values in DPD_60_6M:
DPD_60_6M
0    13861
1     4343
2       44
.       19
Name: count, dtype: int64


In [15]:
# Replace all occurrences of "." with NaN
data.replace(".", np.nan, inplace=True)

In [75]:
# Summary of missing values per column
missing_summary = data.isnull().sum()
print("Missing values per variable:")
print(missing_summary)

Missing values per variable:
Cust_id                      0
Age                          0
Income_group                 0
Existing_Customer            0
Occupation_type              0
Education_Qualification      0
Dependents                   0
Marital_Status               0
Gender                       0
Debt_Burden_Ratio            0
Years_at_current_address     0
Credit_Amount                0
Housing_1_Own_2_Rent         0
Employed_since_yrs           0
Location                     0
DPD_30_6M                   19
DPD_60_6M                   19
Months_on_book               0
Default_Flag                 0
Occupation_Partner           0
Job_Seniority                0
Assets                       0
Savings                      0
dtype: int64


In [19]:
df_clean = data.dropna()

In [81]:
print("Any '.' left? ", (df_clean == ".").any().any())
print("Any NaN left? ", df_clean.isnull().any().any())

Any '.' left?  False
Any NaN left?  False


In [21]:
df_clean

Unnamed: 0,Cust_id,Age,Income_group,Existing_Customer,Occupation_type,Education_Qualification,Dependents,Marital_Status,Gender,Debt_Burden_Ratio,...,Employed_since_yrs,Location,DPD_30_6M,DPD_60_6M,Months_on_book,Default_Flag,Occupation_Partner,Job_Seniority,Assets,Savings
0,14898,52,<=100000,0,Private,1,1,Single,M,< = 0.10,...,14,North,0,0,> = 6,0,freelance,20,10000,1.600000
1,34986,34,> 100000,0,Govt,1,1,Married,F,< = 0.10,...,11,Central,0,0,> = 6,0,freelance,5,1,7.662857
2,74167,37,<=100000,0,Govt,2,0,Single,M,< = 0.10,...,10,North,0,0,< 6,0,fixed,1,7700,6.624000
3,80278,43,< = 50000,0,Private,4,0,Single,M,< = 0.50,...,6,South,1,1,< 6,1,fixed,2,55,8.307692
4,2442,35,< = 50000,0,Private,3,1,Married,F,< = 0.10,...,8,South,0,0,> = 6,0,partime,0,0,0.675000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18262,35788,40,<=100000,0,Private,3,1,Single,M,< = 0.10,...,7,Central,0,0,> = 6,0,freelance,3,6000,2.589474
18263,85264,42,< = 50000,0,Govt,1,2,Married,F,< = 0.10,...,6,South,0,0,> = 6,0,fixed,14,4000,2.742857
18264,1481,43,<=100000,0,Private,1,0,Single,F,< = 0.50,...,7,North,1,1,< 6,1,freelance,0,10000,7.140000
18265,25731,37,< = 50000,0,Govt,4,1,Single,M,< = 0.50,...,9,Central,0,0,> = 6,0,fixed,2,2000,1.158621


In [23]:
import pandas as pd
import numpy as np

EPS = 1e-6  # smoothing factor to avoid divide-by-zero

def compute_woe_iv_all(df, target='Default_Flag', bins=10, print_all=True):
    """
    Compute WOE and IV for all variables in a dataset.
    Handles both numeric and categorical variables correctly.
    """

    y = df[target].astype(int)  # target must be binary 0/1

    results = {}
    iv_rows = []

    for col in df.columns:
        if col == target:
            continue

        x = df[col]

        # --- Step 1: Decide whether variable is numeric or categorical ---
        if pd.api.types.is_numeric_dtype(x) and x.nunique() > bins:
            # Numeric → apply binning
            try:
                binned_x = pd.qcut(x, q=bins, duplicates='drop')
            except Exception:
                # fallback to treating as categorical if qcut fails
                binned_x = x.astype(str)
        else:
            # Categorical or low-cardinality numeric → treat as category
            binned_x = x.astype(str)

        # --- Step 2: Build grouped stats ---
        d = pd.DataFrame({'Bin': binned_x, 'Y': y})
        g = d.groupby('Bin')['Y']
        total = g.size()
        events = g.sum()
        non_events = total - events

        # Skip if no separation
        if events.sum() == 0 or non_events.sum() == 0:
            print(f"Skipping {col}: no events or non-events across its groups.")
            continue

        woe_df = pd.DataFrame({
            'Total': total,
            'Events': events,
            'Non_Events': non_events
        })

        woe_df['Dist_Event'] = woe_df['Events'] / woe_df['Events'].sum()
        woe_df['Dist_NonEvent'] = woe_df['Non_Events'] / woe_df['Non_Events'].sum()
        woe_df['WOE'] = np.log((woe_df['Dist_NonEvent'] + EPS) / (woe_df['Dist_Event'] + EPS))
        woe_df['IV'] = (woe_df['Dist_NonEvent'] - woe_df['Dist_Event']) * woe_df['WOE']

        iv_total = woe_df['IV'].sum()

        results[col] = woe_df.reset_index().rename(columns={'Bin': col})
        iv_rows.append((col, iv_total))

        if print_all:
            print(f"\n===== {col} =====")
            print(results[col])
            print(f"Total IV for {col}: {iv_total:.4f}")

    # --- Step 3: IV summary table ---
    iv_summary = pd.DataFrame(iv_rows, columns=['Variable', 'IV']).sort_values('IV', ascending=False)

    def interpret(iv):
        if iv < 0.02: return 'Not predictive'
        if iv < 0.10: return 'Weak'
        if iv < 0.30: return 'Medium'
        if iv < 0.50: return 'Strong'
        return 'Suspicious (too predictive)'

    iv_summary['Interpretation'] = iv_summary['IV'].apply(interpret)

    print("\n\n=== IV Summary (sorted) ===")
    print(iv_summary)

    return results, iv_summary


# ---- Run on your dataset ----
woe_tables, iv_summary = compute_woe_iv_all(df_clean, target='Default_Flag', bins=10, print_all=True)


  g = d.groupby('Bin')['Y']
  g = d.groupby('Bin')['Y']
  g = d.groupby('Bin')['Y']
  g = d.groupby('Bin')['Y']
  g = d.groupby('Bin')['Y']
  g = d.groupby('Bin')['Y']
  g = d.groupby('Bin')['Y']
  g = d.groupby('Bin')['Y']



===== Cust_id =====
              Cust_id  Total  Events  Non_Events  Dist_Event  Dist_NonEvent  \
0     (2.999, 9980.5]   1825     268        1557    0.094300       0.101065   
1   (9980.5, 19805.2]   1825     255        1570    0.089726       0.101908   
2  (19805.2, 29716.3]   1825     280        1545    0.098522       0.100286   
3  (29716.3, 39652.4]   1824     287        1537    0.100985       0.099766   
4  (39652.4, 50064.5]   1825     270        1555    0.095004       0.100935   
5  (50064.5, 59763.4]   1825     285        1540    0.100281       0.099961   
6  (59763.4, 69813.9]   1824     299        1525    0.105208       0.098987   
7  (69813.9, 79888.0]   1825     303        1522    0.106615       0.098793   
8  (79888.0, 90226.0]   1825     295        1530    0.103800       0.099312   
9  (90226.0, 99994.0]   1825     300        1525    0.105559       0.098987   

        WOE        IV  
0  0.069279  0.000469  
1  0.127317  0.001551  
2  0.017740  0.000031  
3 -0.012143  

In [70]:
import pandas as pd
import numpy as np

# target variable
target = 'Default_Flag'

# list of coarse classed variables
coarse_vars = [
    'Age_bin', 'Income_bin', 'Occ_bin', 'Marital_bin',
    'Housing_bin', 'EmpTenure_bin', 'OccPartner_bin'
]

# function to calculate WOE/IV for one variable
def calc_woe_iv(df, feature, target):
    # group by feature
    grouped = df.groupby(feature)[target].agg(['count','sum'])
    grouped.rename(columns={'count': 'Total', 'sum': 'Events'}, inplace=True)
    grouped['Non_Events'] = grouped['Total'] - grouped['Events']
    
    # distribution of events and non-events
    grouped['Dist_Event'] = grouped['Events'] / grouped['Events'].sum()
    grouped['Dist_NonEvent'] = grouped['Non_Events'] / grouped['Non_Events'].sum()
    
    # handle divide by zero with small epsilon
    eps = 1e-10
    grouped['WOE'] = np.log((grouped['Dist_Event'] + eps) / (grouped['Dist_NonEvent'] + eps))
    
    # IV calculation
    grouped['IV'] = (grouped['Dist_Event'] - grouped['Dist_NonEvent']) * grouped['WOE']
    grouped['IV'] = grouped['IV'].round(6)
    
    # total IV
    total_iv = grouped['IV'].sum().round(4)
    
    return grouped.reset_index(), total_iv

# run the function for all coarse variables
iv_summary = []

for var in coarse_vars:
    table, total_iv = calc_woe_iv(df_clean, var, target)
    print(f"\n===============================")
    print(f"WOE/IV Table for {var}")
    print(table)
    print(f"Total IV for {var}: {total_iv}")
    iv_summary.append({'Variable': var, 'IV': total_iv})

# IV summary table
iv_summary_df = pd.DataFrame(iv_summary)
print("\n\n===============================")
print(" IV Summary for All Coarse Variables")
print(iv_summary_df)



WOE/IV Table for Age_bin
               Age_bin  Total  Events  Non_Events  Dist_Event  Dist_NonEvent  \
0      High Risk (<25)    686     200         486    0.070373       0.031546   
1       Low Risk (>40)   9995    1106        8889    0.389163       0.576983   
2  Medium Risk (25-40)   7567    1536        6031    0.540464       0.391471   

        WOE        IV  
0  0.802358  0.031153  
1 -0.393816  0.073967  
2  0.322518  0.048053  
Total IV for Age_bin: 0.1532

WOE/IV Table for Income_bin
               Income_bin  Total  Events  Non_Events  Dist_Event  \
0       High Risk (<=35k)   3167     737        2430    0.259324   
1        Low Risk (>100k)   5504     382        5122    0.134412   
2  Medium Risk (35k-100k)   9577    1723        7854    0.606263   

   Dist_NonEvent       WOE        IV  
0       0.157731  0.497190  0.050511  
1       0.332468 -0.905631  0.179365  
2       0.509801  0.173293  0.016716  
Total IV for Income_bin: 0.2466

WOE/IV Table for Occ_bin
            

In [68]:
import numpy as np
import pandas as pd

# ======================
# 1. Age
# ======================
df_clean['Age_bin'] = np.select(
    [
        df_clean['Age'] < 25,
        (df_clean['Age'] >= 25) & (df_clean['Age'] <= 40),
        df_clean['Age'] > 40
    ],
    ['High Risk (<25)', 'Medium Risk (25-40)', 'Low Risk (>40)'],
    default='Other'
)

# ======================
# 2. Income_group
# ======================
def bin_income(x):
    x = str(x).replace(" ", "").lower()
    if x in ['<=15000', '<=25000', '<=35000']:
        return 'High Risk (<=35k)'
    elif x in ['<=50000', '<=100000']:
        return 'Medium Risk (35k-100k)'
    elif x in ['>100000', '>100k']:
        return 'Low Risk (>100k)'
    else:
        return 'Other'

df_clean['Income_bin'] = df_clean['Income_group'].apply(bin_income)

# ======================
# 3. Occupation_type
# ======================
def bin_occ(x):
    x = str(x).strip().lower()
    if x in ['unemployed']:
        return 'High Risk (Unemployed)'
    elif x in ['private']:
        return 'Medium Risk (Private)'
    elif x in ['govt', 'self-employed', 'self employed', 'self_employed']:
        return 'Low Risk (Govt/Self-Employed)'
    else:
        return 'Other'

df_clean['Occ_bin'] = df_clean['Occupation_type'].apply(bin_occ)

# ======================
# 4. Marital_Status
# ======================
def bin_marital(x):
    x = str(x).strip().lower()
    if x == 'single':
        return 'High Risk (Single)'
    elif x in ['married', 'separated', 'seperated']:
        return 'Low Risk (Married/Separated)'
    else:
        return 'Other'

df_clean['Marital_bin'] = df_clean['Marital_Status'].apply(bin_marital)

# ======================
# 5. Housing_1_Own_2_Rent
# ======================
df_clean['Housing_bin'] = df_clean['Housing_1_Own_2_Rent'].map({
    1: 'Low Risk (Own)',
    2: 'High Risk (Rent)'
})

# ======================
# 6. Employed_since_yrs
# ======================
df_clean['EmpTenure_bin'] = np.select(
    [
        df_clean['Employed_since_yrs'] <= 3,
        (df_clean['Employed_since_yrs'] > 3) & (df_clean['Employed_since_yrs'] <= 10),
        df_clean['Employed_since_yrs'] > 10
    ],
    ['High Risk (<=3 yrs)', 'Medium Risk (4-10 yrs)', 'Low Risk (>10 yrs)'],
    default='Other'
)

# ======================
# 7. Occupation_Partner
# ======================
def bin_occ_partner(x):
    x = str(x).strip().lower()
    if x in ['freelance', 'others']:
        return 'High Risk (Freelance/Others)'
    elif x == 'partime':
        return 'Medium Risk (Part-time)'
    elif x == 'fixed':
        return 'Low Risk (Fixed)'
    else:
        return 'Other'

df_clean['OccPartner_bin'] = df_clean['Occupation_Partner'].apply(bin_occ_partner)

# ======================
# Final check
# ======================
print(df_clean[['Age_bin','Income_bin','Occ_bin','Marital_bin',
                'Housing_bin','EmpTenure_bin','OccPartner_bin']].head(15))


                Age_bin              Income_bin  \
0        Low Risk (>40)  Medium Risk (35k-100k)   
1   Medium Risk (25-40)        Low Risk (>100k)   
2   Medium Risk (25-40)  Medium Risk (35k-100k)   
3        Low Risk (>40)  Medium Risk (35k-100k)   
4   Medium Risk (25-40)  Medium Risk (35k-100k)   
5        Low Risk (>40)        Low Risk (>100k)   
6   Medium Risk (25-40)  Medium Risk (35k-100k)   
7        Low Risk (>40)        Low Risk (>100k)   
8        Low Risk (>40)        Low Risk (>100k)   
9        Low Risk (>40)  Medium Risk (35k-100k)   
10       Low Risk (>40)  Medium Risk (35k-100k)   
11       Low Risk (>40)        Low Risk (>100k)   
12  Medium Risk (25-40)  Medium Risk (35k-100k)   
13  Medium Risk (25-40)  Medium Risk (35k-100k)   
14       Low Risk (>40)  Medium Risk (35k-100k)   

                          Occ_bin                   Marital_bin  \
0           Medium Risk (Private)            High Risk (Single)   
1   Low Risk (Govt/Self-Employed)  Low Risk (Marr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Age_bin'] = np.select(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Income_bin'] = df_clean['Income_group'].apply(bin_income)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Occ_bin'] = df_clean['Occupation_type'].apply(bin_occ)
A value is trying to be set on a copy o

In [60]:
!pip install statsmodels




In [72]:
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# -----------------------------
# Function to calculate WOE
# -----------------------------
def calc_woe_iv(df, feature, target):
    temp = df.groupby(feature)[target].agg(['sum','count'])
    temp['non_event'] = temp['count'] - temp['sum']
    temp['event_dist'] = temp['sum'] / temp['sum'].sum()
    temp['non_event_dist'] = temp['non_event'] / temp['non_event'].sum()
    temp['WOE'] = np.log(temp['event_dist'] / temp['non_event_dist']).replace([np.inf, -np.inf], 0)
    mapping = temp['WOE'].to_dict()
    return mapping

# -----------------------------
# Apply WOE on all 7 coarse variables
# -----------------------------
coarse_vars = ['Age_bin', 'Income_bin', 'Occ_bin', 'Marital_bin',
               'Housing_bin', 'EmpTenure_bin', 'OccPartner_bin']

df_woe = df_clean.copy()

for var in coarse_vars:
    woe_map = calc_woe_iv(df_clean, var, 'Default_Flag')
    df_woe[var + '_WOE'] = df_clean[var].map(woe_map)

# -----------------------------
# Calculate VIF using WOE variables
# -----------------------------
woe_cols = [v + '_WOE' for v in coarse_vars]
X = df_woe[woe_cols]

vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)


              feature       VIF
0         Age_bin_WOE  1.792508
1      Income_bin_WOE  1.606338
2         Occ_bin_WOE  1.035952
3     Marital_bin_WOE  1.007954
4     Housing_bin_WOE  1.007942
5   EmpTenure_bin_WOE  2.111732
6  OccPartner_bin_WOE  1.011420


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
import statsmodels.api as sm

# --------------------------------------
# 1. Define feature columns and target
# --------------------------------------
woe_features = [
    'Age_bin_WOE',
    'Income_bin_WOE',
    'Occ_bin_WOE',
    'Marital_bin_WOE',
    'Housing_bin_WOE',
    'OccPartner_bin_WOE'
]

X_train_model = train_df[woe_features]     # training dataset (already split)
y_train = train_df['Default_Flag']

# --------------------------------------
# 2. Add intercept (constant) term
# --------------------------------------
X_train_const = sm.add_constant(X_train_model)

# --------------------------------------
# 3. Build Logistic Regression model
# --------------------------------------
logit_model = sm.Logit(y_train, X_train_const)
result = logit_model.fit()

# --------------------------------------
# 4. View model summary
# --------------------------------------
print(result.summary())


In [None]:
df_woe

In [None]:
vif_data

In [None]:
import pandas as pd

# full WOE dataset you already created earlier
# df_woe has all 7 WOE variables
# remove EmpTenure_bin_WOE and keep only the 6 selected variables

final_woe_vars = [
    'Age_bin_WOE',
    'Income_bin_WOE',
    'Occ_bin_WOE',
    'Marital_bin_WOE',
    'Housing_bin_WOE',
    'OccPartner_bin_WOE'
]

# create final dataset
df_model = df_woe[final_woe_vars + ['Default_Flag']].copy()

# quick check
print(df_model.head())


In [None]:
import pandas as pd
import statsmodels.api as sm

# ---------------------------------------
# Step 1: Select final WOE variables
# ---------------------------------------
woe_vars = [
    'Age_bin_WOE', 'Income_bin_WOE', 'Occ_bin_WOE',
    'Marital_bin_WOE', 'Housing_bin_WOE', 'OccPartner_bin_WOE'
]

# Create modeling dataset with only predictors + target
df_model = df_woe[woe_vars + ['Default_Flag']].copy()

print("Shape of modeling dataset:", df_model.shape)
print(df_model.head())

# ---------------------------------------
# Step 2: Define X (features) and y (target)
# ---------------------------------------
X = df_model[woe_vars]
y = df_model['Default_Flag']

# Add constant for intercept
X = sm.add_constant(X)

# ---------------------------------------
# Step 3: Build logistic regression model
# ---------------------------------------
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# ---------------------------------------
# Step 4: View model summary
# ---------------------------------------
print(result.summary())


In [102]:
import numpy as np
import pandas as pd

# Step 1: Get predicted probabilities from the trained model
df_model['pred_prob'] = result.predict(sm.add_constant(df_model[woe_vars]))

# Step 2: Separate the default and non-default groups
default = df_model[df_model['Default_Flag'] == 1]['pred_prob']
nondefault = df_model[df_model['Default_Flag'] == 0]['pred_prob']

# Step 3: Calculate concordance, discordance, and ties
concordant = 0
discordant = 0
ties = 0

for i in default:
    for j in nondefault:
        if i > j:
            concordant += 1
        elif i < j:
            discordant += 1
        else:
            ties += 1

# Step 4: Total pairs and percentages
total_pairs = concordant + discordant + ties

concordance_perc = round((concordant / total_pairs) * 100, 2)
discordance_perc = round((discordant / total_pairs) * 100, 2)
ties_perc = round((ties / total_pairs) * 100, 2)

# Step 5: Print the results
print("Concordant Pairs   :", concordant)
print("Discordant Pairs   :", discordant)
print("Tied Pairs         :", ties)
print("Total Pairs        :", total_pairs)

print("\nConcordance (%)    :", concordance_perc)
print("Discordance (%)    :", discordance_perc)
print("Ties (%)           :", ties_perc)


Concordant Pairs   : 33394855
Discordant Pairs   : 9958988
Tied Pairs         : 430009
Total Pairs        : 43783852

Concordance (%)    : 76.27
Discordance (%)    : 22.75
Ties (%)           : 0.98


In [118]:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm

# -----------------------------------
# Step 1: Prepare features for prediction
# -----------------------------------
X = df_model[['Age_bin_WOE','Income_bin_WOE','Occ_bin_WOE',
              'Marital_bin_WOE','Housing_bin_WOE','OccPartner_bin_WOE']]

# Add intercept/constant (since model was trained with it)
X = sm.add_constant(X)

# -----------------------------------
# Step 2: Predict probabilities from logistic regression model
# -----------------------------------
df_model['pred_prob'] = result.predict(X)

# -----------------------------------
# Step 3: Create decile groups of predicted probabilities
# -----------------------------------
df_model['decile'] = pd.qcut(df_model['pred_prob'], 10, duplicates='drop')

# -----------------------------------
# Step 4: Group by deciles and calculate observed vs expected
# -----------------------------------
hl_table = (
    df_model.groupby('decile', observed=False)
    .apply(lambda d: pd.Series({
        'total': len(d),
        'observed_1': d['Default_Flag'].sum(),  # actual defaults
        'observed_0': (1 - d['Default_Flag']).sum(),  # actual non-defaults
        'expected_1': d['pred_prob'].sum(),  # expected defaults
        'expected_0': (1 - d['pred_prob']).sum()  # expected non-defaults
    }), include_groups=False)
    .reset_index()
)

# -----------------------------------
# Step 5: Hosmer–Lemeshow Chi-Square statistic
# -----------------------------------
hl_table['chi_1'] = (hl_table['observed_1'] - hl_table['expected_1'])**2 / hl_table['expected_1']
hl_table['chi_0'] = (hl_table['observed_0'] - hl_table['expected_0'])**2 / hl_table['expected_0']

HL_stat = hl_table['chi_1'].sum() + hl_table['chi_0'].sum()
df = hl_table.shape[0] - 2  # degrees of freedom (g-2 where g = groups = 10)
p_value = 1 - stats.chi2.cdf(HL_stat, df)

# -----------------------------------
# Step 6: Print the results
# -----------------------------------
print("Hosmer–Lemeshow Test Results")
print("----------------------------")
print(hl_table[['decile','observed_1','expected_1','observed_0','expected_0']])
print("\nHL Statistic :", round(HL_stat, 2))
print("Degrees of freedom:", df)
print("p-value :", round(p_value, 4))


Hosmer–Lemeshow Test Results
----------------------------
                            decile  observed_1  expected_1  observed_0  \
0  (0.0057799999999999995, 0.0299]        26.0   32.784525      1801.0   
1                 (0.0299, 0.0474]        68.0   75.456769      1950.0   
2                 (0.0474, 0.0664]       108.0   94.363007      1590.0   
3                  (0.0664, 0.089]       149.0  131.530118      1620.0   
4                   (0.089, 0.108]       187.0  188.214559      1694.0   
5                   (0.108, 0.142]       252.0  260.906381      1726.0   
6                   (0.142, 0.185]       287.0  285.615970      1400.0   
7                   (0.185, 0.259]       415.0  413.089759      1438.0   
8                   (0.259, 0.349]       500.0  522.214774      1230.0   
9                   (0.349, 0.615]       850.0  837.824137       957.0   

    expected_0  
0  1794.215475  
1  1942.543231  
2  1603.636993  
3  1637.469882  
4  1692.785441  
5  1717.093619  
6  1401.

In [122]:
import pandas as pd
import statsmodels.api as sm

# -----------------------------
# Step 1: Generate predicted probabilities (scores)
# -----------------------------
df_model['score'] = result.predict(sm.add_constant(df_model[[
    'Age_bin_WOE', 'Income_bin_WOE', 'Occ_bin_WOE',
    'Marital_bin_WOE', 'Housing_bin_WOE', 'OccPartner_bin_WOE'
]]))

# -----------------------------
# Step 2: Round scores to 3 decimals 
# (avoids too many unique values)
# -----------------------------
df_model['score_rounded'] = df_model['score'].round(3)

# -----------------------------
# Step 3: Frequency distribution of rounded scores
# -----------------------------
clustering_check = (
    df_model.groupby('score_rounded')
    .size()
    .reset_index(name='count')
)

# -----------------------------
# Step 4: Add percentage of total customers
# -----------------------------
clustering_check['perc'] = round(
    (clustering_check['count'] / clustering_check['count'].sum()) * 100, 2
)

# -----------------------------
# Step 5: Sort by frequency 
# (to detect concentrations)
# -----------------------------
clustering_check = clustering_check.sort_values(by='count', ascending=False)

print("\n Clustering Check – Top 10 Most Frequent Score Points:\n")
print(clustering_check.head(10))



 Clustering Check – Top 10 Most Frequent Score Points:

     score_rounded  count  perc
80           0.142    526  2.88
18           0.036    511  2.80
30           0.052    450  2.47
41           0.071    398  2.18
40           0.070    388  2.13
61           0.106    369  2.02
63           0.108    361  1.98
126          0.331    345  1.89
93           0.181    339  1.86
4            0.015    338  1.85



Brier Score Calculation


In [None]:
!pip install scikit-learn


In [130]:
from sklearn.metrics import brier_score_loss

# Step 1: Actual default flags
y_true = df_model['Default_Flag']

# Step 2: Predicted probabilities
y_pred_prob = df_model['pred_prob']

# Step 3: Compute Brier Score
brier_score = brier_score_loss(y_true, y_pred_prob)

print("Brier Score:", round(brier_score, 4))


Brier Score: 0.1136


In [None]:
# ------------------------------------------------
# Step 1: Import Brier Score function
# ------------------------------------------------
from sklearn.metrics import brier_score_loss

# ------------------------------------------------
# Step 2: Actual default flags
# These are the true labels from our dataset:
# 1 = Defaulted, 0 = Did not default
# ------------------------------------------------
y_true = df_model['Default_Flag']

# ------------------------------------------------
# Step 3: Predicted probabilities from our model
# These are the model’s outputs (scores) that
# represent the estimated probability of default
# ------------------------------------------------
y_pred_prob = df_model['pred_prob']

# ------------------------------------------------
# Step 4: Compute the Brier Score
# The formula is: mean((y_true - y_pred_prob)^2)
# It measures how close the predicted probabilities
# are to the actual outcomes.
# ------------------------------------------------
brier_score = brier_score_loss(y_true, y_pred_prob)

# ------------------------------------------------
# Step 5: Print result
# A lower score indicates better accuracy.
# ------------------------------------------------
print("Brier Score:", round(brier_score, 4))


In [134]:
import numpy as np
from sklearn.metrics import roc_auc_score

# Step 1: Define Actual Defaults and Predicted Probabilities
y_true = df_model['Default_Flag']       # 1 = Default, 0 = Non-default
y_pred_prob = df_model['pred_prob']     # Predicted probability of default from model

# Step 2: Compute C-statistic (AUC)
c_statistic = roc_auc_score(y_true, y_pred_prob)

# Step 3: Compute Somer's D
somers_d = 2 * c_statistic - 1

# Step 4: Print Results
print("C-Statistic (AUC):", round(c_statistic, 4))
print("Somer's D:", round(somers_d, 4))


C-Statistic (AUC): 0.7676
Somer's D: 0.5353


In [138]:
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

# Step 1: Actual outcomes (default flag)
y_true = df_model['Default_Flag']

# Step 2: Predicted probabilities from the model
y_pred_prob = df_model['pred_prob']

# Step 3: Apply cutoff threshold (0.5)
y_pred_class = (y_pred_prob >= 0.5).astype(int)

# Step 4: Calculate Accuracy
accuracy = accuracy_score(y_true, y_pred_class)

# Step 5: Calculate Precision
precision = precision_score(y_true, y_pred_class)

# Step 6: Confusion Matrix for detailed breakdown
cm = confusion_matrix(y_true, y_pred_class)

# Display results
print("Model Accuracy :", round(accuracy, 4))
print("Model Precision:", round(precision, 4))
print("\nConfusion Matrix:\n", cm)


Model Accuracy : 0.8467
Model Precision: 0.5321

Confusion Matrix:
 [[15085   321]
 [ 2477   365]]


In [142]:
import pandas as pd

# Step 1: Take actual default flag and predicted probabilities
df_rank = df_model[['Default_Flag', 'pred_prob']].copy()

# Step 2: Create deciles (10 groups based on predicted probabilities)
df_rank['decile'] = pd.qcut(df_rank['pred_prob'], 10, labels=False, duplicates='drop') + 1

# Step 3: Group by decile and calculate metrics
rank_table = df_rank.groupby('decile').agg(
    total_customers = ('Default_Flag', 'count'),
    mean_score      = ('pred_prob', 'mean'),
    bad_rate        = ('Default_Flag', 'mean')
).reset_index()

# Step 4: Sort by decile (highest score group first)
rank_table = rank_table.sort_values(by='decile', ascending=False).reset_index(drop=True)

# Step 5: Print results
print("\n📊 Rank Ordering Table (by Deciles):\n")
print(rank_table)



📊 Rank Ordering Table (by Deciles):

   decile  total_customers  mean_score  bad_rate
0      10             1807    0.463655  0.470393
1       9             1730    0.301858  0.289017
2       8             1853    0.222930  0.223961
3       7             1687    0.169304  0.170124
4       6             1978    0.131904  0.127401
5       5             1881    0.100061  0.099415
6       4             1769    0.074353  0.084228
7       3             1698    0.055573  0.063604
8       2             2018    0.037392  0.033697
9       1             1827    0.017944  0.014231


In [152]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve

# KS Statistic
def ks_statistic(y_true, y_pred_prob):
    data = pd.DataFrame({"y_true": y_true, "y_pred_prob": y_pred_prob})
    data = data.sort_values(by="y_pred_prob", ascending=False)

    # Cumulative bad and good rates
    data["cum_bad"] = (data["y_true"] == 1).cumsum() / (data["y_true"] == 1).sum()
    data["cum_good"] = (data["y_true"] == 0).cumsum() / (data["y_true"] == 0).sum()

    data["ks"] = np.abs(data["cum_bad"] - data["cum_good"])
    ks_value = data["ks"].max()
    ks_at = data.loc[data["ks"].idxmax(), "y_pred_prob"]

    return ks_value, ks_at, data

ks_value, ks_cutoff, ks_table = ks_statistic(y_true, y_pred_prob)
print(f"KS Statistic: {ks_value:.4f} at cutoff {ks_cutoff:.4f}")


KS Statistic: 0.4021 at cutoff 0.1583


In [150]:
# Gini Coefficient
auc = roc_auc_score(y_true, y_pred_prob)
gini = 2 * auc - 1
print(f"AUC: {auc:.4f}")
print(f"Gini Coefficient: {gini:.4f}")


AUC: 0.7676
Gini Coefficient: 0.5353
