In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from imblearn.over_sampling import SMOTE

In [2]:
df=pd.read_csv('car.csv',low_memory=False)
print(f"Shape: {df.shape}")

Shape: (8863, 74)


In [3]:
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077430,1314167,2500,2500,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,
1,1072053,1288686,3000,3000,3000.0,36 months,18.64,109.43,E,E1,...,,,,,,,,,,
2,1069243,1304116,12000,12000,12000.0,36 months,15.96,421.65,C,C5,...,,,,,,,,,,
3,1041756,1272024,4500,4500,4475.0,60 months,10.65,97.06,B,B2,...,,,,,,,,,,
4,1068350,1302971,3500,3500,3500.0,36 months,6.03,106.53,A,A1,...,,,,,,,,,,


In [4]:
# unnecessary_columns = ['id','member_id','open_acc_6m','open_il_6m','open_il_12m','open_il_24m','mths_since_rcnt_il','total_bal_il',
#                        'il_util','open_rv_12m','open_rv_24m','max_bal_bc','all_util','total_rev_hi_lim','inq_fi','total_cu_tl','inq_last_12m',
#                        'url','desc','annual_inc_joint','dti_joint','verification_status_joint','mths_since_last_major_derog','next_pymnt_d',
#                        'mths_since_last_delinq','mths_since_last_record','tot_coll_amt','tot_cur_bal'
#                       ] 
# for col in unnecessary_columns:
#     if col in df.columns:
#         df = df.drop(columns=[col])

drop_cols = ['id', 'member_id', 'url', 'desc', 'next_pymnt_d',
             'annual_inc_joint', 'dti_joint', 'verification_status_joint',
             'mths_since_last_major_derog', 'mths_since_last_delinq',
             'mths_since_last_record', 'tot_coll_amt', 'tot_cur_bal']
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True)

In [5]:
closed_statuses = [
    'Fully Paid',
    'Charged Off',
    'Default',
    'Does not meet the credit policy. Status:Fully Paid',
    'Does not meet the credit policy. Status:Charged Off'
]
active_statuses = [
    'Current',
    'In Grace Period',
    'Late (16-30 days)',
    'Late (31-120 days)',
    'Issued'
]
df['loan_state'] = df['loan_status'].apply(
    lambda x: 'Closed' if x in closed_statuses else ('Active' if x in active_statuses else 'Unknown')
)
df = df[df['loan_state'] != 'Unknown']


In [6]:
train_df = df[df['loan_state'] == 'Closed'].copy()
test_df = df[df['loan_state'] == 'Active'].copy()

print("Train size:", train_df.shape)
print("Test size:", test_df.shape)

Train size: (3720, 62)
Test size: (5143, 62)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8863 entries, 0 to 8862
Data columns (total 62 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   8863 non-null   int64  
 1   funded_amnt                 8863 non-null   int64  
 2   funded_amnt_inv             8863 non-null   float64
 3   term                        8863 non-null   object 
 4   int_rate                    8863 non-null   float64
 5   installment                 8863 non-null   float64
 6   grade                       8863 non-null   object 
 7   sub_grade                   8863 non-null   object 
 8   emp_title                   8349 non-null   object 
 9   emp_length                  8445 non-null   object 
 10  home_ownership              8863 non-null   object 
 11  annual_inc                  8863 non-null   float64
 12  verification_status         8863 non-null   object 
 13  issue_d                     8863 

In [8]:
print(df.isnull().sum())

loan_amnt              0
funded_amnt            0
funded_amnt_inv        0
term                   0
int_rate               0
                    ... 
total_rev_hi_lim    2197
inq_fi              8664
total_cu_tl         8664
inq_last_12m        8664
loan_state             0
Length: 62, dtype: int64


In [9]:
for col in df.columns:
    if df[col].isnull().any():
        if df[col].dtype in ['int64', 'float64']:
            median_val = df[col].median()
            df[col]=df[col].fillna(median_val)
            print(f"Imputed numerical column '{col}' with median: {median_val}")
        else:
            mode_val = df[col].mode()[0]
            df[col]=df[col].fillna(mode_val)
            print(f"Imputed categorical column '{col}' with mode: {mode_val}")

Imputed categorical column 'emp_title' with mode: Teacher
Imputed categorical column 'emp_length' with mode: 10+ years
Imputed categorical column 'title' with mode: Car financing
Imputed numerical column 'revol_util' with median: 40.6
Imputed categorical column 'last_pymnt_d' with mode: Jan-16
Imputed categorical column 'last_credit_pull_d' with mode: Jan-16
Imputed numerical column 'collections_12_mths_ex_med' with median: 0.0
Imputed numerical column 'open_acc_6m' with median: 1.0
Imputed numerical column 'open_il_6m' with median: 2.0
Imputed numerical column 'open_il_12m' with median: 0.0
Imputed numerical column 'open_il_24m' with median: 1.0
Imputed numerical column 'mths_since_rcnt_il' with median: 13.0
Imputed numerical column 'total_bal_il' with median: 18031.0
Imputed numerical column 'il_util' with median: 75.15
Imputed numerical column 'open_rv_12m' with median: 1.0
Imputed numerical column 'open_rv_24m' with median: 2.0
Imputed numerical column 'max_bal_bc' with median: 292

In [10]:
closed_mapping = {
    'Charged Off': 1,
    'Default': 1,
    'Does not meet the credit policy. Status:Charged Off': 1,
    'Fully Paid': 0,
    'Does not meet the credit policy. Status:Fully Paid': 0
}
# df = df[df['loan_status'].isin(status_map.keys())].copy()
# df['loan_status'] = df['loan_status'].map(status_map)

train_df['loan_status'] = train_df['loan_status'].map(closed_mapping)
test_df['loan_status'] = np.nan  

# print(train_df['loan_status'].value_counts())

print("\nUnique values in 'loan_status' after conversion:", train_df['loan_status'].unique())
print("Data type of 'loan_status':", train_df['loan_status'].dtype)
print(train_df['loan_status'].value_counts(normalize=True))


Unique values in 'loan_status' after conversion: [1 0]
Data type of 'loan_status': int64
loan_status
0    0.873387
1    0.126613
Name: proportion, dtype: float64


In [11]:
def calculate_woe_iv(df, feature, target):
    """
    Calculates WoE and IV for a given feature against a binary target.

    Args:
        df (pd.DataFrame): The input DataFrame.
        feature (str): The name of the feature column.
        target (str): The name of the binary target column (0 and 1).

    Returns:
        tuple: A tuple containing:
            - woe_df (pd.DataFrame): DataFrame with 'Feature', 'Category', 'Good', 'Bad',
                                     'Good_Pct', 'Bad_Pct', 'WoE' columns.
            - iv (float): The Information Value for the feature.
    """
    # Ensure target is 0 and 1
    if not all(train_df[target].isin([0, 1])):
        raise ValueError(f"Target column '{target}' must contain only 0s and 1s.")

    grouped = train_df.groupby(feature)[target].agg(['count', 'sum'])
    grouped.columns = ['Total', 'Bad'] 
    grouped['Good'] = grouped['Total'] - grouped['Bad']

    
    total_good = grouped['Good'].sum()
    total_bad = grouped['Bad'].sum()

    if total_good == 0 or total_bad == 0:
        print(f"Warning: Total good or bad cases are zero for feature '{feature}'. IV will be 0.")
        return pd.DataFrame(), 0.0

    grouped['Good_Pct'] = grouped['Good'] / total_good
    grouped['Bad_Pct'] = grouped['Bad'] / total_bad

    epsilon = 0.000001
    grouped['WoE'] = np.log((grouped['Good_Pct'] + epsilon) / (grouped['Bad_Pct'] + epsilon))

    grouped['IV_Contrib'] = (grouped['Good_Pct'] - grouped['Bad_Pct']) * grouped['WoE']
    iv = grouped['IV_Contrib'].sum()

    woe_df = grouped.reset_index()
    woe_df = woe_df[[feature, 'Good', 'Bad', 'Good_Pct', 'Bad_Pct', 'WoE', 'IV_Contrib']]
    woe_df.rename(columns={feature: 'Category'}, inplace=True)
    woe_df['Feature'] = feature

    return woe_df, iv

features_for_woe_iv = []
for col in train_df.columns:
    if col != 'loan_status':
        if train_df[col].dtype == 'object' or train_df[col].nunique() < 20: 
            features_for_woe_iv.append(col)

features_for_woe_iv.append('annual_inc')
if not features_for_woe_iv:
    print("No suitable features found for WoE/IV calculation.")
else:
    iv_summary = []
    woe_data = []

    for feature in features_for_woe_iv:
        print(f"\nCalculating WoE and IV for: {feature}")
        try:
            woe_df, iv = calculate_woe_iv(train_df, feature, 'loan_status')
            iv_summary.append({'Feature': feature, 'IV': iv})
            if not woe_df.empty:
                woe_data.append(woe_df)
            print(f"  IV for {feature}: {iv:.4f}")
        except ValueError as e:
            print(f"  Error calculating WoE/IV for {feature}: {e}")
        except Exception as e:
            print(f"  An unexpected error occurred for {feature}: {e}")

    if iv_summary:
        iv_summary_df = pd.DataFrame(iv_summary).sort_values(by='IV', ascending=False)
        print("\n--- Information Value (IV) Summary ---")
        print(iv_summary_df)

        print("\n--- WoE Details for each Feature ---")
        if woe_data:
            full_woe_df = pd.concat(woe_data, ignore_index=True)
            print(full_woe_df)
        else:
            print("No WoE details generated.")
    else:
        print("No IV summary available.")


Calculating WoE and IV for: term
  IV for term: 0.0916

Calculating WoE and IV for: grade
  IV for grade: 0.4532

Calculating WoE and IV for: sub_grade
  IV for sub_grade: 0.5565

Calculating WoE and IV for: emp_title
  IV for emp_title: 12.4193

Calculating WoE and IV for: emp_length
  IV for emp_length: 0.0372

Calculating WoE and IV for: home_ownership
  IV for home_ownership: 0.0811

Calculating WoE and IV for: verification_status
  IV for verification_status: 0.0367

Calculating WoE and IV for: issue_d
  IV for issue_d: 0.6175

Calculating WoE and IV for: pymnt_plan
  IV for pymnt_plan: 0.0000

Calculating WoE and IV for: purpose
  IV for purpose: 0.0000

Calculating WoE and IV for: title
  IV for title: 5.9340

Calculating WoE and IV for: zip_code
  IV for zip_code: 3.6069

Calculating WoE and IV for: addr_state
  IV for addr_state: 0.2377

Calculating WoE and IV for: delinq_2yrs
  IV for delinq_2yrs: 0.0429

Calculating WoE and IV for: earliest_cr_line
  IV for earliest_cr_line

In [12]:
df_woe_encoded = train_df.copy()

for feature in iv_summary_df['Feature']:
    woe_map = full_woe_df[full_woe_df['Feature'] == feature][['Category', 'WoE']]
    woe_dict = dict(zip(woe_map['Category'], woe_map['WoE']))
    
    df_woe_encoded[feature] = df_woe_encoded[feature].map(woe_dict)

In [13]:
selected_features = iv_summary_df[iv_summary_df['IV'] >= 0.05]['Feature'].tolist()

selected_features.append('loan_status')

In [14]:
df_final = df_woe_encoded[selected_features]

In [15]:
df_final.head()

Unnamed: 0,emp_title,title,annual_inc,zip_code,earliest_cr_line,last_credit_pull_d,last_pymnt_d,issue_d,sub_grade,grade,addr_state,out_prncp,out_prncp_inv,term,home_ownership,inq_last_6mths,loan_status
0,-7.740599,-0.832021,-0.126741,-1.928472,0.371195,-1.240094,-0.642163,-0.727216,-1.040245,-0.421245,-0.222546,0.02146,0.02146,-0.52996,-0.270513,-0.464861,1
1,5.79207,5.732652,-0.386331,-0.059435,-0.544836,0.631506,0.324384,-0.727216,-0.034122,-0.647217,0.04612,0.02146,0.02146,0.174252,-0.270513,-0.067055,0
2,-7.740599,0.148195,0.096899,-1.236945,-0.631823,-0.429229,-0.600557,-0.727216,-0.331848,-0.421245,0.04612,0.02146,0.02146,0.174252,-0.270513,-0.067055,1
3,5.79207,0.148195,-0.085413,1.594741,6.829103,-0.429229,0.169702,-0.727216,0.187196,0.183288,0.24272,0.02146,0.02146,-0.52996,0.333119,0.212239,0
4,5.79207,0.148195,7.339496,6.424178,7.116514,0.631506,0.324384,-0.727216,1.239174,0.948285,-0.024173,0.02146,0.02146,0.174252,0.333119,0.212239,0


In [16]:
print("Final dataset shape:", df_final.shape)
print("Final features used:", selected_features)

Final dataset shape: (3720, 17)
Final features used: ['emp_title', 'title', 'annual_inc', 'zip_code', 'earliest_cr_line', 'last_credit_pull_d', 'last_pymnt_d', 'issue_d', 'sub_grade', 'grade', 'addr_state', 'out_prncp', 'out_prncp_inv', 'term', 'home_ownership', 'inq_last_6mths', 'loan_status']


In [17]:
combined = pd.concat([train_df.drop(columns='loan_status'), test_df.drop(columns='loan_status')])
combined = pd.get_dummies(combined, drop_first=True)

X_train = combined.iloc[:len(train_df)]
X_test = combined.iloc[len(train_df):]
y_train = train_df['loan_status']
y_test = test_df['loan_status']

In [18]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

y_test_clean = y_test.dropna()
X_test_clean = X_test_scaled_df.loc[y_test_clean.index]

In [19]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

X = df_final.drop(columns=['loan_status','out_prncp_inv','out_prncp'], axis=1)
X = add_constant(X) 
X = X.dropna()

vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif_data)

               Feature       VIF
0                const  2.140965
1            emp_title  1.719508
2                title  1.370626
3           annual_inc  1.153595
4             zip_code  1.157665
5     earliest_cr_line  1.070928
6   last_credit_pull_d  1.056206
7         last_pymnt_d  1.097234
8              issue_d  1.047852
9            sub_grade  5.773898
10               grade  5.655823
11          addr_state  1.055314
12                term  1.151242
13      home_ownership  1.099268
14      inq_last_6mths  1.050622


In [20]:
def calculate_csi(expected, actual, buckets=10):
    """
    Compute the Characteristic Stability Index (CSI) for a single feature.
    """
    expected = pd.Series(expected).dropna()
    actual = pd.Series(actual).dropna()
    
    breakpoints = np.linspace(0, 100, buckets + 1)
    bins = np.percentile(expected, breakpoints)

    bins = np.unique(bins)
    if len(bins) <= 2:
        return np.nan 

    expected_dist = pd.cut(expected, bins=bins, include_lowest=True).value_counts(normalize=True)
    actual_dist = pd.cut(actual, bins=bins, include_lowest=True).value_counts(normalize=True)

    expected_dist, actual_dist = expected_dist.align(actual_dist, fill_value=0)

    epsilon = 0.0001
    csi = ((expected_dist - actual_dist) * np.log((expected_dist + epsilon) / (actual_dist + epsilon))).sum()
    return csi


In [21]:
# X = df_final.drop(columns='loan_status')
# y = df_final['loan_status']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

csi_report = []

for col in X_train.columns:
    if col == 'const': 
        continue
    csi = calculate_csi(X_train_scaled_df[col], X_test_clean[col])
    csi_report.append({'Feature': col, 'CSI': round(csi, 4)})

csi_df = pd.DataFrame(csi_report).sort_values(by='CSI', ascending=False)
print("\n--- CSI Report ---")
print(csi_df)



--- CSI Report ---
                        Feature     CSI
35                  open_rv_24m  9.2080
29                  open_il_12m  9.2080
34                  open_rv_12m  9.2080
30                  open_il_24m  9.2059
31           mths_since_rcnt_il  9.2059
...                         ...     ...
9860  last_credit_pull_d_Sep-13     NaN
9861  last_credit_pull_d_Sep-14     NaN
9862  last_credit_pull_d_Sep-15     NaN
9863     application_type_JOINT     NaN
9864          loan_state_Closed     NaN

[9865 rows x 2 columns]


In [22]:
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

y_pred_train = model.predict(X_train_scaled)
acc = accuracy_score(y_train, y_pred_train) * 100
print(f"Training Accuracy: {acc:.2f}%")

y_pred_test = model.predict(X_test_clean)
y_proba = model.predict_proba(X_test_clean)[:, 1]

Training Accuracy: 100.00%




ValueError: Found array with 0 sample(s) (shape=(0, 9865)) while a minimum of 1 is required by LogisticRegression.

In [None]:
print("\n--- Classification Report ---")
print(classification_report(y_test_clean, y_pred_test))

In [None]:
accuracy = accuracy_score(y_test_clean, y_pred_test)
print(f"\nAccuracy: {accuracy * 100:.2f}%")

In [None]:
print("\n--- Confusion Matrix ---")
sns.heatmap(confusion_matrix(y_test_clean, y_pred_test), annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
auc_score = roc_auc_score(y_test_clean, y_proba)
print(f"ROC-AUC Score: {auc_score:.4f}\n")

fpr, tpr, _ = roc_curve(y_test_clean, y_proba)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid()
plt.show()

In [None]:
test_pred = model.predict(X_test_clean)
test_df = df_test.copy()
test_df['Predicted Default'] = test_pred
test_df[['loan_amnt', 'grade', 'Predicted Default']].head()

In [23]:
print("Total test samples:", len(y_test))
print("Missing values in y_test:", y_test.isna().sum())

Total test samples: 5143
Missing values in y_test: 5143
