In [1]:
import pandas as pd
df = pd.read_csv("../data/raw/lending_club.csv", low_memory=False)
df.shape


(2260668, 145)

In [2]:
df['loan_status'].value_counts()

loan_status
Fully Paid                                             1041952
Current                                                 919695
Charged Off                                             261655
Late (31-120 days)                                       21897
In Grace Period                                           8952
Late (16-30 days)                                         3737
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     31
Name: count, dtype: int64

In [3]:
bad_status=[ "Charged Off",
    "Default",
    "Late (31-120 days)",
    "Late (16-30 days)",
    "Does not meet the credit policy. Status:Charged Off"]
df["default_label"]=df["loan_status"].apply(lambda x:1 if x in bad_status else 0)
df=df.copy()

  df["default_label"]=df["loan_status"].apply(lambda x:1 if x in bad_status else 0)


In [4]:
df['default_label'].value_counts()

default_label
0    1972587
1     288081
Name: count, dtype: int64

In [5]:
df['default_label'].value_counts(normalize=True)*100

default_label
0    87.25682
1    12.74318
Name: proportion, dtype: float64

In [6]:
missing_percent = df.isnull().mean().sort_values(ascending=False) * 100
missing_percent.head(15)


id                                            100.000000
member_id                                     100.000000
url                                           100.000000
orig_projected_additional_accrued_interest     99.627278
deferral_term                                  99.530537
hardship_start_date                            99.530537
hardship_loan_status                           99.530537
hardship_payoff_balance_amount                 99.530537
hardship_end_date                              99.530537
hardship_type                                  99.530537
hardship_last_payment_amount                   99.530537
hardship_amount                                99.530537
hardship_status                                99.530537
hardship_reason                                99.530537
hardship_dpd                                   99.530537
dtype: float64

In [7]:
df.columns


Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       ...
       'hardship_last_payment_amount', 'disbursement_method',
       'debt_settlement_flag', 'debt_settlement_flag_date',
       'settlement_status', 'settlement_date', 'settlement_amount',
       'settlement_percentage', 'settlement_term', 'default_label'],
      dtype='str', length=146)

In [8]:
for col in df.columns:
    print(col)
    

id
member_id
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
installment
grade
sub_grade
emp_title
emp_length
home_ownership
annual_inc
verification_status
issue_d
loan_status
pymnt_plan
url
desc
purpose
title
zip_code
addr_state
dti
delinq_2yrs
earliest_cr_line
inq_last_6mths
mths_since_last_delinq
mths_since_last_record
open_acc
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
out_prncp
out_prncp_inv
total_pymnt
total_pymnt_inv
total_rec_prncp
total_rec_int
total_rec_late_fee
recoveries
collection_recovery_fee
last_pymnt_d
last_pymnt_amnt
next_pymnt_d
last_credit_pull_d
collections_12_mths_ex_med
mths_since_last_major_derog
policy_code
application_type
annual_inc_joint
dti_joint
verification_status_joint
acc_now_delinq
tot_coll_amt
tot_cur_bal
open_acc_6m
open_act_il
open_il_12m
open_il_24m
mths_since_rcnt_il
total_bal_il
il_util
open_rv_12m
open_rv_24m
max_bal_bc
all_util
total_rev_hi_lim
inq_fi
total_cu_tl
inq_last_12m
acc_open_past_24mths
avg_cur_bal
bc_open_to_buy
b

In [9]:
# Columns to drop (leakage, IDs, admin, sparse, post-loan)
drop_cols = [
    # Identifiers & text noise
    "id", "member_id", "url", "zip_code", "title", "desc",

    # Post-loan payment & outcome (LEAKAGE)
    "funded_amnt", "funded_amnt_inv",
    "out_prncp", "out_prncp_inv",
    "total_pymnt", "total_pymnt_inv",
    "total_rec_prncp", "total_rec_int",
    "total_rec_late_fee",
    "recoveries", "collection_recovery_fee",
    "last_pymnt_d", "last_pymnt_amnt",
    "next_pymnt_d", "last_credit_pull_d",

    # Policy / admin / meta
    "policy_code", "initial_list_status",
    "pymnt_plan", "application_type",
    "disbursement_method",

    # Joint applicant features
    "annual_inc_joint", "dti_joint",
    "verification_status_joint", "revol_bal_joint",
    "sec_app_earliest_cr_line", "sec_app_inq_last_6mths",
    "sec_app_mort_acc", "sec_app_open_acc",
    "sec_app_revol_util", "sec_app_open_act_il",
    "sec_app_num_rev_accts",
    "sec_app_chargeoff_within_12_mths",
    "sec_app_collections_12_mths_ex_med",
    "sec_app_mths_since_last_major_derog",

    # Hardship & settlement (post-distress)
    "hardship_flag", "hardship_type", "hardship_reason",
    "hardship_status", "deferral_term", "hardship_amount",
    "hardship_start_date", "hardship_end_date",
    "payment_plan_start_date", "hardship_length",
    "hardship_dpd", "hardship_loan_status",
    "orig_projected_additional_accrued_interest",
    "hardship_payoff_balance_amount",
    "hardship_last_payment_amount",
    "debt_settlement_flag", "debt_settlement_flag_date",
    "settlement_status", "settlement_date",
    "settlement_amount", "settlement_percentage",
    "settlement_term",

    # Non-feature columns
    "loan_status"   # original text label (we use default_label instead)
]


In [10]:
df_model=df.drop(columns=drop_cols,errors="ignore")

In [11]:
print("Original Columns:",df.shape[1])
print("Model columns:", df_model.shape[1])

Original Columns: 146
Model columns: 83


In [12]:
for col in df_model.columns:
    print(col)

loan_amnt
term
int_rate
installment
grade
sub_grade
emp_title
emp_length
home_ownership
annual_inc
verification_status
issue_d
purpose
addr_state
dti
delinq_2yrs
earliest_cr_line
inq_last_6mths
mths_since_last_delinq
mths_since_last_record
open_acc
pub_rec
revol_bal
revol_util
total_acc
collections_12_mths_ex_med
mths_since_last_major_derog
acc_now_delinq
tot_coll_amt
tot_cur_bal
open_acc_6m
open_act_il
open_il_12m
open_il_24m
mths_since_rcnt_il
total_bal_il
il_util
open_rv_12m
open_rv_24m
max_bal_bc
all_util
total_rev_hi_lim
inq_fi
total_cu_tl
inq_last_12m
acc_open_past_24mths
avg_cur_bal
bc_open_to_buy
bc_util
chargeoff_within_12_mths
delinq_amnt
mo_sin_old_il_acct
mo_sin_old_rev_tl_op
mo_sin_rcnt_rev_tl_op
mo_sin_rcnt_tl
mort_acc
mths_since_recent_bc
mths_since_recent_bc_dlq
mths_since_recent_inq
mths_since_recent_revol_delinq
num_accts_ever_120_pd
num_actv_bc_tl
num_actv_rev_tl
num_bc_sats
num_bc_tl
num_il_tl
num_op_rev_tl
num_rev_accts
num_rev_tl_bal_gt_0
num_sats
num_tl_120dpd_2m

In [13]:
"default_label" in df_model.columns

True

In [14]:
df_model["default_label"].value_counts()


default_label
0    1972587
1     288081
Name: count, dtype: int64

In [15]:
print("Rows:", df_model.shape[0])
print("Columns:", df_model.shape[1])


Rows: 2260668
Columns: 83


In [16]:
leakage_check = [
    col for col in df_model.columns 
    if "pymnt" in col or "recover" in col or "hardship" in col
]

leakage_check


[]

In [17]:
df_model.dtypes.value_counts()


float64    69
str        11
int64       3
Name: count, dtype: int64

In [18]:
df_model.describe(include="all").T.head(10)


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
loan_amnt,2260668.0,,,,15046.931228,9190.245488,500.0,8000.0,12900.0,20000.0,40000.0
term,2260668.0,2.0,36 months,1609754.0,,,,,,,
int_rate,2260668.0,,,,13.092913,4.832114,5.31,9.49,12.62,15.99,30.99
installment,2260668.0,,,,445.807646,267.173725,4.93,251.65,377.99,593.32,1719.83
grade,2260668.0,7.0,B,663557.0,,,,,,,
sub_grade,2260668.0,35.0,C1,145903.0,,,,,,,
emp_title,2093699.0,512694.0,Teacher,38824.0,,,,,,,
emp_length,2113761.0,11.0,10+ years,748005.0,,,,,,,
home_ownership,2260668.0,6.0,MORTGAGE,1111450.0,,,,,,,
annual_inc,2260664.0,,,,77992.428687,112696.199574,0.0,46000.0,65000.0,93000.0,110000000.0


In [19]:
X = df_model.drop(columns=["default_label"])
y = df_model["default_label"]


In [20]:
print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (2260668, 82)
y shape: (2260668,)


In [21]:
X.dtypes

loan_amnt                       int64
term                              str
int_rate                      float64
installment                   float64
grade                             str
                               ...   
tax_liens                     float64
tot_hi_cred_lim               float64
total_bal_ex_mort             float64
total_bc_limit                float64
total_il_high_credit_limit    float64
Length: 82, dtype: object

In [22]:
categorical_features = X.select_dtypes(include=["object", "string"]).columns.tolist()
numerical_features = X.select_dtypes(exclude=["object", "string"]).columns.tolist()

print("Categorical features:", categorical_features)
print("\nNumber of categorical features:", len(categorical_features))

print("\nNumerical features:", numerical_features[:10], "...")
print("Number of numerical features:", len(numerical_features))


Categorical features: ['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'addr_state', 'earliest_cr_line']

Number of categorical features: 11

Numerical features: ['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc'] ...
Number of numerical features: 71


In [23]:
X.isnull().mean().sort_values(ascending=False).head(15) * 100


mths_since_last_record            84.112837
mths_since_recent_bc_dlq          77.011175
mths_since_last_major_derog       74.309585
mths_since_recent_revol_delinq    67.250432
mths_since_last_delinq            51.246003
il_util                           47.280273
mths_since_rcnt_il                40.250227
all_util                          38.322655
open_acc_6m                       38.313012
inq_last_12m                      38.313012
total_cu_tl                       38.313012
open_rv_12m                       38.312968
open_act_il                       38.312968
open_il_12m                       38.312968
total_bal_il                      38.312968
dtype: float64

In [24]:
from sklearn.impute import SimpleImputer

# Numerical imputer
num_imputer = SimpleImputer(strategy="median")

# Categorical imputer
cat_imputer = SimpleImputer(strategy="most_frequent")


In [25]:
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder(
    handle_unknown="ignore",
    min_frequency=0.01,   # rare categories grouped
    sparse_output=False
)


In [26]:
from sklearn.preprocessing import RobustScaler

num_scaler = RobustScaler()


In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [28]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [29]:
numeric_pipeline = Pipeline(steps=[
    ("imputer", num_imputer),
    ("scaler", num_scaler)
])


In [30]:
categorical_pipeline = Pipeline(steps=[
    ("imputer", cat_imputer),
    ("encoder", cat_encoder)
])


In [31]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numerical_features),
        ("cat", categorical_pipeline, categorical_features)
    ]
)


In [32]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [38]:
print("Processed train shape:", X_train_processed.shape)
print("Processed test shape:", X_test_processed.shape)


Processed train shape: (1808534, 218)
Processed test shape: (452134, 218)


In [34]:
from sklearn.linear_model import LogisticRegression


In [38]:
log_reg = LogisticRegression(
    max_iter=1000,
    class_weight="balanced"
)


In [39]:
log_reg.fit(X_train_processed, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [40]:
y_test_proba = log_reg.predict_proba(X_test_processed)[:, 1]


In [41]:
y_test_pred = log_reg.predict(X_test_processed)


In [42]:
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(y_test, y_test_proba)
print("ROC-AUC:", roc_auc)


ROC-AUC: 0.7133036824244445


In [43]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_test_pred)
cm


array([[253345, 141173],
       [ 19135,  38481]])

In [44]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_pred))


              precision    recall  f1-score   support

           0       0.93      0.64      0.76    394518
           1       0.21      0.67      0.32     57616

    accuracy                           0.65    452134
   macro avg       0.57      0.66      0.54    452134
weighted avg       0.84      0.65      0.70    452134



In [45]:
import numpy as np
from sklearn.metrics import precision_score, recall_score

thresholds = np.arange(0.2, 0.61, 0.05)

print("Threshold | Recall (Default) | Precision (Default)")
print("-"*50)

for t in thresholds:
    y_pred_tuned = (y_test_proba >= t).astype(int)
    
    recall = recall_score(y_test, y_pred_tuned)
    precision = precision_score(y_test, y_pred_tuned)
    
    print(f"{t:0.2f}      | {recall:0.3f}            | {precision:0.3f}")


Threshold | Recall (Default) | Precision (Default)
--------------------------------------------------
0.20      | 0.988            | 0.136
0.25      | 0.970            | 0.144
0.30      | 0.941            | 0.154
0.35      | 0.894            | 0.166
0.40      | 0.832            | 0.180
0.45      | 0.757            | 0.196
0.50      | 0.668            | 0.214
0.55      | 0.564            | 0.235
0.60      | 0.451            | 0.259


In [36]:
final_threshold = 0.40
y_test_pred_final = (y_test_proba >= final_threshold).astype(int)


NameError: name 'y_test_proba' is not defined

In [47]:
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(y_test, y_test_proba)
print("ROC-AUC:", roc_auc)


ROC-AUC: 0.7133036824244445


In [50]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test,y_test_pred_final)
cm


array([[175909, 218609],
       [  9658,  47958]])

In [51]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_test_pred_final))


              precision    recall  f1-score   support

           0       0.95      0.45      0.61    394518
           1       0.18      0.83      0.30     57616

    accuracy                           0.50    452134
   macro avg       0.56      0.64      0.45    452134
weighted avg       0.85      0.50      0.57    452134



In [33]:
from sklearn.ensemble import RandomForestClassifier


In [34]:
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    min_samples_split=50,
    min_samples_leaf=20,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)


In [None]:
rf_model.fit(X_train_processed, y_train)
