In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder,RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score,f1_score,classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBRFClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
data=pd.read_csv('loan_dataset.csv')

In [3]:
df=data.copy()
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,,No,9600000,29900000,12,778,2400000.0,17600000.0,,8000000.0,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,,2200000.0,8800000.0,3300000.0,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000.0,,33300000.0,12800000.0,Rejected
3,4,3,,No,8200000,30700000,8,467,18200000.0,3300000.0,23300000.0,7900000.0,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000.0,8200000.0,29400000.0,5000000.0,Rejected


In [4]:
df.drop('loan_id',axis=1,inplace=True)

In [5]:
df.isnull().sum()

no_of_dependents              0
education                   640
self_employed               341
income_annum                  0
loan_amount                   0
loan_term                     0
cibil_score                   0
residential_assets_value    384
commercial_assets_value     128
luxury_assets_value         256
bank_asset_value            426
loan_status                   0
dtype: int64

In [6]:
numerical=[]
categorical=[]
for col in df:
    if df[col].dtype=='O':
        categorical.append(col)
        
    else:
        numerical.append(col)
        

In [7]:
print(categorical)
print()
print(numerical)

['education', 'self_employed', 'loan_status']

['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']


# <center>====Handling Missing Values====<center> 

# <center>====Random Value Imputation On Categorical columns====<center>

In [8]:
df[categorical].isnull().sum()

education        640
self_employed    341
loan_status        0
dtype: int64

In [9]:
cat_missing_col=df[categorical].iloc[:,:-1]
cat_missing_col.head()

Unnamed: 0,education,self_employed
0,,No
1,Not Graduate,Yes
2,Graduate,No
3,,No
4,Not Graduate,Yes


In [10]:
def randomvalueimpuation_cat(data,columns):
    df=data
    for col in columns:
        non_missing=df[col].dropna().values
        df[col]=df[col].apply(lambda x:np.random.choice(non_missing)if pd.isnull(x) else x)

In [11]:
randomvalueimpuation_cat(df,cat_missing_col)

In [12]:
df[categorical].isnull().sum()

education        0
self_employed    0
loan_status      0
dtype: int64

# <center>====Random Value Imputation On Numerical columns====<center>

In [13]:
df[numerical].isnull().sum()

no_of_dependents              0
income_annum                  0
loan_amount                   0
loan_term                     0
cibil_score                   0
residential_assets_value    384
commercial_assets_value     128
luxury_assets_value         256
bank_asset_value            426
dtype: int64

In [14]:
num_missing_col=df[numerical].iloc[:,5:]
num_missing_col.head()

Unnamed: 0,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,2400000.0,17600000.0,,8000000.0
1,,2200000.0,8800000.0,3300000.0
2,7100000.0,,33300000.0,12800000.0
3,18200000.0,3300000.0,23300000.0,7900000.0
4,12400000.0,8200000.0,29400000.0,5000000.0


In [15]:
def randomvalueimputer_num(data,columns):
    df=data
    for col in columns:
        missing_vals=df[col].isnull().sum()
        pool=df[col].dropna().sample(missing_vals).values
        df[col][df[col].isnull()]=pool

In [16]:
randomvalueimputer_num(df,num_missing_col)

In [17]:
df[numerical].isnull().sum()

no_of_dependents            0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
dtype: int64

# <center>====Handling Outliers====<center>

In [18]:
outlier_cols=['residential_assets_value','commercial_assets_value','bank_asset_value']

In [19]:
def iqr_method(data,columns):
    data=df
    for col in columns:
        p25=df[col].quantile(0.25)
        p75=df[col].quantile(0.75)



        iqr=p75-p25
        uf=p75+1.5*iqr
        lf=p25-1.5*iqr

        df[(df[col]>uf) | (df[col]<lf)].shape

        df[(df[col]<uf) & (df[col]>lf)].shape

        df[col]=np.where(df[col]>uf,uf,np.where(df[col]<lf,lf,df[col]))




In [20]:
iqr_method(df,outlier_cols)

In [21]:
df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000.0,17050000.0,21300000.0,8000000.0,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,200000.0,2200000.0,8800000.0,3300000.0,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000.0,2400000.0,33300000.0,12800000.0,Rejected
3,3,Not Graduate,No,8200000,30700000,8,467,18200000.0,3300000.0,23300000.0,7900000.0,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000.0,8200000.0,29400000.0,5000000.0,Rejected


# <center>====Incoding and Scaling====<center>

In [22]:
one=OneHotEncoder()
rs=RobustScaler()

In [23]:
col_trans=ColumnTransformer(transformers=[('rs',RobustScaler(),numerical),
                                          ('ohe',OneHotEncoder(drop='first',handle_unknown='ignore')
                                           ,categorical)],
                                remainder='passthrough')

In [24]:
col_trans

In [25]:
df_inc=col_trans.fit_transform(df)

In [26]:
df_incoded=pd.DataFrame(df_inc,columns=col_trans.get_feature_names_out())
df_incoded.head()

Unnamed: 0,rs__no_of_dependents,rs__income_annum,rs__loan_amount,rs__loan_term,rs__cibil_score,rs__residential_assets_value,rs__commercial_assets_value,rs__luxury_assets_value,rs__bank_asset_value,ohe__education_ Not Graduate,ohe__self_employed_ Yes,ohe__loan_status_ Rejected
0,-0.333333,0.9375,1.115942,0.2,0.60339,-0.344444,2.119048,0.478873,0.744681,0.0,0.0,0.0
1,-1.0,-0.208333,-0.166667,-0.2,-0.620339,-0.588889,-0.238095,-0.401408,-0.255319,1.0,1.0,1.0
2,0.0,0.833333,1.101449,1.0,-0.318644,0.177778,-0.206349,1.323944,1.765957,0.0,0.0,1.0
3,0.0,0.645833,1.173913,-0.2,-0.450847,1.411111,-0.063492,0.619718,0.723404,1.0,0.0,1.0
4,0.666667,0.979167,0.702899,1.0,-0.738983,0.766667,0.714286,1.049296,0.106383,1.0,1.0,1.0


In [27]:
df_incoded.columns=df.columns

In [28]:
df_incoded.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,-0.333333,0.9375,1.115942,0.2,0.60339,-0.344444,2.119048,0.478873,0.744681,0.0,0.0,0.0
1,-1.0,-0.208333,-0.166667,-0.2,-0.620339,-0.588889,-0.238095,-0.401408,-0.255319,1.0,1.0,1.0
2,0.0,0.833333,1.101449,1.0,-0.318644,0.177778,-0.206349,1.323944,1.765957,0.0,0.0,1.0
3,0.0,0.645833,1.173913,-0.2,-0.450847,1.411111,-0.063492,0.619718,0.723404,1.0,0.0,1.0
4,0.666667,0.979167,0.702899,1.0,-0.738983,0.766667,0.714286,1.049296,0.106383,1.0,1.0,1.0


# <center>====Building Model====</center>

In [29]:
x=df_incoded.drop('loan_status',axis=1)
y=df_incoded['loan_status']

In [30]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.3)

In [31]:
x_train.shape,x_test.shape

((1280, 11), (2989, 11))

In [32]:
y_train.shape,y_test.shape

((1280,), (2989,))

In [33]:
rfc=RandomForestClassifier()

In [34]:
dict_param={
    'criterion':['gini','entropy'],
    'n_estimators':[100,200,300,400,500],
    'max_depth':[2,3,5,7],
    'max_leaf_nodes':[5,6,7],
    'max_features':['auto','sqrt','log2']
}

In [35]:
rns=RandomizedSearchCV(estimator=rfc,param_distributions=dict_param,n_iter=6,cv=10)

In [36]:
rns.fit(x_train,y_train)

In [37]:
y_pred=rns.predict(x_test)
y_pred

array([0., 0., 0., ..., 0., 1., 0.])

In [38]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       0.96      0.98      0.97      1822
         1.0       0.97      0.94      0.95      1167

    accuracy                           0.96      2989
   macro avg       0.96      0.96      0.96      2989
weighted avg       0.96      0.96      0.96      2989



In [39]:
print("best score:",rns.best_score_)
print()
print("Best params:",rns.best_params_)
print()
print("Best estimators:",rns.best_estimator_)



best score: 0.959375

Best params: {'n_estimators': 100, 'max_leaf_nodes': 7, 'max_features': 'sqrt', 'max_depth': 5, 'criterion': 'gini'}

Best estimators: RandomForestClassifier(max_depth=5, max_leaf_nodes=7)


# <center>Boosting Techniques<center>

In [40]:
xgc=XGBRFClassifier()

In [41]:
xgc.get_params()

{'colsample_bynode': 0.8,
 'learning_rate': 1.0,
 'reg_lambda': 1e-05,
 'subsample': 0.8,
 'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [42]:
xg_param={ 'n_estimatores': [2000,3000,4000,5000],
          'gamma': [0.5,0.8,0.1],
          'reg_lambda':[0.5,0.8,0.1]}

In [43]:
rns2=RandomizedSearchCV(estimator=xgc,param_distributions=xg_param,n_iter=6,cv=10)

In [44]:
rns2.fit(x_train,y_train)

In [45]:
y_pred2=rns2.predict(x_test)
y_pred2

array([0, 0, 0, ..., 0, 1, 0])

In [46]:
print(classification_report(y_pred2,y_test))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1859
           1       0.96      0.96      0.96      1130

    accuracy                           0.97      2989
   macro avg       0.97      0.97      0.97      2989
weighted avg       0.97      0.97      0.97      2989



In [47]:
print("best score:",rns2.best_score_)
print()
print("Best params:",rns2.best_params_)

best score: 0.97421875

Best params: {'reg_lambda': 0.1, 'n_estimatores': 2000, 'gamma': 0.1}


# <center>Gradient Boosting<center>

In [48]:
gb=GradientBoostingClassifier()

In [49]:
gb.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [50]:
gb_param={ 'n_estimators': [2000,3000,4000,5000],
          'learning_rate': [0.8,0.1],
          'max_depth':[5,9]}

In [51]:
rns3=RandomizedSearchCV(estimator=gb,param_distributions=gb_param,n_iter=6,cv=10)

In [None]:
rns3.fit(x_train,y_train)

In [None]:
y_pred3=rns3.predict(x_test)
y_pred3

In [None]:
print(classification_report(y_pred2,y_test))

In [None]:
print("best score:",rns3.best_score_)
print()
print("Best params:",rns3.best_params_)