In [175]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier , RandomForestClassifier , VotingClassifier
from sklearn.pipeline import Pipeline , make_pipeline
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler , MinMaxScaler , LabelEncoder , OneHotEncoder
from sklearn.metrics import confusion_matrix , classification_report , accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [176]:
df = pd.read_csv('credit_risk_dataset.csv')
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [177]:
duplicated_values = df.duplicated()
df[duplicated_values]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
15975,23,42000,RENT,5.0,VENTURE,B,6000,9.99,0,0.14,N,4
15989,23,90000,MORTGAGE,7.0,EDUCATION,B,8000,10.36,0,0.09,N,3
15995,24,48000,MORTGAGE,4.0,MEDICAL,A,4000,5.42,0,0.08,N,4
16025,24,10000,RENT,8.0,PERSONAL,A,3000,7.90,1,0.30,N,3
16028,23,100000,MORTGAGE,7.0,EDUCATION,A,15000,7.88,0,0.15,N,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32010,42,39996,MORTGAGE,2.0,HOMEIMPROVEMENT,A,2500,5.42,0,0.06,N,12
32047,36,250000,RENT,2.0,DEBTCONSOLIDATION,A,20000,7.88,0,0.08,N,17
32172,49,120000,MORTGAGE,12.0,MEDICAL,B,12000,10.99,0,0.10,N,12
32259,39,40000,OWN,4.0,VENTURE,B,1000,10.37,0,0.03,N,16


In [178]:
df.query("person_age==23 & person_income==42000 &\
person_home_ownership=='RENT' & loan_int_rate==9.99")

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
6464,23,42000,RENT,5.0,VENTURE,B,6000,9.99,0,0.14,N,4
15975,23,42000,RENT,5.0,VENTURE,B,6000,9.99,0,0.14,N,4


In [179]:
print(df.shape)

(32581, 12)


In [180]:
df.drop_duplicates(inplace=True)
df.shape

(32416, 12)

In [181]:
df['loan_status'].value_counts()

loan_status
0    25327
1     7089
Name: count, dtype: int64

In [182]:
# df.isna().sum() * 100 / df.shape[0]

null_value = df.isna().sum()
total_null_value = df.shape[0]

np.round(null_value * 100 / total_null_value , 2)

person_age                    0.00
person_income                 0.00
person_home_ownership         0.00
person_emp_length             2.74
loan_intent                   0.00
loan_grade                    0.00
loan_amnt                     0.00
loan_int_rate                 9.55
loan_status                   0.00
loan_percent_income           0.00
cb_person_default_on_file     0.00
cb_person_cred_hist_length    0.00
dtype: float64

In [183]:
print("NUMBER-OF-DATA =",df.shape[0])
print("AFTER DROPING ALL NULL-VALUES =",df.dropna().shape[0])

NUMBER-OF-DATA = 32416
AFTER DROPING ALL NULL-VALUES = 28501


In [184]:
# HOW MUCH DATA WE LOST
total_number_of_data = 32416
after_drop = 28501

calculate = (total_number_of_data - after_drop) / total_number_of_data
print("We Lost {:.2f} Percentage Data When We Drop Null-Values".format(calculate))

We Lost 0.12 Percentage Data When We Drop Null-Values


In [185]:
df['person_age'].unique()

array([ 22,  21,  25,  23,  24,  26, 144, 123,  20,  32,  34,  29,  33,
        28,  35,  31,  27,  30,  36,  40,  50,  45,  37,  39,  44,  43,
        41,  46,  38,  47,  42,  48,  49,  58,  65,  51,  53,  66,  61,
        54,  57,  59,  62,  60,  55,  52,  64,  70,  78,  69,  56,  73,
        63,  94,  80,  84,  76,  67], dtype=int64)

# Observation:

- Outlier in person_age [144 , 123]

In [186]:
df['person_emp_length'].unique()

array([123.,   5.,   1.,   4.,   8.,   2.,   6.,   7.,   0.,   9.,   3.,
        10.,  nan,  11.,  18.,  12.,  17.,  14.,  16.,  13.,  19.,  15.,
        20.,  22.,  21.,  24.,  23.,  26.,  25.,  27.,  28.,  31.,  41.,
        34.,  29.,  38.,  30.])

# Observation :

- Outlier in person_emp_length

In [187]:
df = df.loc[df['person_age'] < 80]
df['person_age'].unique()

array([22, 21, 25, 23, 24, 26, 20, 32, 34, 29, 33, 28, 35, 31, 27, 30, 36,
       40, 50, 45, 37, 39, 44, 43, 41, 46, 38, 47, 42, 48, 49, 58, 65, 51,
       53, 66, 61, 54, 57, 59, 62, 60, 55, 52, 64, 70, 78, 69, 56, 73, 63,
       76, 67], dtype=int64)

In [188]:
df.shape

(32408, 12)

# WE ASSUME THAT 80 + AGE PERSON NOT TAKE LOAN SO WE REMOVE THEM.

In [189]:
df.query("person_age<=person_emp_length+14")

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
210,21,192000,MORTGAGE,123.0,VENTURE,A,20000,6.54,0,0.1,N,4


In [190]:
df = df.loc[(df['person_emp_length'] < 66) | (df['person_emp_length'].isna()) , :]
df.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              887
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3093
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

- WE DO NOT REMOVE OUTLIER IN PERSON_EMO_LENGTH THAT WHY WE USE | (df['person_emp_length'].isna()) .
- REMOVE DATA THAT IS NOT IMPORTANT

In [191]:
df.shape

(32406, 12)

In [192]:
df.drop('loan_percent_income',axis=1,inplace=True)
df.shape

(32406, 11)

# REMOVE OUTLIER THROUGH IQR METHOD

In [193]:
Q1 = df['person_age'].quantile(0.25)
Q3 = df['person_age'].quantile(0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_filtered = df[(df["person_age"] >= lower_bound) & (df["person_age"] <= upper_bound)]

In [194]:
print("BEFORE IQR METHOD :",df['person_age'].nunique())
print("AFTER IQR METHOD :",df_filtered['person_age'].nunique())

BEFORE IQR METHOD : 53
AFTER IQR METHOD : 21


# FOR COLUMNS THAT HAVE OUTLIER

In [195]:
columns = [col for col in df.columns if df[col].dtype != '0']
columns = ['person_age',
 'person_income',
 'person_emp_length',
 'loan_amnt',
 'loan_int_rate',
 'cb_person_cred_hist_length']

In [196]:
Q1 = df[columns].quantile(0.25)
Q3 = df[columns].quantile(0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

for column in columns:
    df = df[(df[column] >= lower_bound[column]) & (df[column] <= upper_bound[column])]

In [197]:
df.shape

(24003, 11)

In [198]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,cb_person_default_on_file,cb_person_cred_hist_length
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,N,3
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,N,2
9,21,10000,OWN,6.0,VENTURE,D,1600,14.74,1,N,3
11,21,10000,OWN,2.0,HOMEIMPROVEMENT,A,4500,8.63,1,N,2


In [199]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df , title='updated_df' ,explorative=True)
profile.to_file('updated_out_put.html')



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [200]:
df.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [201]:
numerical_columns = [col for col in df.columns if df[col].dtype != 'O']
categoricl_column = [col for col in df.columns if df[col].dtype == 'O']

In [202]:
categoricl_column

['person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file']

In [203]:
# remove loan_status
numerical_columns = ['person_age',
 'person_income',
 'person_emp_length',
 'loan_amnt',
 'loan_int_rate',
 'cb_person_cred_hist_length']

In [209]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [211]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categoricl_column)
    ])

In [212]:
x = df.drop('loan_status',axis=1)
y = df['loan_status']

X , X_test , Y , Y_TEST = train_test_split(x,y,test_size=0.2,random_state=42)

In [213]:
X = preprocessor.fit_transform(X)
X_test = preprocessor.transform(X_test)

# ML Classification Models

In [216]:
model_list = {
    "LogisticRegression" : LogisticRegression(),
    "SVC" : SVC(),
    "XGBClassifier" : XGBClassifier(),
    "CatBoostClassifier" : CatBoostClassifier()
}

In [218]:
for name , model in model_list.items():
    
    model = model.fit(X,Y)
    y_pred = model.predict(X_test)
    
    print('MODEL NAME :',name)
    print('MODEL ACCURACY :')
    print(accuracy_score(Y_TEST,y_pred))
    
    print("*"*55)

MODEL NAME : LogisticRegression
MODEL ACCURACY :
0.8521141428868986
*******************************************************
/
MODEL NAME : SVC
MODEL ACCURACY :
0.9073109768798167
*******************************************************
/
MODEL NAME : XGBClassifier
MODEL ACCURACY :
0.9300145802957717
*******************************************************
/
Learning rate set to 0.036385
0:	learn: 0.6663978	total: 158ms	remaining: 2m 37s
1:	learn: 0.6387838	total: 166ms	remaining: 1m 22s
2:	learn: 0.6120911	total: 175ms	remaining: 58.1s
3:	learn: 0.5885878	total: 183ms	remaining: 45.6s
4:	learn: 0.5676563	total: 192ms	remaining: 38.3s
5:	learn: 0.5493314	total: 202ms	remaining: 33.4s
6:	learn: 0.5335673	total: 213ms	remaining: 30.2s
7:	learn: 0.5166297	total: 223ms	remaining: 27.6s
8:	learn: 0.5033563	total: 232ms	remaining: 25.6s
9:	learn: 0.4890025	total: 241ms	remaining: 23.8s
10:	learn: 0.4768010	total: 250ms	remaining: 22.5s
11:	learn: 0.4663005	total: 258ms	remaining: 21.3s
12:	lear

In [220]:
model = model_list['XGBClassifier']

In [221]:
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7]
}


serch = RandomizedSearchCV(estimator=model , param_distributions=param_grid, cv=3 ,  scoring='accuracy')

In [223]:
serch.fit(X,Y)

In [227]:
serch.best_params_

{'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.1}

In [228]:
model = serch.best_estimator_
model.fit(X,Y)

In [231]:
prediction = model.predict(X_test)
score = accuracy_score(Y_TEST,prediction) * 100
print('SCORE :',score)

SCORE : 92.93897104769839
