In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier , RandomForestClassifier , VotingClassifier
from sklearn.pipeline import Pipeline , make_pipeline
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV , StratifiedKFold
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler , MinMaxScaler , LabelEncoder , OneHotEncoder
from sklearn.metrics import confusion_matrix , classification_report , accuracy_score , f1_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [4]:
df = pd.read_csv('credit_risk_dataset.csv')
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [5]:
duplicated_values = df.duplicated()
df[duplicated_values]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
15975,23,42000,RENT,5.0,VENTURE,B,6000,9.99,0,0.14,N,4
15989,23,90000,MORTGAGE,7.0,EDUCATION,B,8000,10.36,0,0.09,N,3
15995,24,48000,MORTGAGE,4.0,MEDICAL,A,4000,5.42,0,0.08,N,4
16025,24,10000,RENT,8.0,PERSONAL,A,3000,7.90,1,0.30,N,3
16028,23,100000,MORTGAGE,7.0,EDUCATION,A,15000,7.88,0,0.15,N,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32010,42,39996,MORTGAGE,2.0,HOMEIMPROVEMENT,A,2500,5.42,0,0.06,N,12
32047,36,250000,RENT,2.0,DEBTCONSOLIDATION,A,20000,7.88,0,0.08,N,17
32172,49,120000,MORTGAGE,12.0,MEDICAL,B,12000,10.99,0,0.10,N,12
32259,39,40000,OWN,4.0,VENTURE,B,1000,10.37,0,0.03,N,16


In [6]:
df.query("person_age==23 & person_income==42000 &\
person_home_ownership=='RENT' & loan_int_rate==9.99")

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
6464,23,42000,RENT,5.0,VENTURE,B,6000,9.99,0,0.14,N,4
15975,23,42000,RENT,5.0,VENTURE,B,6000,9.99,0,0.14,N,4


In [7]:
print(df.shape)

(32581, 12)


In [8]:
df.drop_duplicates(inplace=True)
df.shape

(32416, 12)

In [9]:
df['loan_status'].value_counts()

0    25327
1     7089
Name: loan_status, dtype: int64

In [10]:
# df.isna().sum() * 100 / df.shape[0]

null_value = df.isna().sum()
total_null_value = df.shape[0]

np.round(null_value * 100 / total_null_value , 2)

person_age                    0.00
person_income                 0.00
person_home_ownership         0.00
person_emp_length             2.74
loan_intent                   0.00
loan_grade                    0.00
loan_amnt                     0.00
loan_int_rate                 9.55
loan_status                   0.00
loan_percent_income           0.00
cb_person_default_on_file     0.00
cb_person_cred_hist_length    0.00
dtype: float64

In [11]:
print("NUMBER-OF-DATA =",df.shape[0])
print("AFTER DROPING ALL NULL-VALUES =",df.dropna().shape[0])

NUMBER-OF-DATA = 32416
AFTER DROPING ALL NULL-VALUES = 28501


In [12]:
# HOW MUCH DATA WE LOST
total_number_of_data = 32416
after_drop = 28501

calculate = (total_number_of_data - after_drop) / total_number_of_data
print("We Lost {:.2f} Percentage Data When We Drop Null-Values".format(calculate))

We Lost 0.12 Percentage Data When We Drop Null-Values


In [13]:
df['person_age'].unique()

array([ 22,  21,  25,  23,  24,  26, 144, 123,  20,  32,  34,  29,  33,
        28,  35,  31,  27,  30,  36,  40,  50,  45,  37,  39,  44,  43,
        41,  46,  38,  47,  42,  48,  49,  58,  65,  51,  53,  66,  61,
        54,  57,  59,  62,  60,  55,  52,  64,  70,  78,  69,  56,  73,
        63,  94,  80,  84,  76,  67])

# Observation:

- Outlier in person_age [144 , 123]

In [14]:
df['person_emp_length'].unique()

array([123.,   5.,   1.,   4.,   8.,   2.,   6.,   7.,   0.,   9.,   3.,
        10.,  nan,  11.,  18.,  12.,  17.,  14.,  16.,  13.,  19.,  15.,
        20.,  22.,  21.,  24.,  23.,  26.,  25.,  27.,  28.,  31.,  41.,
        34.,  29.,  38.,  30.])

# Observation :

- Outlier in person_emp_length

In [15]:
df = df.loc[df['person_age'] < 80]
df['person_age'].unique()

array([22, 21, 25, 23, 24, 26, 20, 32, 34, 29, 33, 28, 35, 31, 27, 30, 36,
       40, 50, 45, 37, 39, 44, 43, 41, 46, 38, 47, 42, 48, 49, 58, 65, 51,
       53, 66, 61, 54, 57, 59, 62, 60, 55, 52, 64, 70, 78, 69, 56, 73, 63,
       76, 67])

In [16]:
df.shape

(32408, 12)

# WE ASSUME THAT 80 + AGE PERSON NOT TAKE LOAN SO WE REMOVE THEM.

In [17]:
df.query("person_age<=person_emp_length+14")

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
210,21,192000,MORTGAGE,123.0,VENTURE,A,20000,6.54,0,0.1,N,4


In [18]:
df = df.loc[(df['person_emp_length'] < 66) | (df['person_emp_length'].isna()) , :]
df.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              887
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3093
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

- WE DO NOT REMOVE OUTLIER IN PERSON_EMO_LENGTH THAT WHY WE USE | (df['person_emp_length'].isna()) .
- REMOVE DATA THAT IS NOT IMPORTANT

In [19]:
df.shape

(32406, 12)

In [20]:
df.drop('loan_percent_income',axis=1,inplace=True)
df.shape

(32406, 11)

# REMOVE OUTLIER THROUGH IQR METHOD

In [21]:
Q1 = df['person_age'].quantile(0.25)
Q3 = df['person_age'].quantile(0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_filtered = df[(df["person_age"] >= lower_bound) & (df["person_age"] <= upper_bound)]

In [22]:
print("BEFORE IQR METHOD :",df['person_age'].nunique())
print("AFTER IQR METHOD :",df_filtered['person_age'].nunique())

BEFORE IQR METHOD : 53
AFTER IQR METHOD : 21


# FOR COLUMNS THAT HAVE OUTLIER

In [23]:
columns = [col for col in df.columns if df[col].dtype != '0']
columns = ['person_age',
 'person_income',
 'person_emp_length',
 'loan_amnt',
 'loan_int_rate',
 'cb_person_cred_hist_length']

In [24]:
Q1 = df[columns].quantile(0.25)
Q3 = df[columns].quantile(0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

for column in columns:
    df = df[(df[column] >= lower_bound[column]) & (df[column] <= upper_bound[column])]

In [25]:
df.shape

(24003, 11)

In [26]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,cb_person_default_on_file,cb_person_cred_hist_length
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,N,3
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,N,2
9,21,10000,OWN,6.0,VENTURE,D,1600,14.74,1,N,3
11,21,10000,OWN,2.0,HOMEIMPROVEMENT,A,4500,8.63,1,N,2


In [44]:
# from ydata_profiling import ProfileReport

#profile = ProfileReport(df , title='updated_df' ,explorative=True)
#profile.to_file('updated_out_put.html')

In [28]:
df.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [29]:
numerical_columns = [col for col in df.columns if df[col].dtype != 'O']
categoricl_column = [col for col in df.columns if df[col].dtype == 'O']

In [30]:
categoricl_column

['person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file']

In [31]:
# remove loan_status
numerical_columns = ['person_age',
 'person_income',
 'person_emp_length',
 'loan_amnt',
 'loan_int_rate',
 'cb_person_cred_hist_length']

In [32]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [33]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categoricl_column)
    ])

In [34]:
x = df.drop('loan_status',axis=1)
y = df['loan_status']

X , X_test , Y , Y_TEST = train_test_split(x,y,test_size=0.2,random_state=42)

In [35]:
X = preprocessor.fit_transform(X)
X_test = preprocessor.transform(X_test)

# ML Classification Models

In [36]:
model_list = {
    "LogisticRegression" : LogisticRegression(),
    "SVC" : SVC(),
    "XGBClassifier" : XGBClassifier(),
    "CatBoostClassifier" : CatBoostClassifier()
}

In [37]:
for name , model in model_list.items():

    model = model.fit(X,Y)
    y_pred = model.predict(X_test)

    print('MODEL NAME :',name)
    print('MODEL ACCURACY :')
    print(accuracy_score(Y_TEST,y_pred))

    print("*"*55)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


MODEL NAME : LogisticRegression
MODEL ACCURACY :
0.8516975630077067
*******************************************************
MODEL NAME : SVC
MODEL ACCURACY :
0.9073109768798167
*******************************************************
MODEL NAME : XGBClassifier
MODEL ACCURACY :
0.9300145802957717
*******************************************************
Learning rate set to 0.036385
0:	learn: 0.6663998	total: 53.9ms	remaining: 53.8s
1:	learn: 0.6387821	total: 60.2ms	remaining: 30s
2:	learn: 0.6120906	total: 65.9ms	remaining: 21.9s
3:	learn: 0.5885885	total: 71.6ms	remaining: 17.8s
4:	learn: 0.5676565	total: 77.2ms	remaining: 15.4s
5:	learn: 0.5493307	total: 83.2ms	remaining: 13.8s
6:	learn: 0.5335667	total: 88.8ms	remaining: 12.6s
7:	learn: 0.5166294	total: 95ms	remaining: 11.8s
8:	learn: 0.5033562	total: 102ms	remaining: 11.2s
9:	learn: 0.4890026	total: 108ms	remaining: 10.7s
10:	learn: 0.4768012	total: 113ms	remaining: 10.2s
11:	learn: 0.4663008	total: 119ms	remaining: 9.83s
12:	learn: 0

In [38]:
model = model_list['XGBClassifier']

In [88]:
model_XGB = XGBClassifier()

param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

cv = StratifiedKFold(n_splits=5 , shuffle=True , random_state=42)
serch = RandomizedSearchCV(estimator=model_XGB , param_distributions=param_grid, cv=cv ,  scoring='f1' , random_state=42 , n_iter=50)

In [89]:
serch.fit(X,Y)

In [90]:
serch.best_params_

{'subsample': 1.0,
 'n_estimators': 200,
 'max_depth': 7,
 'learning_rate': 0.1,
 'colsample_bytree': 1.0}

In [91]:
model = serch.best_estimator_
model.fit(X,Y)

In [92]:
prediction = model.predict(X_test)
score = accuracy_score(Y_TEST,prediction) * 100
print('SCORE :',score)

SCORE : 92.77233909602167


In [93]:
f1 = f1_score(Y_TEST,prediction)
print('F1-SCORE :',f1)

F1-SCORE : 0.8133405056481979


In [85]:
confusion_matrix(Y_TEST,prediction)

array([[3698,   37],
       [ 310,  756]])

In [71]:
pd.crosstab(index=Y_TEST,columns=prediction , rownames=['ACTUAL'] , colnames=['PREDICTED'])

PREDICTED,0,1
ACTUAL,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3703,32
1,308,758


# CONVERT IN TO REAL FORMAT FOR UNDERSTAND

- TP = 758 FN = 308
- FP = 32  TN = 3703

In [94]:
model_cat_boost = CatBoostClassifier()

param_grid_cat = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'iterations': [100, 200, 300],
    'l2_leaf_reg': [1, 3, 5],
}

random_serch_cat_boost = RandomizedSearchCV(estimator=model_cat_boost , param_distributions=param_grid_cat ,
                                            cv = 5 , scoring='f1' , n_iter=50)

In [None]:
random_serch_cat_boost.fit(X,Y)

In [98]:
best_params = random_serch_cat_boost.best_params_
print("Best Hyperparameters:", best_params)

best_model = random_serch_cat_boost.best_estimator_
test_accuracy = best_model.score(X, Y)
print("Test Accuracy:", test_accuracy)

Best Hyperparameters: {'learning_rate': 0.1, 'l2_leaf_reg': 1, 'iterations': 200, 'depth': 6}
Test Accuracy: 0.9403187168003333


In [None]:
cat_model = random_serch_cat_boost.best_estimator_
cat_model.fit(X,Y)

In [122]:
Y_PRED_cat = cat_model.predict(X_test)

accuracy = accuracy_score(Y_TEST,Y_PRED_cat)
f1 = f1_score(Y_TEST,Y_PRED_cat)
c_report = classification_report(Y_TEST,Y_PRED_cat)

print('ACCURACY :',accuracy)
print('F1-SCORE :',f1)
print('CLASSIFICATION-REPORT :')
print(c_report)

ACCURACY : 0.926890231201833
F1-SCORE : 0.8097560975609757
CLASSIFICATION-REPORT :
              precision    recall  f1-score   support

           0       0.92      0.99      0.95      3735
           1       0.96      0.70      0.81      1066

    accuracy                           0.93      4801
   macro avg       0.94      0.85      0.88      4801
weighted avg       0.93      0.93      0.92      4801



In [123]:
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from scipy.stats import randint as sp_randint

# Define the hyperparameter search space
param_dist = {
    'depth': sp_randint(3, 10),  # Depth of the trees
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],  # Learning rate
    'l2_leaf_reg': sp_randint(1, 10),  # L2 regularization coefficient
    'iterations': sp_randint(100, 500),  # Number of boosting iterations
    'subsample': [0.5, 0.75, 1.0],  # Fraction of samples used for training each tree
}

cat_boost = CatBoostClassifier()

# Instantiate RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=cat_boost, param_distributions=param_dist, n_iter=50, cv=5, scoring='accuracy')

In [None]:
# Perform RandomizedSearchCV
random_search.fit(X, Y)

In [126]:
best_params = random_search.best_params_
print('Best-Hyperparameters :',best_params)

Best-Hyperparameters : {'depth': 5, 'iterations': 370, 'l2_leaf_reg': 8, 'learning_rate': 0.1, 'subsample': 0.5}


In [None]:
best_model_cat_boost = random_search.best_estimator_
best_model_cat_boost.fit(X,Y)

In [131]:
y_pred_cat_boost = best_model_cat_boost.predict(X_test)
acc = accuracy_score(Y_TEST,y_pred_cat_boost)
print('CAT-BOOST-ACCURACY :',acc * 100)

CAT-BOOST-ACCURACY : 92.85565507186003


In [132]:
Y_PRED_cat = best_model_cat_boost.predict(X_test)

accuracy = accuracy_score(Y_TEST,Y_PRED_cat)
f1 = f1_score(Y_TEST,Y_PRED_cat)
c_report = classification_report(Y_TEST,Y_PRED_cat)

print('ACCURACY :',accuracy)
print('F1-SCORE :',f1)
print('CLASSIFICATION-REPORT :')
print(c_report)

ACCURACY : 0.9285565507186003
F1-SCORE : 0.8148947652455477
CLASSIFICATION-REPORT :
              precision    recall  f1-score   support

           0       0.92      0.99      0.96      3735
           1       0.96      0.71      0.81      1066

    accuracy                           0.93      4801
   macro avg       0.94      0.85      0.89      4801
weighted avg       0.93      0.93      0.92      4801



In [134]:
X_train , X_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# UPDATE PIPELINE
# ADD MODEL IN PIPELINE

pipeline = Pipeline(steps=[
    ("preprocessor" , preprocessor),
    ("cat_boost" , best_model_cat_boost)
    ])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

In [138]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

f1_score = f1_score(y_test,y_pred)
print('F1-SCORE :',f1_score)

Accuracy: 0.9285565507186003
F1-SCORE : 0.8148947652455477


In [140]:
import joblib

joblib.dump(pipeline , "pipeline.joblib")

['pipeline.joblib']