In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Credit scoring sample hosted in the mlcourse.ai repository; fields are separated by ';'.
DATA_URL = 'https://raw.githubusercontent.com/Yorko/mlcourse.ai/master/data/credit_scoring_sample.csv'
data = pd.read_csv(DATA_URL, sep=';')

In [5]:
# Show the loaded frame; pandas abbreviates the middle rows in the rendered table.
data

Unnamed: 0,SeriousDlqin2yrs,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,0,64,0,0.249908,0,0,8158.0,0.0
1,0,58,0,3870.000000,0,0,,0.0
2,0,41,0,0.456127,0,0,6666.0,0.0
3,0,43,0,0.000190,0,0,10500.0,2.0
4,1,49,0,0.271820,0,0,400.0,0.0
...,...,...,...,...,...,...,...,...
45058,1,31,0,0.824725,0,0,3000.0,1.0
45059,0,49,0,6530.000000,0,0,0.0,5.0
45060,1,38,0,0.475841,0,0,3000.0,2.0
45061,0,47,1,0.485198,0,0,11720.0,5.0


In [6]:
# Separate the target column from the feature columns.
TARGET = 'SeriousDlqin2yrs'
y = data[TARGET]
X = data.drop(columns=[TARGET])
X.shape, y.shape

((45063, 7), (45063,))

In [8]:
# Count missing values per feature column.
X.isna().sum()

age                                        0
NumberOfTime30-59DaysPastDueNotWorse       0
DebtRatio                                  0
NumberOfTimes90DaysLate                    0
NumberOfTime60-89DaysPastDueNotWorse       0
MonthlyIncome                           8643
NumberOfDependents                      1117
dtype: int64

In [11]:
# Fill the gaps in the two incomplete columns with each column's own median.
# NOTE(review): the medians are taken over all rows, i.e. before the
# train/validation split performed later, so validation rows influence the
# imputed values.
for column in ('MonthlyIncome', 'NumberOfDependents'):
    X[column] = X[column].fillna(X[column].median())

In [14]:
# Re-check dtypes and per-column missing counts after the median fill.
# X.info() prints its report and returns None, which is why the displayed tuple starts with None.
X.info() , X.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45063 entries, 0 to 45062
Data columns (total 7 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   age                                   45063 non-null  int64  
 1   NumberOfTime30-59DaysPastDueNotWorse  45063 non-null  int64  
 2   DebtRatio                             45063 non-null  float64
 3   NumberOfTimes90DaysLate               45063 non-null  int64  
 4   NumberOfTime60-89DaysPastDueNotWorse  45063 non-null  int64  
 5   MonthlyIncome                         45063 non-null  float64
 6   NumberOfDependents                    45063 non-null  float64
dtypes: float64(3), int64(4)
memory usage: 2.4 MB


(None,
 age                                     0
 NumberOfTime30-59DaysPastDueNotWorse    0
 DebtRatio                               0
 NumberOfTimes90DaysLate                 0
 NumberOfTime60-89DaysPastDueNotWorse    0
 MonthlyIncome                           0
 NumberOfDependents                      0
 dtype: int64)

In [16]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
splits = train_test_split(X, y, test_size=0.3, random_state=17)
X_train, X_valid, y_train, y_valid = splits
tuple(part.shape for part in splits)

((31544, 7), (13519, 7), (31544,), (13519,))

In [36]:
%%time
logit = LogisticRegression(random_state=17)
logit.fit(X_train,y_train)
print('Roc auc score : %f'%roc_auc_score(y_valid,logit.predict(X_valid)))
print('Accuracy score : %f'%accuracy_score(y_valid,logit.predict(X_valid)))

Roc auc score : 0.546023
Accuracy score : 0.794659
Wall time: 164 ms


In [37]:
%%time
params = {
    'C' : [0.0001,0.001,0.01,0.1,1,10]
}
grid_search_cv_log = GridSearchCV(estimator=LogisticRegression(random_state=17),param_grid=params,\
                                  n_jobs=-1,cv=10)
grid_search_cv_log.fit(X_train,y_train)
print('Roc auc score : %f'%roc_auc_score(y_valid,grid_search_cv_log.predict(X_valid)))
print('Accuracy score : %f'%accuracy_score(y_valid,grid_search_cv_log.predict(X_valid)))
print(grid_search_cv_log.best_estimator_)

Roc auc score : 0.560320
Accuracy score : 0.799467
LogisticRegression(C=0.01, random_state=17)
Wall time: 3.04 s


In [38]:
%%time
forest = RandomForestClassifier(random_state=17)
forest.fit(X_train,y_train)
print('Roc auc score : %f'%roc_auc_score(y_valid,forest.predict(X_valid)))
print('Accuracy score : %f'%accuracy_score(y_valid,forest.predict(X_valid)))

Roc auc score : 0.703966
Accuracy score : 0.827724
Wall time: 3.12 s


In [39]:
%%time 
params = {
    'max_depth':range(1,22,5),
    'n_estimators':range(50,201,50),
    'max_features':range(1,32,5)
}
grid_seacrh_cv_forest = GridSearchCV(estimator=RandomForestClassifier(random_state=17),param_grid=params,n_jobs=-1,cv=10)
grid_seacrh_cv_forest.fit(X_train,y_train)
print('Roc auc score : %f'%roc_auc_score(y_valid,grid_seacrh_cv_forest.predict(X_valid)))
print('Accuracy score : %f'%accuracy_score(y_valid,grid_seacrh_cv_forest.predict(X_valid)))

Roc auc score : 0.713695
Accuracy score : 0.840077
Wall time: 2min 44s


In [44]:
%%time 
params = {
    'max_depth':range(1,22,5),
    'n_estimators':range(50,201,50),
    'max_features':range(1,32,5)
}
random_search_cv_forest = RandomizedSearchCV(estimator = RandomForestClassifier(random_state=17),param_distributions = params,\
                                             n_jobs=-1,cv=10)
random_search_cv_forest.fit(X_train,y_train)
print('Roc auc score : %f'%roc_auc_score(y_valid,random_search_cv_forest.predict(X_valid)))
print('Accuracy score : %f'%accuracy_score(y_valid,random_search_cv_forest.predict(X_valid)))

Roc auc score : 0.713039
Accuracy score : 0.838302
Wall time: 15.6 s


In [48]:
%%time
bagging = BaggingClassifier(base_estimator=LogisticRegression(random_state=17),n_jobs=-1)
bagging.fit(X_train,y_train)
print('Roc auc score : %f'%roc_auc_score(y_valid,bagging.predict(X_valid)))
print('Accuracy score : %f'%accuracy_score(y_valid,bagging.predict(X_valid)))

Roc auc score : 0.561746
Accuracy score : 0.801317
Wall time: 1.34 s


In [56]:
%%time
param = {
    "max_features": [2, 3, 4],
    "max_samples": [0.5, 0.7, 0.9],
    'n_estimators':range(50,251,50)
}
random_search_cv_bagging = RandomizedSearchCV(\
                                              estimator=BaggingClassifier(base_estimator = BaggingClassifier\
                                                                          (base_estimator=LogisticRegression\
                                                                           (random_state=17))),
                                             param_distributions=param,n_jobs=-1,cv=10)
random_search_cv_bagging.fit(X_train,y_train)
print('Roc auc score : %f'%roc_auc_score(y_valid,random_search_cv_bagging.predict(X_valid)))
print('Accuracy score : %f'%accuracy_score(y_valid,random_search_cv_bagging.predict(X_valid)))

Roc auc score : 0.528293
Accuracy score : 0.790517
Wall time: 35min 3s
