In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score

from scipy.stats import shapiro

In [2]:
data_seti = pd.read_csv("hmelq.csv")

df = data_seti.copy()

In [3]:
df.head(5)

Unnamed: 0,bad,loan,mortdue,value,reason,job,yoj,derog,delinq,clage,ninq,clno,debtinc
0,0,81200,18834.0,108355.0,HomeImp,,28.0,0.0,0.0,139.14,0.0,14.0,34.042
1,0,12600,103960.0,127384.0,DebtCon,,2.0,0.0,0.0,129.02,0.0,25.0,34.479
2,0,18000,46865.0,61266.0,DebtCon,,5.0,0.0,0.0,102.59,2.0,9.0,26.354
3,0,10300,57676.0,71027.0,DebtCon,,19.0,0.0,0.0,157.52,1.0,11.0,33.992
4,0,9400,56508.0,78358.0,DebtCon,,17.0,0.0,0.0,141.93,0.0,11.0,32.327


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3576 entries, 0 to 3575
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   bad      3576 non-null   int64  
 1   loan     3576 non-null   int64  
 2   mortdue  3262 non-null   float64
 3   value    3512 non-null   float64
 4   reason   3429 non-null   object 
 5   job      3409 non-null   object 
 6   yoj      3264 non-null   float64
 7   derog    3149 non-null   float64
 8   delinq   3225 non-null   float64
 9   clage    3397 non-null   float64
 10  ninq     3273 non-null   float64
 11  clno     3443 non-null   float64
 12  debtinc  2809 non-null   float64
dtypes: float64(9), int64(2), object(2)
memory usage: 363.3+ KB


In [5]:
df.isnull().sum()

bad          0
loan         0
mortdue    314
value       64
reason     147
job        167
yoj        312
derog      427
delinq     351
clage      179
ninq       303
clno       133
debtinc    767
dtype: int64

In [6]:
df.fillna(df.median(), inplace = True)
df.dropna(inplace = True)

In [7]:
df.isnull().sum()

bad        0
loan       0
mortdue    0
value      0
reason     0
job        0
yoj        0
derog      0
delinq     0
clage      0
ninq       0
clno       0
debtinc    0
dtype: int64

In [8]:
df_dummies = pd.get_dummies(df)

In [9]:
df_dummies.corr()

Unnamed: 0,bad,loan,mortdue,value,yoj,derog,delinq,clage,ninq,clno,debtinc,reason_DebtCon,reason_HomeImp,job_Mgr,job_Office,job_Other,job_ProfEx,job_Sales,job_Self
bad,1.0,-0.083244,-0.059522,-0.061372,-0.04543,0.254349,0.333551,-0.148922,0.160814,-0.027184,0.167131,-0.055917,0.055917,0.024583,-0.076394,0.063319,-0.056909,0.051219,0.030791
loan,-0.083244,1.0,0.241816,0.339235,0.082577,0.010117,-0.033151,0.095183,0.054168,0.102584,0.066768,0.200356,-0.200356,0.023102,-0.021678,-0.049894,0.006499,-0.043737,0.157131
mortdue,-0.059522,0.241816,1.0,0.82074,-0.07827,-0.054934,-0.01709,0.13115,0.027367,0.331715,0.145296,0.019648,-0.019648,0.085975,-0.055934,-0.266232,0.237845,0.015349,0.119026
value,-0.061372,0.339235,0.82074,1.0,0.000794,-0.056186,-0.019705,0.172087,-0.003162,0.281464,0.10404,0.015158,-0.015158,0.041789,-0.049968,-0.278396,0.270216,0.011003,0.151905
yoj,-0.04543,0.082577,-0.07827,0.000794,1.0,-0.055844,0.072655,0.16685,-0.048569,0.030658,-0.05792,-0.040321,0.040321,0.004698,-0.034086,0.039819,0.005606,-0.024313,-0.041685
derog,0.254349,0.010117,-0.054934,-0.056186,-0.055844,1.0,0.174155,-0.082203,0.132061,0.03076,0.025664,-0.000385,0.000385,0.012556,-0.060142,0.056882,-0.032924,0.031759,-0.003194
delinq,0.333551,-0.033151,-0.01709,-0.019705,0.072655,0.174155,1.0,0.041785,0.061985,0.137431,0.057211,-0.020263,0.020263,0.028466,0.011049,-0.01648,-0.014837,-0.018696,0.017376
clage,-0.148922,0.095183,0.13115,0.172087,0.16685,-0.082203,0.041785,1.0,-0.107105,0.222985,-0.037834,-0.044446,0.044446,-0.032185,-0.009056,-0.074558,0.109945,0.037475,-0.000299
ninq,0.160814,0.054168,0.027367,-0.003162,-0.048569,0.132061,0.061985,-0.107105,1.0,0.080471,0.119793,0.126195,-0.126195,0.071213,-0.061436,0.068504,-0.079161,-0.035249,0.015943
clno,-0.027184,0.102584,0.331715,0.281464,0.030658,0.03076,0.137431,0.222985,0.080471,1.0,0.141077,0.103289,-0.103289,0.044844,-0.0057,-0.18185,0.149368,0.036469,0.050408


In [10]:
X = df_dummies.drop("bad", axis = 1)
y = df_dummies["bad"]
X_train , X_test, y_train, y_test = train_test_split(X, y , test_size = 0.25, random_state = 42)
logistic = LogisticRegression(solver="newton-cg")

In [11]:
logistic_model = logistic.fit(X_train, y_train)



In [12]:
y_pred = logistic_model.predict(X_test)

In [13]:
logistic_model.intercept_

array([-1.59841256])

In [14]:
logistic_model.coef_

array([[-1.83192688e-05, -3.16729873e-06,  1.67163134e-06,
        -9.19870869e-03,  4.42625880e-01,  7.53876417e-01,
        -5.15046683e-03,  1.67698835e-01, -1.22779396e-02,
         6.61245020e-02, -9.22438533e-01, -4.83314515e-01,
        -3.17396064e-01, -9.47157092e-01, -3.58274078e-01,
        -4.81868429e-01,  5.56120337e-01,  1.42822278e-01]])

In [15]:
logistic_roc_auc = roc_auc_score(y_test, y_pred)

In [16]:
logistic_roc_auc

0.631544196539107

In [125]:
confusion_matrix = confusion_matrix(y_test, y_pred)

In [126]:
confusion_matrix

array([[648,  21],
       [115,  48]], dtype=int64)

In [123]:
sonuc = metrics.accuracy_score(y_test, y_pred)

In [124]:
sonuc

0.8365384615384616

In [127]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.97      0.91       669
           1       0.70      0.29      0.41       163

    accuracy                           0.84       832
   macro avg       0.77      0.63      0.66       832
weighted avg       0.82      0.84      0.81       832



In [17]:
y_probs = logistic_model.predict_proba(X_test)[:,1]

In [18]:
y_probs[:12]

array([0.02096097, 0.28617709, 0.07672394, 0.13333938, 0.02759824,
       0.1107366 , 0.30119165, 0.13512214, 0.0423794 , 0.11432615,
       0.11680012, 0.30928581])

In [86]:
y_pred2 = [1 if i > 0.32 else 0 for i in y_probs]
y_pred_new = np.array(y_pred2)

In [87]:
confusion_matrix(y_test, y_pred_new)

array([[608,  61],
       [ 81,  82]], dtype=int64)

In [88]:
print(classification_report(y_test, y_pred_new))

              precision    recall  f1-score   support

           0       0.88      0.91      0.90       669
           1       0.57      0.50      0.54       163

    accuracy                           0.83       832
   macro avg       0.73      0.71      0.72       832
weighted avg       0.82      0.83      0.83       832



In [37]:
print(classification_report(y_test, y_pred_new))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       669
           1       0.59      0.46      0.52       163

    accuracy                           0.83       832
   macro avg       0.73      0.69      0.71       832
weighted avg       0.82      0.83      0.82       832



In [91]:
sonuc = metrics.accuracy_score(y_test, y_pred_new)

In [92]:
sonuc

0.8293269230769231

In [89]:
logistic_roc_auc = roc_auc_score(y_test, y_pred_new)

In [90]:
logistic_roc_auc

0.7059433088484781

In [152]:
cross_val_score(logistic_model, X_test, y_test, cv = 10).mean()



0.8353700516351118