Import libraries and functions

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression


Import dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_excel("/content/drive/My Drive/Credit Scoring Project/Credit Scoring Dataset.xlsx")

Data Preparation

In [4]:
data.shape

(3000, 30)

In [5]:
data.head()

Unnamed: 0,TARGET,ID,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1,582,3,3,0,4,0.0,5,117,27,...,3.0,0.9179,0.2083,2,3,7,0.2083,4,4,0.0
1,1,662,15,9,0,3,1.0,3,14,14,...,1.0,0.8,0.0,0,0,0,1.0,12,0,1.0
2,1,805,0,0,0,1,5.0,1,354,7,...,5.0,0.3552,0.6538,0,1,1,0.7308,1,1,0.5263
3,1,1175,8,5,0,6,1.0,10,16,4,...,3.0,0.9127,0.25,1,1,1,0.75,7,1,1.3333
4,1,1373,3,1,0,9,0.0,8,130,52,...,1.0,1.2511,0.0,0,1,4,0.1429,3,1,0.0


In [6]:
#drop the cusomer ID because it is not involved
data = data.drop('ID', axis=1)
data.shape

(3000, 29)

In [7]:
#explore missing values
data.isna().sum()

TARGET               0
DerogCnt             0
CollectCnt           0
BanruptcyInd         0
InqCnt06             0
InqTimeLast        188
InqFinanceCnt24      0
TLTimeFirst          0
TLTimeLast           0
TLCnt03              0
TLCnt12              0
TLCnt24              0
TLCnt                3
TLSum               40
TLMaxSum            40
TLSatCnt             4
TLDel60Cnt           0
TLBadCnt24           0
TL75UtilCnt         99
TL50UtilCnt         99
TLBalHCPct          41
TLSatPct             4
TLDel3060Cnt24       0
TLDel90Cnt24         0
TLDel60CntAll        0
TLOpenPct            3
TLBadDerogCnt        0
TLDel60Cnt24         0
TLOpen24Pct          3
dtype: int64

In [8]:
#filling missing with mean
data = data.fillna(data.mean())

In [9]:
#do double check
data.isna().sum()

TARGET             0
DerogCnt           0
CollectCnt         0
BanruptcyInd       0
InqCnt06           0
InqTimeLast        0
InqFinanceCnt24    0
TLTimeFirst        0
TLTimeLast         0
TLCnt03            0
TLCnt12            0
TLCnt24            0
TLCnt              0
TLSum              0
TLMaxSum           0
TLSatCnt           0
TLDel60Cnt         0
TLBadCnt24         0
TL75UtilCnt        0
TL50UtilCnt        0
TLBalHCPct         0
TLSatPct           0
TLDel3060Cnt24     0
TLDel90Cnt24       0
TLDel60CntAll      0
TLOpenPct          0
TLBadDerogCnt      0
TLDel60Cnt24       0
TLOpen24Pct        0
dtype: int64

Train Test Split

In [10]:
y = data.iloc[:,0].values
X = data.iloc[:,1:28].values

In [11]:
#splitting dataset into training and test(in ratio 80:20)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

# Risk Model Building

In [12]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Model performance


In [13]:
print(confusion_matrix(y_test, y_pred))

[[489   9]
 [ 93   9]]


In [14]:
print(accuracy_score(y_test,y_pred))

0.83


# Writing output file


In [16]:
predictions = classifier.predict_proba(X_test)
predictions

array([[0.07521274, 0.92478726],
       [0.92015909, 0.07984091],
       [0.66526717, 0.33473283],
       ...,
       [0.76818301, 0.23181699],
       [0.63199333, 0.36800667],
       [0.92611491, 0.07388509]])

In [18]:
#writing model output file

df_prediction_prob =pd.DataFrame(predictions, columns=['prob_0','prob_1'])
df_prediction_target = pd.DataFrame(classifier.predict(X_test), columns = ['predicted_TARGET'])
df_test_dataset =pd.DataFrame(y_test, columns=['Actual Outcome'])

dfx = pd.concat([df_test_dataset, df_prediction_prob, df_prediction_target], axis=1)
dfx.to_csv("/content/drive/My Drive/Credit Scoring Project/Model_Prediction.xlsx", sep='-', encoding='UTF8')

dfx.head()

Unnamed: 0,Actual Outcome,prob_0,prob_1,predicted_TARGET
0,1,0.075213,0.924787,1
1,0,0.920159,0.079841,0
2,0,0.665267,0.334733,0
3,0,0.944314,0.055686,0
4,0,0.877778,0.122222,0
