In [1]:
# import the relevant libraries
import pandas as pd
import numpy as np

In [2]:
# load the preprocessed CSV data
data_preprocessed = pd.read_csv('Loan_Status_Preprocessed.csv')

In [3]:
# eyeball the data
data_preprocessed.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1,1,0,4583.0,1508.0,128.0,360.0,1,1,0
1,1,1,0,1,1,3000.0,0.0,66.0,360.0,1,3,1
2,1,1,0,0,0,2583.0,2358.0,120.0,360.0,1,3,1
3,1,0,0,1,0,6000.0,0.0,141.0,360.0,1,3,1
4,1,1,2,1,1,5417.0,4196.0,267.0,360.0,1,3,1


In [4]:
# checkpoint
data_preprocessed_1 = data_preprocessed.copy()

In [5]:
# separate the inputs: keep every column except the last one,
# which is the 'Loan_Status' target (see the column order displayed by head() above)
unscaled_inputs = data_preprocessed_1.iloc[:,:-1]

In [6]:
# standardize the inputs

from sklearn.preprocessing import StandardScaler

# define scaler as an object
# NOTE(review): this plain StandardScaler is never fitted or used - the name
# loan_status_scaler is rebound to a CustomScaler instance further down,
# before anything calls fit/transform on it
loan_status_scaler = StandardScaler()

In [7]:
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is the StandardScaler class itself,
# so you can imagine that the Custom Scaler is built on it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    The selected columns are scaled with a regular StandardScaler; every other
    column is passed through unchanged. The result is a DataFrame with the same
    columns, in the same order, and with the same index as the input.

    Parameters
    ----------
    columns : list of str
        Names of the columns that should be standardized.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted on ``columns`` only.
    mean_ : pandas.Series or None
        Per-column mean of the scaled columns (None before ``fit``).
    var_ : pandas.Series or None
        Per-column population variance of the scaled columns (None before ``fit``).
    """

    def __init__(self,columns):
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler()
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Learn the mean and variance of the selected columns of ``X``.

        ``X`` must be a DataFrame containing every name in ``self.columns``.
        ``y`` is accepted for pipeline compatibility and only forwarded to the
        underlying scaler. Returns ``self``.
        """
        self.scaler.fit(X[self.columns], y)

        # Take the statistics from the fitted scaler rather than calling
        # np.mean / np.var on the DataFrame: those forward axis=None to pandas,
        # whose meaning differs between pandas versions (per-column result vs.
        # a single value over the whole frame). This keeps them per-column and
        # guaranteed to match what transform() actually applies.
        self.mean_ = pd.Series(self.scaler.mean_, index=self.columns)
        self.var_ = pd.Series(self.scaler.var_, index=self.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns standardized.

        ``X`` must be a DataFrame containing every name in ``self.columns``.
        ``y`` is ignored. Column order and row index of ``X`` are preserved.
        """
        # record the initial order of the columns
        init_col_order = X.columns

        # scale the chosen columns; reuse X's index so the scaled rows line up
        # with the untouched ones below. Without it the new frame would get a
        # fresh 0..n-1 index and concat(axis=1) would misalign rows (producing
        # NaNs) whenever X does not have a default index, e.g. a shuffled subset.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # everything that was not selected for scaling stays as it is
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]

        # stitch both parts together and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [8]:
# check what are all columns that we've got
unscaled_inputs.columns.values

array(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype=object)

In [9]:
# columns not to scale
columns_to_omit = ['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area',
       'Credit_History']

In [10]:
# columns to scale: every input column that was not explicitly omitted,
# kept in the order in which it appears in unscaled_inputs
columns_to_scale = [
    column_name
    for column_name in unscaled_inputs.columns.values
    if column_name not in columns_to_omit
]

In [11]:
# declare a scaler object, specifying the columns you want to scale
loan_status_scaler = CustomScaler(columns_to_scale)

In [12]:
# fit the data (calculate mean and standard deviation); they are automatically stored inside the object 
loan_status_scaler.fit(unscaled_inputs)

CustomScaler(columns=['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                      'Loan_Amount_Term'])

In [13]:
# standardizes the data, using the transform method 
# in the last line, we fitted the data - in other words
# we found the internal parameters of a model that will be used to transform data. 
# transforming applies these parameters to our data
# note that when you get new data, you can call loan_status_scaler.transform on it to scale it with the same fitted parameters
scaled_inputs = loan_status_scaler.transform(unscaled_inputs)

In [14]:
# the scaled_inputs are a pandas DataFrame, because CustomScaler.transform rebuilds one (a plain StandardScaler would return an ndarray by default)
scaled_inputs

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,1,1,0,-0.137970,-0.027952,-0.208089,0.275542,1,1
1,1,1,0,1,1,-0.417536,-0.604633,-0.979001,0.275542,1,3
2,1,1,0,0,0,-0.491180,0.297100,-0.307562,0.275542,1,3
3,1,0,0,1,0,0.112280,-0.604633,-0.046446,0.275542,1,3
4,1,1,2,1,1,0.009319,0.999978,1.520245,0.275542,1,3
...,...,...,...,...,...,...,...,...,...,...,...
475,2,0,0,1,0,-0.435196,-0.604633,-0.916831,0.275542,1,1
476,1,1,3,1,0,-0.222210,-0.604633,-1.302286,-2.487549,1,1
477,1,1,1,1,0,0.478206,-0.512854,1.346168,0.275542,1,3
478,1,1,2,1,0,0.391846,-0.604633,0.525520,0.275542,1,3


In [15]:
# check the shape of the inputs
scaled_inputs.shape

(480, 11)

In [16]:
# create the targets from the same checkpoint frame the inputs were taken from,
# so inputs and targets always come from one source and stay row-aligned
targets = data_preprocessed_1['Loan_Status'].to_numpy()

In [17]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

In [18]:
# check how this method works
train_test_split(scaled_inputs, targets)

[     Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \
 260       1        0           0          1              0        -0.429015   
 254       1        1           1          1              1         0.583110   
 113       2        0           0          1              0         0.818701   
 473       1        1           2          0              0        -0.243226   
 129       1        1           0          1              0        -0.516611   
 ..      ...      ...         ...        ...            ...              ...   
 386       2        1           1          1              0        -0.359962   
 455       2        1           0          1              0        -0.388219   
 155       1        1           1          0              0        -0.488178   
 18        1        1           0          0              0        -0.488178   
 237       1        1           1          1              0        -0.591669   
 
      CoapplicantIncome  LoanAmount  L

In [19]:
# split inputs and targets into train and test parts:
# 20% of the rows are held out for testing, and the seed is fixed so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=20,
)

In [20]:
# check the shape of the train inputs and targets
print (x_train.shape, y_train.shape)

(384, 11) (384,)


In [21]:
# check the shape of the test inputs and targets
print (x_test.shape, y_test.shape)

(96, 11) (96,)


In [22]:
# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

In [23]:
# create a logistic regression object
reg = LogisticRegression()

In [24]:
# fit our train inputs
# that is basically the whole training part of the machine learning
reg.fit(x_train,y_train)

LogisticRegression()

In [25]:
# assess the train accuracy of the model
reg.score(x_train,y_train)

0.796875

In [26]:
# get the intercept (bias) of our model
reg.intercept_

array([-2.7026379])

In [27]:
# get the coefficients (weights) of our model
reg.coef_

array([[-0.02904011,  0.56947128,  0.03090802,  0.44106917,  0.0290772 ,
         0.24142646,  0.14249759, -0.33451535,  0.00475714,  2.83939819,
         0.20079747]])

In [28]:
# check what were the names of our columns
unscaled_inputs.columns.values

array(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype=object)

In [29]:
# save the names of the columns in an ad-hoc variable
feature_name = unscaled_inputs.columns.values

In [30]:
# build a summary table with one row per feature: its name and its fitted coefficient
# (the coefficients from this table will be exported later and will be used in Tableau)
# reg.coef_ is a single row of weights, so it is flattened to fill one vertical column
summary_table = pd.DataFrame({
    'Feature name': feature_name,
    'Coefficient': reg.coef_.ravel(),
})

# display the summary table
summary_table

Unnamed: 0,Feature name,Coefficient
0,Gender,-0.02904
1,Married,0.569471
2,Dependents,0.030908
3,Education,0.441069
4,Self_Employed,0.029077
5,ApplicantIncome,0.241426
6,CoapplicantIncome,0.142498
7,LoanAmount,-0.334515
8,Loan_Amount_Term,0.004757
9,Credit_History,2.839398


In [31]:
# move the intercept to the top of the summary table
# the table is rebuilt from scratch instead of shifting the index in place,
# so re-running this cell always gives the same result
# (no second 'Intercept' row and no repeated index shift)

# a one-row frame holding the intercept (bias) of the model
intercept_row = pd.DataFrame({
    'Feature name': ['Intercept'],
    'Coefficient': [reg.intercept_[0]],
})

# the feature rows only: drop an 'Intercept' row left by an earlier run of this cell
# and keep just the two base columns so both parts have the same layout
feature_rows = summary_table.loc[
    summary_table['Feature name'] != 'Intercept',
    ['Feature name', 'Coefficient'],
]

# intercept first (index 0), features after it (index 1, 2, ...)
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-2.702638
1,Gender,-0.02904
2,Married,0.569471
3,Dependents,0.030908
4,Education,0.441069
5,Self_Employed,0.029077
6,ApplicantIncome,0.241426
7,CoapplicantIncome,0.142498
8,LoanAmount,-0.334515
9,Loan_Amount_Term,0.004757


In [32]:
# create a new column called 'Odds_ratio' which holds exp(coefficient) for each row of the table
# (for the 'Intercept' row this value is the baseline odds rather than an odds ratio)
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [33]:
# display the df
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-2.702638,0.067028
1,Gender,-0.02904,0.971378
2,Married,0.569471,1.767332
3,Dependents,0.030908,1.031391
4,Education,0.441069,1.554368
5,Self_Employed,0.029077,1.029504
6,ApplicantIncome,0.241426,1.273064
7,CoapplicantIncome,0.142498,1.15315
8,LoanAmount,-0.334515,0.715685
9,Loan_Amount_Term,0.004757,1.004768


In [34]:
# sort the table according to odds ratio
# note that by default, the sort_values method sorts values by 'ascending'
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
10,Credit_History,2.839398,17.105468
2,Married,0.569471,1.767332
4,Education,0.441069,1.554368
6,ApplicantIncome,0.241426,1.273064
11,Property_Area,0.200797,1.222377
7,CoapplicantIncome,0.142498,1.15315
3,Dependents,0.030908,1.031391
5,Self_Employed,0.029077,1.029504
9,Loan_Amount_Term,0.004757,1.004768
1,Gender,-0.02904,0.971378


In [35]:
# assess the test accuracy of the model
reg.score(x_test,y_test)

0.8541666666666666

In [36]:
# find the predicted probabilities of each class
# the first column shows the probability of a particular observation to be 0, while the second one - to be 1
predicted_proba = reg.predict_proba(x_test)

# let's check that out
predicted_proba

array([[0.244021  , 0.755979  ],
       [0.14032632, 0.85967368],
       [0.79223298, 0.20776702],
       [0.21606896, 0.78393104],
       [0.15642149, 0.84357851],
       [0.30508484, 0.69491516],
       [0.2149102 , 0.7850898 ],
       [0.32630572, 0.67369428],
       [0.73407785, 0.26592215],
       [0.17618908, 0.82381092],
       [0.13772374, 0.86227626],
       [0.1976835 , 0.8023165 ],
       [0.88334949, 0.11665051],
       [0.17583229, 0.82416771],
       [0.78729771, 0.21270229],
       [0.89039708, 0.10960292],
       [0.3225706 , 0.6774294 ],
       [0.21581389, 0.78418611],
       [0.86182849, 0.13817151],
       [0.28473154, 0.71526846],
       [0.35642194, 0.64357806],
       [0.25571926, 0.74428074],
       [0.22783014, 0.77216986],
       [0.23069265, 0.76930735],
       [0.17275637, 0.82724363],
       [0.83135581, 0.16864419],
       [0.78358636, 0.21641364],
       [0.19142455, 0.80857545],
       [0.26364114, 0.73635886],
       [0.2129003 , 0.7870997 ],
       [0.

In [37]:
predicted_proba.shape

(96, 2)

In [38]:
# select ONLY the probabilities referring to 1s
predicted_proba[:,1]

array([0.755979  , 0.85967368, 0.20776702, 0.78393104, 0.84357851,
       0.69491516, 0.7850898 , 0.67369428, 0.26592215, 0.82381092,
       0.86227626, 0.8023165 , 0.11665051, 0.82416771, 0.21270229,
       0.10960292, 0.6774294 , 0.78418611, 0.13817151, 0.71526846,
       0.64357806, 0.74428074, 0.77216986, 0.76930735, 0.82724363,
       0.16864419, 0.21641364, 0.80857545, 0.73635886, 0.7870997 ,
       0.73587865, 0.86097627, 0.64203208, 0.85957737, 0.75336821,
       0.81033837, 0.11464573, 0.8069425 , 0.60388611, 0.52818276,
       0.25764725, 0.77097902, 0.11719075, 0.70951533, 0.86301672,
       0.82091049, 0.76559076, 0.67605548, 0.8316745 , 0.79520032,
       0.80998676, 0.77932523, 0.80011036, 0.81729675, 0.76891687,
       0.65759703, 0.79329937, 0.72548159, 0.83153991, 0.79876205,
       0.76263181, 0.70189183, 0.72922155, 0.65470495, 0.71250991,
       0.85791002, 0.71677962, 0.84001495, 0.86304764, 0.95200392,
       0.22159229, 0.79888479, 0.6850495 , 0.69356831, 0.81008

In [39]:
# import the relevant module
import pickle

In [40]:
# pickle the model file:
# serialize the fitted LogisticRegression object (reg) into a binary file
# called 'model' in the current working directory, overwriting it if it exists
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [41]:
# pickle the scaler file:
# serialize the fitted CustomScaler object into a binary file called 'scaler'
# NOTE(review): pickle stores a class by reference, not its code, so whatever loads
# 'scaler' needs a matching CustomScaler definition available - confirm the consumer has it
with open('scaler','wb') as file:
    pickle.dump(loan_status_scaler, file)