##################### REGRESSION ANALYSIS #############################
## Regression (literally, "relationship") is a statistical method
## for estimating the relationship between a response variable &
## one or more independent variables.
## We model the data according to the structure of the variable we want
## to predict & estimate the relationship with the help of several
## independent variables.
## Since correlation doesn't necessarily mean causation,
## we estimate the regression coefficients & interpret them to see
## the relationship between the variables.
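## For reference, the logistic model fitted below has the form
##   log(p / (1 - p)) = B0 + B1*X1 + ... + Bk*Xk
## where p = P(Default = 1), so each coefficient is the change in the
## log odds of default per one-unit change in its predictor.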
############################# ASSUMPTIONS #############################
## A MODEL IS BOUND TO FOLLOW THESE ASSUMPTIONS;
## IF NOT, WE CANNOT FIT THE REGRESSION.
## The model must be linear in parameters (betas with power == 1)
## Independent variables are non-stochastic ..... KNOWN IN ADVANCE
## Mean of the error terms == 0
## Variance of the error term == constant (sigma^2):
##    HOMOSCEDASTICITY (homo = equal, scedastic = variance),
##    i.e. no funnel shape in the residuals
## The error term follows Ui ~ N(0, sigma^2)
## Covariance between the independent variables & the error term == 0
##    (uncorrelated)
## For two different observations, the corresponding error terms are
##    uncorrelated: the assumption of NO AUTOCORRELATION
## No exact linear relationship between two independent variables:
##    the assumption of NO MULTICOLLINEARITY
## (A minimal residual-plot sketch for eyeballing some of these follows.)
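## A quick way to eyeball the error-term assumptions for a fitted model,
## shown here on R's built-in cars data as a hypothetical toy example
## (a funnel shape in the first plot would suggest heteroscedasticity):
Toy_Fit <- lm(dist ~ speed, data = cars)
plot(fitted(Toy_Fit), resid(Toy_Fit),
     xlab = "Fitted values", ylab = "Residuals")  # constant spread?
qqnorm(resid(Toy_Fit)); qqline(resid(Toy_Fit))    # normality of errors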
####################### BANKING CASE STUDY #######################
# Business Problem  : To predict bank loan default
# Business Solution : To build a logistic regression model
# The data consists of 1000 loan applications.
# The Default variable is the binary target variable,
# wherein 1 = Default & 0 = No-Default.
# About 30 % of the applications are defaulters
# & the remaining ~70 % are non-defaulters.
## We're going to use logistic regression to predict bank loan default
## in this case study.
getwd()
# Loading the dataset
Loan_Data <- read.csv("E:/R Programming/Data Analysis using R CASE STUDIES/Dataset/Chapter_02_loan_default_dataset.csv",
                      header = TRUE, sep = ",")
# read.csv() already returns a data frame, so this copy is kept only
# for naming consistency
Loan_Data_DF <- data.frame(Loan_Data)
# Performing EDA
library(dplyr)
# Both of the functions below show the structure of the data
glimpse(Loan_Data_DF)
str(Loan_Data_DF)
# dim() shows the dimensions (rows x columns) of the dataset
dim(Loan_Data_DF)
# All of the variable names
names(Loan_Data_DF)
## The first variable, Default, is the one of interest
# The first & last 3 rows across all variables
head(Loan_Data_DF, n = 3)
tail(Loan_Data_DF, n = 3)
# Descriptive stats for Age
summary(Loan_Data_DF$Age)
# Are there any missing values in the dataset?
sum(is.na(Loan_Data_DF))
##### THERE ARE NO NA VALUES PRESENT IN OUR DATASET
# Had any NA values been present, we would have had to impute them
# With no missing values, we can proceed with the EDA
# To see the correlation among variables
options(scipen = 999)
# This disables scientific notation, so small figures print in full
# rather than with an "e" exponent
attach(Loan_Data_DF)
# Named Cor_Term rather than "cor" to avoid shadowing base::cor()
Cor_Term <- cor.test(Default, Term, method = "pearson")
Cor_Term
## The test suggests the correlation != 0 (estimate ~ +0.34),
## so there is a relationship between the two:
## a positive correlation, i.e. as Term increases,
## defaults tend to increase as well
detach(Loan_Data_DF)
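## To scan all pairwise correlations at once, a minimal sketch
## (restricting to numeric columns so cor() is well-defined):
Numeric_Cols <- sapply(Loan_Data_DF, is.numeric)
round(cor(Loan_Data_DF[, Numeric_Cols]), 2)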
###### SEEING PATTERNS IN GRAPHS
library(ggplot2)
glimpse(Loan_Data_DF)
names(Loan_Data_DF)
# factor(Default) gives a discrete colour scale rather than a gradient
Amount_CR <- ggplot(Loan_Data_DF, aes(Amount, Credit_score)) +
  geom_point(aes(colour = factor(Default)))
Amount_CR
## A scatter plot of Amount vs Credit_score,
## colour-coded by Default.
## Inference:
## Almost all people with a credit score < 600 appear to be
## defaulting on their loans.
# geom_col() is the current idiom for geom_histogram(stat = "identity")
Amount_Age <- ggplot(Loan_Data_DF, aes(Age, Amount)) +
  geom_col(fill = "red")
Amount_Age
## A bar chart of loan Amount applied for, by Age.
## Inference: people in the age range 25-35 apply
## for loan amounts > $50,000
######## BUILDING THE LOGISTIC MODEL
# USING GLM
Full_Model <- glm(Default ~ ., data = Loan_Data_DF,
                  family = binomial(link = logit))
summary(Full_Model)
## OUR HYPOTHESES:
## H0 : the variable has no significant impact on Default, against
## Ha : the variable has a significant impact on Default
## For all p-values <= alpha (0.05), WE REJECT H0
## For all p-values >  alpha (0.05), WE FAIL TO REJECT H0
## Our results show that ONLY
## Checking_amount, Term, Credit_score, Saving_amount & Age
## HAVE A SIGNIFICANT IMPACT ON THE TARGET VARIABLE == Default.
## SO WE REFIT THE MODEL WITH THE INSIGNIFICANT VARIABLES REMOVED
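## A small sketch to pull the significant terms programmatically from
## the fitted model (assumes summary.glm's usual layout, with p-values
## in column 4; the intercept will also show up if significant):
Coef_Table <- summary(Full_Model)$coefficients
rownames(Coef_Table)[Coef_Table[, 4] <= 0.05]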
REDUCED_MODEL_A <- glm(Default ~ Checking_amount + Term +
                         Credit_score + Saving_amount + Age,
                       data = Loan_Data_DF, family = binomial(link = logit))
summary(REDUCED_MODEL_A)
## WE CAN SEE THAT OUR MODEL NOW CONTAINS ONLY PREDICTOR VARIABLES
## WHICH HAVE A SIGNIFICANT IMPACT ON THE TARGET VARIABLE.
## ALL OF THE VARIABLES ARE STATISTICALLY SIGNIFICANT (P-VALUE <= 0.05)
## INFERENCES:
## CHECKING AMOUNT:
## FOR A ONE-UNIT INCREASE IN CHECKING AMOUNT, THE LOG ODDS
## OF DEFAULTING VS NOT DEFAULTING DECREASE BY 0.004.
## THIS SHOWS A NEGATIVE RELATIONSHIP BETWEEN PREDICTOR & RESPONSE:
## IF THE CHECKING AMOUNT IS HIGH, THE PROBABILITY OF DEFAULT
## DECREASES, & VICE VERSA.
## TERM:
## FOR A ONE-UNIT INCREASE IN TERM, THE LOG ODDS
## OF DEFAULTING VS NOT DEFAULTING INCREASE BY 0.174.
## THIS SHOWS A POSITIVE RELATIONSHIP BETWEEN PREDICTOR & RESPONSE:
## IF THE TERM INCREASES, THE PROBABILITY OF DEFAULT ALSO INCREASES,
## & VICE VERSA.
## SIMILARLY FOR THE OTHER VARIABLES.
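## Log odds are easier to communicate as odds ratios: a minimal sketch
## exponentiating the coefficients (values < 1 lower the odds of
## default), with Wald-type 95 % confidence intervals from
## confint.default():
exp(coef(REDUCED_MODEL_A))
exp(confint.default(REDUCED_MODEL_A))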
####### FOR VALIDATION PURPOSES & TO CHECK THE PREDICTIVE ACCURACY,
####### WE NOW SPLIT THE DATA INTO TRAINING & TEST DATASETS.
####### SPLITTING RATIO: TRAINING (70 %) / TEST (30 %)
?floor
# floor() rounds a figure DOWN to the nearest integer,
# so a figure of 75.84 becomes 75
Rounding_Obs <- floor(0.7 * nrow(Loan_Data_DF))
Rounding_Obs
## Since there are 1000 observations and the split is 70 : 30,
## 70 %, i.e. 700 observations, go to the TRAINING dataset
?seq_len
## seq_len(n) generates the sequence 1, 2, ..., n
set.seed(2)
Loan_Train_Index <- sample(seq_len(nrow(Loan_Data_DF)), size = Rounding_Obs)
Loan_Test_Index <- -Loan_Train_Index
######## Subsetting the training & test observations
Loan_Train_Data <- Loan_Data_DF[Loan_Train_Index, ]
Loan_Test_Data <- Loan_Data_DF[Loan_Test_Index, ]
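## A quick sanity check on the split (a minimal sketch): row counts
## & the class balance of Default in each partition
nrow(Loan_Train_Data); nrow(Loan_Test_Data)
prop.table(table(Loan_Train_Data$Default))
prop.table(table(Loan_Test_Data$Default))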
####### Building the Logistic Model using the Training Data
set.seed(2)
Train_Model <- glm(Default ~ ., data = Loan_Train_Data,
                   family = binomial(link = logit))
summary(Train_Model)
## AGAIN REPEATING THE PROCESS OF ELIMINATING INSIGNIFICANT VARIABLES
## & RERUNNING THE MODEL WITH THE REDUCED SET, WE GET:
Reduced_Train_Model <- glm(Default ~ Checking_amount + Term +
                             Credit_score + Saving_amount + Age,
                           data = Loan_Train_Data,
                           family = binomial(link = logit))
summary(Reduced_Train_Model)
######### AGAIN,
## WE CAN SEE THAT OUR MODEL NOW CONTAINS ONLY PREDICTOR VARIABLES
## WHICH HAVE A SIGNIFICANT IMPACT ON THE TARGET VARIABLE.
## ALL OF THE VARIABLES ARE STATISTICALLY SIGNIFICANT (P-VALUE <= 0.05)
## INFERENCES:
## CHECKING AMOUNT:
## FOR A ONE-UNIT INCREASE IN CHECKING AMOUNT, THE LOG ODDS
## OF DEFAULTING VS NOT DEFAULTING DECREASE BY 0.004.
## THIS SHOWS A NEGATIVE RELATIONSHIP BETWEEN PREDICTOR & RESPONSE:
## IF THE CHECKING AMOUNT IS HIGH, THE PROBABILITY OF DEFAULT
## DECREASES, & VICE VERSA.
## TERM:
## FOR A ONE-UNIT INCREASE IN TERM, THE LOG ODDS
## OF DEFAULTING VS NOT DEFAULTING INCREASE BY 0.147.
## THIS SHOWS A POSITIVE RELATIONSHIP BETWEEN PREDICTOR & RESPONSE:
## IF THE TERM INCREASES, THE PROBABILITY OF DEFAULT ALSO INCREASES,
## & VICE VERSA.
## SIMILARLY FOR THE OTHER VARIABLES.
####### NOW,
####### TO CHECK WHETHER THERE'S MULTICOLLINEARITY PRESENT IN THE
####### INDEPENDENT VARIABLES OF THE TRAINING DATA,
####### WE HAVE A SPECIAL TEST CALLED V.I.F (Variance Inflation Factor)
install.packages("car")
library(car)
vif(Reduced_Train_Model)
## For variables with VIF > 5, we say they show multicollinearity patterns.
## Since no variable in our model has a VIF > 5, all of them are good to go.
############ THERE'S NO MULTICOLLINEARITY IN OUR DATA.
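## Under the hood, the VIF for predictor j is 1 / (1 - R_j^2), where
## R_j^2 comes from regressing predictor j on the other predictors.
## A hand-rolled check for Term (car::vif on a glm works from the
## coefficient covariance, so its value may differ slightly):
R2_Term <- summary(lm(Term ~ Checking_amount + Credit_score +
                        Saving_amount + Age,
                      data = Loan_Train_Data))$r.squared
1 / (1 - R2_Term)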
########## USING THE PREDICT FUNCTION
########## We set a cut-off on the predicted probability:
########## for all cases with a predicted probability > 70 %,
########## we say there's a high chance of DEFAULT on the loan
Default_Probl <- predict(Reduced_Train_Model, Loan_Test_Data,
                         type = "response")
########## Now, generating 1 / 0 labels according to the threshold
Default_Pred <- factor(Default_Probl > 0.7, levels = c(TRUE, FALSE),
                       labels = c("1 - Default", "0 - No-Default"))
View(Default_Pred)
Default_Results <- ifelse(Default_Probl > 0.70, "Default", "No-Default")
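## The 0.7 cut-off is a modelling choice, not a given; a minimal sketch
## comparing overall accuracy at a few candidate thresholds:
for (Cutoff in c(0.3, 0.5, 0.7)) {
  Pred_Labels <- ifelse(Default_Probl > Cutoff, 1, 0)
  Acc <- mean(Pred_Labels == Loan_Test_Data$Default)
  cat("Cutoff =", Cutoff, "-> Accuracy =", round(Acc, 3), "\n")
}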
########## FOR VALIDATING THE PREDICTIONS,
########## WE USE A CONFUSION MATRIX & ROC CURVES.
########## This is done to see how accurately the model predicts
######### CONFUSION MATRIX
Confusion_Matrix <- table(Default_Pred, Loan_Test_Data$Default,
                          dnn = c("Predicted", "Actual"))
Confusion_Matrix
## DEFAULT    = +ve class
## NO-DEFAULT = -ve class
## ACTUAL (1) = +ve class
## ACTUAL (0) = -ve class
## INTERPRETATION:
############### TRUE NEGATIVE ###############
## The ACTUAL claim was that the person is not defaulting (0)
## & the PREDICTED claim was that the person is not defaulting.
## Since both of these claims belong to the -ve class,
## we say it's a "TRUE NEGATIVE" case.
## The value corresponding to the T.N case
## in our Confusion Matrix == 193.
############### FALSE NEGATIVE ###############
## The ACTUAL claim was that the person is defaulting (1)
## & the PREDICTED claim was that the person is not defaulting.
## The ACTUAL claim belongs to the +ve class,
## but the PREDICTED claim belongs to the -ve class,
## so we say it's a "FALSE NEGATIVE" case.
## The value corresponding to the F.N case
## in our Confusion Matrix == 19.
############### THIS IS A TYPE II ERROR ###############
############### FALSE POSITIVE ###############
## The ACTUAL claim was that the person is not defaulting (0)
## & the PREDICTED claim was that the person is defaulting.
## The ACTUAL claim belongs to the -ve class,
## but the PREDICTED claim belongs to the +ve class,
## so we say it's a "FALSE POSITIVE" case.
## The value corresponding to the F.P case
## in our Confusion Matrix == 3.
############### THIS IS A TYPE I ERROR ###############
############### TRUE POSITIVE ###############
## The ACTUAL claim was that the person is defaulting (1)
## & the PREDICTED claim was that the person is defaulting.
## Since both of these claims belong to the +ve class,
## we say it's a "TRUE POSITIVE" case.
## The value corresponding to the T.P case
## in our Confusion Matrix == 85.
######## CONFUSION MATRIX statistics can be calculated using:
################# CODE IS TAKEN FROM R-In-ACTION BOOK ###################
Confusion_Matrix_Perf <- function(table, n = 2){
  # Assumes the orientation produced above:
  # rows = Predicted ("1 - Default" first), columns = Actual (0 then 1)
  if(!all(dim(table) == c(2, 2)))
    stop("Must be a 2 x 2 table")
  tn <- table[2, 1]
  fp <- table[1, 1]
  fn <- table[2, 2]
  tp <- table[1, 2]
  sensitivity <- tp / (tp + fn)
  false_positive_rate <- fp / (fp + tn)
  specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  npv <- tn / (tn + fn)  # negative predictive value
  misclassification <- (fp + fn) / (tp + tn + fp + fn)
  hitrate <- (tp + tn) / (tp + tn + fp + fn)
  result <- paste("Sensitivity (TPR) = ", round(sensitivity, n),
                  "\nFalse_Positive_Rate (FPR) = ", round(false_positive_rate, n),
                  "\nSpecificity = ", round(specificity, n),
                  "\nPrecision = ", round(precision, n),
                  "\nNeg_Pred_Value = ", round(npv, n),
                  "\nMisclassification_Rate = ", round(misclassification, n),
                  "\nAccuracy = ", round(hitrate, n), "\n", sep = "")
  cat(result)
}
Confusion_Matrix_Perf(Confusion_Matrix)
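## As a cross-check, the caret package (loaded below for the ROC section)
## provides confusionMatrix(); a minimal sketch, assuming caret is
## installed & treating "1" (Default) as the positive class:
library(caret)
confusionMatrix(factor(ifelse(Default_Probl > 0.7, 1, 0), levels = c(0, 1)),
                factor(Loan_Test_Data$Default, levels = c(0, 1)),
                positive = "1")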
##################################
class(Default_Pred)
table(Default_Pred)
unique(Default_Pred)
# Converting the target to a factor so the checks below treat it as
# a categorical variable
Loan_Test_Data$Default <- as.factor(Loan_Test_Data$Default)
View(Loan_Test_Data$Default)
class(Loan_Test_Data$Default)
table(Loan_Test_Data$Default)
unique(Loan_Test_Data$Default)
########### VISUALIZING USING ROC & AUC
library(ROCR)
library(caret)
# COMPUTING the ROC curve & AUC for our model
Probl <- predict(Reduced_Train_Model, newdata = Loan_Test_Data, type = "response")
Pred <- prediction(Probl, Loan_Test_Data$Default)
Pre_AUC <- performance(Pred, measure = "tpr", x.measure = "fpr")
plot(Pre_AUC, col = "red")
## Since the curve rises quickly toward the top-left corner (TPR near
## 1.0 at a low FPR), the model fits well: a curve approaching the
## top-left represents the best performance of the model.
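## Adding the chance diagonal for reference: a model no better than
## random guessing would fall along this line
abline(a = 0, b = 1, lty = 2, col = "grey")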
AUC <- performance(Pred, measure = "auc")
AUC <- AUC@y.values[[1]]
AUC
## Area Under the Curve (AUC) values > 0.70 are generally considered
## to indicate a model with high predictive accuracy.
## In our logistic regression case study, the
## AUC value is ~ 0.98 (98 %), which indicates
## a highly accurate model.
########################## THANK YOU ####################################