# Binary Predictors in a Logistic Regression

Using the same code as in the previous exercise, find the odds of 'duration'. 

What do they tell you?

## Import the relevant libraries

In [1]:
import numpy as np 
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Compatibility shim: define `stats.chisqprob` as a thin wrapper around the
# chi-squared survival function `stats.chi2.sf`.
# NOTE(review): presumably needed because the installed statsmodels still calls
# `scipy.stats.chisqprob`, which the installed SciPy no longer provides - confirm
# against the library versions in use and drop the shim if it is not required.
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

## Load the data

Load the ‘Bank_data.csv’ dataset.

In [2]:
# Read 'Bank_data.csv' from the current working directory and display it.
raw_data = pd.read_csv('Bank_data.csv')
raw_data

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.120,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no
...,...,...,...,...,...,...,...,...
513,513,1.334,0.0,1.0,0.0,0.0,204.0,no
514,514,0.861,0.0,0.0,2.0,1.0,806.0,yes
515,515,0.879,0.0,0.0,0.0,0.0,290.0,no
516,516,0.877,0.0,0.0,5.0,1.0,473.0,yes


In [3]:
# Build the modelling frame from the raw data without touching `raw_data`:
# drop the leftover index column and encode the target as 1 ('yes') / 0 ('no').
data = (
    raw_data
    .drop(columns=['Unnamed: 0'])
    .assign(y=lambda frame: frame['y'].map({'yes': 1, 'no': 0}))
)
data

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,0
1,0.767,0.0,0.0,2.0,1.0,274.0,1
2,4.858,0.0,1.0,0.0,0.0,167.0,0
3,4.120,0.0,0.0,0.0,0.0,686.0,1
4,4.856,0.0,1.0,0.0,0.0,157.0,0
...,...,...,...,...,...,...,...
513,1.334,0.0,1.0,0.0,0.0,204.0,0
514,0.861,0.0,0.0,2.0,1.0,806.0,1
515,0.879,0.0,0.0,0.0,0.0,290.0,0
516,0.877,0.0,0.0,5.0,1.0,473.0,1


### Declare the dependent and independent variables

Use 'duration' as the independent variable.

In [4]:
# Single predictor ('duration') and the binary target ('y', already mapped to 0/1).
x1 = data['duration']
y = data['y']

### Simple Logistic Regression

Run the regression.

In [5]:
# Add an intercept column to the predictor, then fit a logistic regression of y on it.
x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
# `results_log` holds the fitted model used by the cells below.
results_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.546118
         Iterations 7


  return ptp(axis=axis, out=out, **kwargs)


In [6]:
# Coefficient table and fit statistics for the single-predictor model.
results_log.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,516.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 08 Jun 2020",Pseudo R-squ.:,0.2121
Time:,20:18:47,Log-Likelihood:,-282.89
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,5.387e-35

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.7001,0.192,-8.863,0.000,-2.076,-1.324
duration,0.0051,0.001,9.159,0.000,0.004,0.006


### Find the odds of duration

In [7]:
# Odds ratio for a one-unit increase in 'duration': exp(coefficient).
# The coefficient is read from the fitted model instead of retyping the rounded
# value (0.0051) from the summary table, so the result stays in sync with the fit.
np.exp(results_log.params['duration'])

1.005113027136717

$\text{odds}(\text{duration}_2) = 1.005 \times \text{odds}(\text{duration}_1)$, where $\text{duration}_2 = \text{duration}_1 + 1$

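The odds ratio for duration is very close to 1. This tells us that although duration is a significant predictor, a change of 1 day would barely affect the odds: each additional unit of duration multiplies them by only about 1.005, i.e. increases them by roughly 0.5%.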

Note that we could have inferred that from the coefficient itself.

Finally, note that the data is not standardized (scaled) and duration is a feature of a relatively big order of magnitude.

## Accuracy 

In [8]:
# Fitted values of the model (called without new data); the output shows them as probabilities.
results_log.predict()

array([0.24936904, 0.42571348, 0.30019503, 0.85898342, 0.28956627,
       0.2580803 , 0.21914028, 0.16614452, 0.86976608, 0.31987434,
       0.55276776, 0.22532533, 0.30234734, 0.39480294, 0.35411888,
       0.489064  , 0.29167414, 0.25225084, 0.50567672, 0.82802726,
       0.84413091, 0.5890681 , 0.45848474, 0.40215497, 0.44961416,
       0.20546539, 0.4157464 , 0.42321571, 0.58286683, 0.2580803 ,
       0.17928751, 0.2580803 , 0.85711543, 0.43322907, 0.293791  ,
       0.32658401, 0.98373368, 0.21652721, 0.99665611, 0.62920727,
       0.2639969 , 0.49417474, 0.30885632, 0.27711089, 0.28746746,
       0.2474601 , 0.19643708, 0.30342677, 0.30559213, 0.4712014 ,
       0.87376626, 0.20132396, 0.22177596, 0.28642146, 0.25612742,
       0.49417474, 0.27916371, 0.20214723, 0.88872861, 0.53501481,
       0.23806331, 0.24556099, 0.90211616, 0.40092648, 0.51078751,
       0.45975425, 0.28019361, 0.30450837, 0.87432905, 0.23713725,
       0.18231618, 0.3706541 , 0.4724754 , 0.38144917, 0.54009

In [9]:
# Observed 0/1 targets as a NumPy array, for comparison with the predictions above.
np.array(data['y'])

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,

In [10]:
# 2x2 table of actual vs. predicted classes for the fitted model
# (labelled as rows = actual, columns = predicted in the next cell).
results_log.pred_table()

array([[204.,  55.],
       [104., 155.]])

#### Confusion matrix

In [11]:
# Wrap the prediction table in a labelled DataFrame: rows are the actual
# classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=["Actual 0", "Actual 1"],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,204.0,55.0
Actual 1,104.0,155.0


# Considering other parameters

In [12]:
# Use every column except the target as a predictor.
x1 = data.drop(columns=['y'])
y = data['y']

In [13]:
# Refit the logistic regression with all predictors plus an intercept.
# Note: this overwrites `x`, `reg_log` and `results_log` from the single-predictor model.
x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
results_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.335942
         Iterations 7


In [27]:
# Alternative summary layout; the output below additionally reports AIC and BIC.
results_log.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.515
Dependent Variable:,y,AIC:,362.0356
Date:,2020-06-08 20:49,BIC:,391.7855
No. Observations:,518,Log-Likelihood:,-174.02
Df Model:,6,LL-Null:,-359.05
Df Residuals:,511,LLR p-value:,7.5788e-77
Converged:,1.0000,Scale:,1.0
No. Iterations:,7.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-0.1385,0.3395,-0.4080,0.6833,-0.8039,0.5269
interest_rate,-0.7802,0.0921,-8.4712,0.0000,-0.9607,-0.5997
credit,2.4028,1.0895,2.2053,0.0274,0.2673,4.5382
march,-1.8097,0.3315,-5.4585,0.0000,-2.4594,-1.1599
may,0.1946,0.2293,0.8488,0.3960,-0.2548,0.6440
previous,1.2746,0.5831,2.1861,0.0288,0.1319,2.4174
duration,0.0070,0.0007,9.3864,0.0000,0.0055,0.0084


In [15]:
# Labelled confusion matrix for the full model on the training data:
# rows are the actual classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=["Actual 0", "Actual 1"],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,220.0,39.0
Actual 1,31.0,228.0


In [16]:
# Training accuracy = correctly classified observations (main diagonal of the
# confusion matrix) divided by all observations.
cm = cm_df.to_numpy(copy=True)
accuracy_train = np.trace(cm) / cm.sum()
accuracy_train

0.8648648648648649

## Testing model

In [17]:
# Read the held-out set 'Bank-data-testing.csv' from the current working directory and display it.
test = pd.read_csv('Bank-data-testing.csv')
test

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.313,0.0,1.0,0.0,0.0,487.0,no
1,1,4.961,0.0,0.0,0.0,0.0,132.0,no
2,2,4.856,0.0,1.0,0.0,0.0,92.0,no
3,3,4.120,0.0,0.0,0.0,0.0,1468.0,yes
4,4,4.963,0.0,0.0,0.0,0.0,36.0,no
...,...,...,...,...,...,...,...,...
217,217,4.963,0.0,0.0,0.0,0.0,458.0,yes
218,218,1.264,0.0,1.0,1.0,0.0,397.0,yes
219,219,1.281,0.0,1.0,0.0,0.0,34.0,no
220,220,0.739,0.0,0.0,2.0,0.0,233.0,no


In [18]:
# Apply the same preprocessing as for the training data: drop the leftover
# index column and encode the target as 1 ('yes') / 0 ('no').
test = (
    test
    .drop(columns=['Unnamed: 0'])
    .assign(y=lambda frame: frame['y'].map({'yes': 1, 'no': 0}))
)
test

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.313,0.0,1.0,0.0,0.0,487.0,0
1,4.961,0.0,0.0,0.0,0.0,132.0,0
2,4.856,0.0,1.0,0.0,0.0,92.0,0
3,4.120,0.0,0.0,0.0,0.0,1468.0,1
4,4.963,0.0,0.0,0.0,0.0,36.0,0
...,...,...,...,...,...,...,...
217,4.963,0.0,0.0,0.0,0.0,458.0,1
218,1.264,0.0,1.0,1.0,0.0,397.0,1
219,1.281,0.0,1.0,0.0,0.0,34.0,0
220,0.739,0.0,0.0,2.0,0.0,233.0,0


In [19]:
# Display the training design matrix so its columns can be compared with the test set.
x

Unnamed: 0,const,interest_rate,credit,march,may,previous,duration
0,1.0,1.334,0.0,1.0,0.0,0.0,117.0
1,1.0,0.767,0.0,0.0,2.0,1.0,274.0
2,1.0,4.858,0.0,1.0,0.0,0.0,167.0
3,1.0,4.120,0.0,0.0,0.0,0.0,686.0
4,1.0,4.856,0.0,1.0,0.0,0.0,157.0
...,...,...,...,...,...,...,...
513,1.0,1.334,0.0,1.0,0.0,0.0,204.0
514,1.0,0.861,0.0,0.0,2.0,1.0,806.0
515,1.0,0.879,0.0,0.0,0.0,0.0,290.0
516,1.0,0.877,0.0,0.0,5.0,1.0,473.0


In [23]:
# Split the held-out set into observed targets and predictors.
test_actual = test['y']
test_data = test.drop(['y'],axis=1)
# has_constant='add' always inserts the intercept column; the default ('skip')
# silently omits it when some test column happens to be constant, which would
# leave the test matrix without the 'const' column the model was fitted with.
test_data = sm.add_constant(test_data, has_constant='add')
# Put the columns in the same order as the training design matrix `x`, so each
# fitted coefficient is multiplied by the matching feature. A missing column
# raises a KeyError here rather than producing misaligned predictions.
test_data = test_data[x.columns]
test_data

Unnamed: 0,const,interest_rate,credit,march,may,previous,duration
0,1.0,1.313,0.0,1.0,0.0,0.0,487.0
1,1.0,4.961,0.0,0.0,0.0,0.0,132.0
2,1.0,4.856,0.0,1.0,0.0,0.0,92.0
3,1.0,4.120,0.0,0.0,0.0,0.0,1468.0
4,1.0,4.963,0.0,0.0,0.0,0.0,36.0
...,...,...,...,...,...,...,...
217,1.0,4.963,0.0,0.0,0.0,0.0,458.0
218,1.0,1.264,0.0,1.0,1.0,0.0,397.0
219,1.0,1.281,0.0,1.0,0.0,0.0,34.0
220,1.0,0.739,0.0,0.0,2.0,0.0,233.0


In [24]:
def confusion_matrix(data,actual_values,model,threshold=0.5) :
    """Return the 2x2 confusion matrix and accuracy of a binary model on `data`.

    Parameters
    ----------
    data : array-like or DataFrame
        Inputs passed unchanged to ``model.predict``.
    actual_values : array-like
        Observed targets, expected to be 0 or 1.
    model : object
        Any object exposing ``predict(data)`` that returns one value in
        [0, 1] per observation (e.g. a fitted statsmodels Logit result).
    threshold : float, default 0.5
        Cut-off strictly between 0 and 1. Predictions below it count as
        class 0, predictions at or above it count as class 1.

    Returns
    -------
    (cm, accuracy) : tuple
        ``cm`` is a 2x2 float array with rows = actual class and
        columns = predicted class. ``accuracy`` is the share of observations
        on the main diagonal, or NaN when there are no observations.

    Raises
    ------
    ValueError
        If `threshold` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    pred_values = model.predict(data)
    # Bin edges [0, threshold, 1] give two bins per axis: [0, threshold) and
    # [threshold, 1]. Actual 0/1 values and predicted values share these edges.
    bins = np.array([0,threshold,1])
    cm = np.histogram2d(actual_values,pred_values,bins=bins)[0]
    total = cm.sum()
    if total == 0:
        # No observations: accuracy is undefined, avoid a 0/0 division warning.
        return cm,float('nan')
    accuracy = (cm[0,0]+cm[1,1])/total
    return cm,accuracy

In [25]:
# Evaluate the fitted model on the held-out set.
# `confusion_matrix` returns a tuple, so `cm[0]` is the 2x2 table and `cm[1]` the accuracy.
cm = confusion_matrix(test_data,test_actual,results_log)
cm

(array([[94., 17.],
        [12., 99.]]), 0.8693693693693694)

In [26]:
# Labelled confusion matrix for the held-out set (`cm[0]` is the 2x2 table):
# rows are the actual classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    cm[0],
    index=["Actual 0", "Actual 1"],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,94.0,17.0
Actual 1,12.0,99.0
