In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()

## Import the Data

In [114]:
raw_data = pd.read_csv('Bank_data.csv')
raw_data.head()
#noticed that 'y' requires a dummy

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.12,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no


In [36]:
# change 'y' to dummy variable
data = raw_data.copy()
data['y'] = data['y'].map({'yes':1,'no':0})

## Plot the data

In [37]:
# mark the target
# we want the effect of duration on 'y'
y = data['y']
x1 = data['duration']
data.describe()

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
count,518.0,518.0,518.0,518.0,518.0,518.0,518.0,518.0
mean,258.5,2.835776,0.034749,0.266409,0.388031,0.127413,382.177606,0.5
std,149.677988,1.876903,0.183321,0.442508,0.814527,0.333758,344.29599,0.500483
min,0.0,0.635,0.0,0.0,0.0,0.0,9.0,0.0
25%,129.25,1.04275,0.0,0.0,0.0,0.0,155.0,0.0
50%,258.5,1.466,0.0,0.0,0.0,0.0,266.5,0.5
75%,387.75,4.9565,0.0,1.0,0.0,0.0,482.75,1.0
max,517.0,4.97,1.0,1.0,5.0,1.0,2653.0,1.0


## Create a logistic regression for dataset

In [39]:
# add constant to independent variables
x = sm.add_constant(x1)
# declare the regressor
reg_duration = sm.Logit(y,x)
# create the regression
result_duration = reg_duration.fit()

Optimization terminated successfully.
         Current function value: 0.546118
         Iterations 7


In [40]:
result_duration.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,516.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,0.2121
Time:,14:44:54,Log-Likelihood:,-282.89
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,5.387e-35

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.7001,0.192,-8.863,0.000,-2.076,-1.324
duration,0.0051,0.001,9.159,0.000,0.004,0.006


In [41]:
# the Log-Likelihood is a large negative number so the variable is significant
# the LL-Null is different from the Log-Likelihood which makes it not as significant
# the LLR p-value is below 0.000 making it sigificant 
# Pseudo R-squred is more useful comparing different models

## Find the odds of duration

In [59]:
duration1 = data['duration'][1]
duration2 = data['duration'][2]
coef = .0051
odds = np.exp(coef)
odds
#with oddss this close to 1, it means that a big change in days is a factor
# as each day will only have a .005% difference

1.005113027136717

In [60]:
duration_dif = duration2-duration1
odds_duration2 = np.exp(coef*duration_dif)
odds_duration2

0.5794360361196035

In [65]:
print (
    'The odds of ' + str(data['duration'][2]) + 
    ' duration happening over ' + 
    str(data['duration'][1]) + 
    ' is ' + 
    str(odds_duration2) + '%. with a net change of '
    + str(round(odds-1,3)) + ' per day.'
)

The odds of 167.0 duration happening over 274.0 is 0.5794360361196035%. with a net change of 0.005 per day.


## Accuracy

In [69]:
#lets test the accuracy of our model

cm = result_duration.pred_table()
cm
cm_df = pd.DataFrame(cm,columns=['Predict 0','Predict 1'])

In [71]:
cm_df.rename(index={0:'Actual 0',1:'Actual 1'})

Unnamed: 0,Predict 0,Predict 1
Actual 0,204.0,55.0
Actual 1,104.0,155.0


In [72]:
# We have 204 that is 0 when true value was 0 and 155 that is 1 when true value was 1

In [75]:
acc_duration = round((cm[1,1]+cm[0,0])/cm.sum(),3)
acc_duration

0.693

In [76]:
# the model is 69.3% accurate

# Testing the Model against Test Dataset

### Loading in the test data

In [77]:
test_raw = pd.read_csv('Bank_data_testing.csv')

In [115]:
# 'y' needs a dummy variable
test_data = test_raw.copy()
test_data['y'] = test_data['y'].map({'yes':1,'no':0})

### Determine the independent and Dependent Variables for Test Data

In [98]:
y_actual = test_data['y']
x_actual = sm.add_constant(test_data['duration'])

In [106]:
predicted_values = result_duration.predict(x_actual)
cm = np.histogram2d(y_actual,predicted_values,bins=np.array([0,0.55,1]))[0]

In [110]:
accuracy = (cm[0,0]+cm[1,1])/cm.sum()
missclassification = (cm[1,0]+cm[0,1])/cm.sum()

In [113]:
print(
    'Accuracy: '+str(round(accuracy,3))+
    ' and Missclassification Rate: '+str(round(missclassification,3))
)

Accuracy: 0.721 and Missclassification Rate: 0.279
