In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Train Data

In [2]:
# Read the training CSV once. `data` is kept as the untouched original, while
# `train_data` is an independent copy that the following cells modify.
data = pd.read_csv('bank_data.csv')
train_data = data.copy(deep=True)
train_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.12,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no


In [4]:
# Remove the leftover CSV index column.
# The frame is derived from the untouched `data` rather than from
# `train_data` itself, so the cell is idempotent: dropping from `train_data`
# again on a re-run would raise KeyError because the column is already gone.
train_data = data.drop(columns=['Unnamed: 0'])
train_data

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,4.120,0.0,0.0,0.0,0.0,686.0,yes
4,4.856,0.0,1.0,0.0,0.0,157.0,no
...,...,...,...,...,...,...,...
513,1.334,0.0,1.0,0.0,0.0,204.0,no
514,0.861,0.0,0.0,2.0,1.0,806.0,yes
515,0.879,0.0,0.0,0.0,0.0,290.0,no
516,0.877,0.0,0.0,5.0,1.0,473.0,yes


In [5]:
# Encode the target as 1 ('yes') / 0 ('no').
# The mapping reads from the raw `data['y']`, not from `train_data['y']`, so
# re-running the cell gives the same result; mapping an already-encoded
# column through this dict would turn every value into NaN.
# Assignment aligns on the index, which `train_data` inherits from `data`.
train_data['y'] = data['y'].map({'yes': 1, 'no': 0})
train_data.head()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,0
1,0.767,0.0,0.0,2.0,1.0,274.0,1
2,4.858,0.0,1.0,0.0,0.0,167.0,0
3,4.12,0.0,0.0,0.0,0.0,686.0,1
4,4.856,0.0,1.0,0.0,0.0,157.0,0


In [7]:
# Target vector and the six candidate predictors (no intercept column yet).
y = train_data.loc[:, 'y']
x1 = train_data.loc[
    :,
    ['interest_rate', 'credit', 'march', 'may', 'previous', 'duration'],
]

In [8]:
# Design matrix for the full model: the predictors with a leading `const`
# column of ones so the fit includes an intercept.
x = sm.add_constant(x1, prepend=True)

In [9]:
# Inspect the design matrix (constant column plus the six predictors).
x

Unnamed: 0,const,interest_rate,credit,march,may,previous,duration
0,1.0,1.334,0.0,1.0,0.0,0.0,117.0
1,1.0,0.767,0.0,0.0,2.0,1.0,274.0
2,1.0,4.858,0.0,1.0,0.0,0.0,167.0
3,1.0,4.120,0.0,0.0,0.0,0.0,686.0
4,1.0,4.856,0.0,1.0,0.0,0.0,157.0
...,...,...,...,...,...,...,...
513,1.0,1.334,0.0,1.0,0.0,0.0,204.0
514,1.0,0.861,0.0,0.0,2.0,1.0,806.0
515,1.0,0.879,0.0,0.0,0.0,0.0,290.0
516,1.0,0.877,0.0,0.0,5.0,1.0,473.0


In [10]:
# Full model: logistic regression of `y` on the constant and all six
# predictors.
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.335942
         Iterations 7


In [11]:
# Coefficient table and fit statistics for the full model.
results_log.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,511.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 12 Dec 2023",Pseudo R-squ.:,0.5153
Time:,12:06:29,Log-Likelihood:,-174.02
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,7.579e-77

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1385,0.339,-0.408,0.683,-0.804,0.527
interest_rate,-0.7802,0.092,-8.471,0.000,-0.961,-0.600
credit,2.4028,1.090,2.205,0.027,0.267,4.538
march,-1.8097,0.332,-5.459,0.000,-2.459,-1.160
may,0.1946,0.229,0.849,0.396,-0.255,0.644
previous,1.2746,0.583,2.186,0.029,0.132,2.417
duration,0.0070,0.001,9.386,0.000,0.006,0.008


'may' and the constant are statistically insignificant because their p-values (the P>|z| column) are greater than 0.05, so they can be dropped. The pseudo R-squared is 0.5153 (51.53%), which is not a good value.

In [12]:
#new model

# Reduced design matrix: the predictors without 'may' and without the
# constant column. It is rebuilt from `x1` (which never had the constant)
# instead of dropping from `x` in place, so the cell can be re-run without a
# KeyError for columns that are already gone.
# NOTE(review): with no constant the refit model has no intercept - confirm
# this is intended.
x = x1.drop(columns=['may'])
x

Unnamed: 0,interest_rate,credit,march,previous,duration
0,1.334,0.0,1.0,0.0,117.0
1,0.767,0.0,0.0,1.0,274.0
2,4.858,0.0,1.0,0.0,167.0
3,4.120,0.0,0.0,0.0,686.0
4,4.856,0.0,1.0,0.0,157.0
...,...,...,...,...,...
513,1.334,0.0,1.0,0.0,204.0
514,0.861,0.0,0.0,1.0,806.0
515,0.879,0.0,0.0,0.0,290.0
516,0.877,0.0,0.0,1.0,473.0


In [13]:
# Reduced model: logistic regression of `y` on the trimmed design matrix.
new_log_reg = sm.Logit(endog=y, exog=x)
results_new_log = new_log_reg.fit()

Optimization terminated successfully.
         Current function value: 0.336668
         Iterations 7


In [14]:
# Coefficient table and fit statistics for the reduced model.
results_new_log.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,513.0
Method:,MLE,Df Model:,4.0
Date:,"Tue, 12 Dec 2023",Pseudo R-squ.:,0.5143
Time:,12:15:27,Log-Likelihood:,-174.39
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,1.185e-78

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
interest_rate,-0.8030,0.079,-10.201,0.000,-0.957,-0.649
credit,2.3459,1.071,2.190,0.029,0.246,4.445
march,-1.8387,0.315,-5.831,0.000,-2.457,-1.221
previous,1.5262,0.478,3.190,0.001,0.588,2.464
duration,0.0069,0.001,10.365,0.000,0.006,0.008


In [19]:
# Training-set confusion matrix as a labelled frame: rows are the actual
# classes, columns the predicted classes.
cm_df = pd.DataFrame(
    results_new_log.pred_table(),
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
cm_df

Unnamed: 0,Predicted:0,Predicted:1
Actual:0,218.0,41.0
Actual:1,28.0,231.0


In [20]:
# Training accuracy = correctly classified (the diagonal) / all observations.
cm = cm_df.to_numpy(copy=True)
accuracy_train = np.trace(cm) / cm.sum()
accuracy_train

0.8667953667953668

# Test data

In [22]:
# Read the hold-out set used to evaluate the reduced model.
test = pd.read_csv('Bank_data_testing.csv')
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.313,0.0,1.0,0.0,0.0,487.0,no
1,1,4.961,0.0,0.0,0.0,0.0,132.0,no
2,2,4.856,0.0,1.0,0.0,0.0,92.0,no
3,3,4.12,0.0,0.0,0.0,0.0,1468.0,yes
4,4,4.963,0.0,0.0,0.0,0.0,36.0,no


In [23]:
# Remove the leftover CSV index column and 'may', which the reduced model
# does not use.
# `test` is overwritten in place, so `errors='ignore'` keeps the cell
# re-runnable: on a second run the columns are already gone and a strict
# drop would raise KeyError.
test = test.drop(columns=['Unnamed: 0', 'may'], errors='ignore')
test

Unnamed: 0,interest_rate,credit,march,previous,duration,y
0,1.313,0.0,1.0,0.0,487.0,no
1,4.961,0.0,0.0,0.0,132.0,no
2,4.856,0.0,1.0,0.0,92.0,no
3,4.120,0.0,0.0,0.0,1468.0,yes
4,4.963,0.0,0.0,0.0,36.0,no
...,...,...,...,...,...,...
217,4.963,0.0,0.0,0.0,458.0,yes
218,1.264,0.0,1.0,0.0,397.0,yes
219,1.281,0.0,1.0,0.0,34.0,no
220,0.739,0.0,0.0,0.0,233.0,no


In [25]:
# Encode the target as 1 ('yes') / 0 ('no').
# The guard makes the cell idempotent: once the column is numeric, mapping it
# through the dict again would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(test['y']):
    test['y'] = test['y'].map({'yes': 1, 'no': 0})

In [26]:
# Actual outcomes of the hold-out set, kept aside for scoring.
test_actual = test.loc[:, 'y']
test_actual

0      0
1      0
2      0
3      1
4      0
      ..
217    1
218    1
219    0
220    0
221    1
Name: y, Length: 222, dtype: int64

In [27]:
# Predictor columns only: everything in `test` except the target.
test_data = test.drop(columns=['y'])

In [28]:
# Inspect the hold-out predictors that are passed to the model.
test_data

Unnamed: 0,interest_rate,credit,march,previous,duration
0,1.313,0.0,1.0,0.0,487.0
1,4.961,0.0,0.0,0.0,132.0
2,4.856,0.0,1.0,0.0,92.0
3,4.120,0.0,0.0,0.0,1468.0
4,4.963,0.0,0.0,0.0,36.0
...,...,...,...,...,...
217,4.963,0.0,0.0,0.0,458.0
218,1.264,0.0,1.0,0.0,397.0
219,1.281,0.0,1.0,0.0,34.0
220,0.739,0.0,0.0,0.0,233.0


In [29]:
def confusion_matrix(data, actual_values, model):
    """Build a 2x2 confusion matrix and the accuracy of `model` on `data`.

    Parameters
    ----------
    data : input passed unchanged to ``model.predict``.
    actual_values : array-like of observed outcomes, one per row of `data`.
    model : any object exposing ``predict(data)``.

    Returns
    -------
    (cm, accuracy) : tuple
        ``cm`` is a 2x2 float array of counts with actual classes on the rows
        and predicted classes on the columns; ``accuracy`` is the sum of the
        diagonal divided by the total count.
    """
    predicted_probs = model.predict(data)
    # Two bins per axis: [0, 0.5) is class 0 and [0.5, 1] is class 1, so the
    # 2-D histogram of (actual, predicted) pairs is the confusion matrix.
    # Pairs with a value outside [0, 1] fall outside the bins and are not
    # counted.
    edges = np.array([0, 0.5, 1])
    counts, _, _ = np.histogram2d(actual_values, predicted_probs, bins=edges)
    hit_rate = np.trace(counts) / counts.sum()
    return counts, hit_rate

In [31]:
# Score the reduced model on the hold-out set; the result is the pair
# (confusion matrix, accuracy).
cm = confusion_matrix(
    data=test_data,
    actual_values=test_actual,
    model=results_new_log,
)
cm

(array([[93., 18.],
        [13., 98.]]),
 0.8603603603603603)