# Logistic Regression 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [4]:
df=pd.read_csv("secusystem.csv")

In [12]:
df.shape

(48, 3)

In [7]:
df.head(10)

Unnamed: 0,IncomeKs,Sqft,SecuritySystemOwner
0,130.5,4743.6,owner
1,117.45,4554.8,owner
2,76.5,4422.0,owner
3,97.2,4341.6,owner
4,111.78,4323.2,owner
5,68.85,4246.0,owner
6,92.25,4180.8,owner
7,87.48,4168.8,owner
8,103.5,4020.0,owner
9,121.5,4020.0,owner


In [9]:
df.SecuritySystemOwner.value_counts().reset_index()

Unnamed: 0,index,SecuritySystemOwner
0,owner,24
1,nonowner,24


In [10]:
df.tail()

Unnamed: 0,IncomeKs,Sqft,SecuritySystemOwner
43,77.22,2400.0,nonowner
44,74.25,2400.0,nonowner
45,81.9,2220.0,nonowner
46,78.75,2220.0,nonowner
47,63.75,2100.0,nonowner


In [16]:
df.dtypes

IncomeKs               float64
Sqft                   float64
SecuritySystemOwner     object
dtype: object

In [17]:
df.describe()

Unnamed: 0,IncomeKs,Sqft
count,48.0,48.0
mean,93.218437,3268.7
std,30.29805,756.446139
min,41.25,2100.0
25%,68.7975,2640.0
50%,86.64,3090.0
75%,112.69125,4015.8
max,165.15,4743.6


In [100]:
df['SecuritySystemOwner'].replace({'owner':0,'nonowner':1}, inplace=True)
df.head()

Unnamed: 0,IncomeKs,Sqft,SecuritySystemOwner
0,130.5,4743.6,0
1,117.45,4554.8,0
2,76.5,4422.0,0
3,97.2,4341.6,0
4,111.78,4323.2,0


In [28]:
df.corr()

Unnamed: 0,IncomeKs,Sqft,SecuritySystemOwner
IncomeKs,1.0,0.195825,-0.668209
Sqft,0.195825,1.0,-0.616145
SecuritySystemOwner,-0.668209,-0.616145,1.0


In [60]:
Data_x= df[['IncomeKs', 'Sqft']]
Data_y=df[['SecuritySystemOwner']]

In [61]:
import statsmodels.api as sm
logit_model=sm.Logit(Data_y,sm.add_constant(Data_x))
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.077241
         Iterations 12
                            Logit Regression Results                           
Dep. Variable:     SecuritySystemOwner   No. Observations:                   48
Model:                           Logit   Df Residuals:                       45
Method:                            MLE   Df Model:                            2
Date:                 Sat, 12 Nov 2022   Pseudo R-squ.:                  0.8886
Time:                         12:15:00   Log-Likelihood:                -3.7076
converged:                        True   LL-Null:                       -33.271
Covariance Type:             nonrobust   LLR p-value:                 1.448e-13
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         89.7547     52.058      1.724      0.085     -12.276     191.786
IncomeKs      -0.486

# Train Test split

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split  
from sklearn import metrics

Dx_train, Dx_test, Dy_train, Dy_test = train_test_split(Data_x, Data_y, test_size=0.5, random_state=100)
logisticreg = LogisticRegression(solver='lbfgs')
logisticreg.fit(Dx_train, np.ravel(Dy_train))

LogisticRegression()

In [53]:
score= logisticreg.score(Dx_test,Dy_test)
score

0.9583333333333334

In [54]:
y_pred= logisticreg.predict(Dx_test)

In [55]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Dy_test, y_pred)
print(confusion_matrix)

[[11  0]
 [ 1 12]]


# Decision Tree

In [64]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree


In [65]:
DTX_train, DTX_test, DTY_train, DTY_test = train_test_split(Data_x, Data_y, test_size=0.5, random_state=100)

In [66]:
dtclf3 = tree.DecisionTreeClassifier()
dtclf3 = dtclf3.fit(DTX_train, DTY_train)

In [69]:
y_pred= dtclf3.predict(DTX_test)

In [70]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(DTY_test, y_pred)
print(confusion_matrix)

[[10  1]
 [ 0 13]]


In [72]:
score= dtclf3.score(DTX_test, DTY_test)
score

0.9583333333333334

# Naive Bayes

In [88]:
DataX = df[['IncomeKs', 'Sqft']]
DataY = df[['SecuritySystemOwner']]

In [95]:
NBX_train, NBX_test, NBY_train, NBY_test = train_test_split(DataX, DataY, test_size=0.5, random_state=100)

In [96]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

In [97]:
#Create a Classifier and Train the model
BERNmodel = BernoulliNB()
#GAUSSmodel = GaussianNB()

# If the other variables were continuous, I would use GaussianNB()
BERNmodel.fit(NBX_train, np.ravel(NBY_train))
#GAUSSmodel.fit(NBX_train, np.ravel(NBY_train))

BernoulliNB()

In [98]:
from sklearn.metrics import confusion_matrix

#Predict Bernoulli Output 
NBERNY_pred = BERNmodel.predict(NBX_test)
#NGaussY_pred = GAUSSmodel.predict(NBX_test)

NBERNConfusion_Matrix = confusion_matrix(NBY_test, NBERNY_pred)
#NGaussConfusion_Matrix = confusion_matrix(NBY_test, NGaussY_pred)

print(NBERNConfusion_Matrix)


[[11  0]
 [13  0]]


In [99]:
BERNmodel.score(NBX_test, NBY_test)


0.4583333333333333

# KNN

In [74]:
DataX = df[['IncomeKs','Sqft']]
DataY = df[['SecuritySystemOwner']]

In [77]:
normalized_DataX=(DataX-DataX.min())/(DataX.max()-DataX.min())

In [83]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(normalized_DataX, DataY, test_size=0.5, random_state=100)

In [84]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X_train, np.ravel(y_train))

KNeighborsClassifier(n_neighbors=7)

In [85]:
knn.score(X_test, y_test)

0.9166666666666666

In [86]:
Y_pred = knn.predict(X_test)

In [87]:
from sklearn.metrics import confusion_matrix
Confusion_Matrix = confusion_matrix(y_test, Y_pred)
print(Confusion_Matrix)

[[10  1]
 [ 1 12]]


# Bank data

# Logistic Regression

In [101]:
dfbank = pd.read_csv("bank_promo.csv")
dfbank.shape

(4521, 7)

In [102]:
dfbank.corr(method = 'spearman')

Unnamed: 0,job_mgmt,married,ps_edu,hous_loan,pers_loan,priorcontacts,subscribed
job_mgmt,1.0,-0.047153,0.586132,-0.04946,-0.042099,0.047123,0.038234
married,-0.047153,1.0,-0.107669,0.038432,0.032272,0.017459,-0.057971
ps_edu,0.586132,-0.107669,1.0,-0.098624,-0.043434,0.036932,0.058327
hous_loan,-0.04946,0.038432,-0.098624,1.0,0.018451,-0.007962,-0.106946
pers_loan,-0.042099,0.032272,-0.043434,0.018451,1.0,-0.013742,-0.06875
priorcontacts,0.047123,0.017459,0.036932,-0.007962,-0.013742,1.0,0.025403
subscribed,0.038234,-0.057971,0.058327,-0.106946,-0.06875,0.025403,1.0


In [103]:
Bank_DataX = dfbank[['job_mgmt','married','ps_edu','hous_loan','pers_loan', 'priorcontacts']]
Bank_DataY = dfbank[['subscribed']]

In [104]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split  
from sklearn import metrics

BankX_train, BankX_test, BankY_train, BankY_test = train_test_split(Bank_DataX, Bank_DataY, test_size=0.5, random_state=50)
bank_logisticreg = LogisticRegression(solver='lbfgs')
bank_logisticreg.fit(BankX_train, np.ravel(BankY_train))

LogisticRegression()

In [107]:
BankY_pred= bank_logisticreg.predict(BankX_test)

In [109]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(BankY_test, BankY_pred)
print(confusion_matrix)

[[1978    0]
 [ 283    0]]


In [110]:
bank_logisticreg.score(BankX_test, BankY_test)

0.8748341441839894

# Decision Tree

In [111]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree

In [112]:
DTDataX = dfbank[['job_mgmt','married','ps_edu','hous_loan','pers_loan','priorcontacts']]
DTDataY = dfbank[['subscribed']]

dtclf2 = tree.DecisionTreeClassifier()
dtclf2 = dtclf2.fit(DTDataX, DTDataY)

In [113]:
BPromo_feature_names = ['job_mgmt','married','ps_edu','hous_loan','pers_loan','priorcontacts']
BPromo_target_names = ['NotSub','Sub']

In [114]:
DTX_train, DTX_test, DTY_train, DTY_test = train_test_split(DTDataX, DTDataY, test_size=0.5, random_state=50)

In [115]:
dtclf3 = tree.DecisionTreeClassifier()
dtclf3 = dtclf3.fit(DTX_train, DTY_train)

In [116]:
DTY_pred = dtclf3.predict(DTX_test)

In [117]:
dtclf3.score(DTX_test,DTY_test)


0.8646616541353384

In [118]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(DTY_test, DTY_pred)
print(confusion_matrix)

[[1950   28]
 [ 278    5]]


# Naive Bayes

In [120]:
NBDataX = dfbank[['job_mgmt','married','ps_edu','hous_loan','pers_loan','priorcontacts']]
NBDataY = dfbank[['subscribed']]

In [121]:
NBX_train, NBX_test, NBY_train, NBY_test = train_test_split(NBDataX, NBDataY, test_size=0.5, random_state=50)

In [122]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

In [123]:
#Create a Classifier and Train the model
BERNmodel = BernoulliNB()
#GAUSSmodel = GaussianNB()

# If the other variables were continuous, I would use GaussianNB()
BERNmodel.fit(NBX_train, np.ravel(NBY_train))
#GAUSSmodel.fit(NBX_train, np.ravel(NBY_train))

BernoulliNB()

In [124]:
from sklearn.metrics import confusion_matrix

#Predict Bernoulli Output 
NBERNY_pred = BERNmodel.predict(NBX_test)
#NGaussY_pred = GAUSSmodel.predict(NBX_test)

NBERNConfusion_Matrix = confusion_matrix(NBY_test, NBERNY_pred)
#NGaussConfusion_Matrix = confusion_matrix(NBY_test, NGaussY_pred)

print(NBERNConfusion_Matrix)
#print(NGaussConfusion_Matrix)

[[1978    0]
 [ 283    0]]


In [125]:
BERNmodel.score(NBX_test, NBY_test)
#GAUSSmodel.score(NBX_test, NBY_test)

0.8748341441839894

# KNN

In [139]:
NNDataX = dfbank[['job_mgmt','married','ps_edu','hous_loan','pers_loan','priorcontacts']]
NNDataY = dfbank[['subscribed']]

In [140]:
normalized_NNDataX=(NNDataX-NNDataX.min())/(NNDataX.max()-NNDataX.min())

In [141]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(normalized_NNDataX, NNDataY, test_size=0.5, random_state=50)

In [142]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 47)
knn.fit(X_train, np.ravel(y_train))

KNeighborsClassifier(n_neighbors=47)

In [143]:
knn.score(X_test, y_test)

0.8748341441839894

In [144]:
Y_pred = knn.predict(X_test)

In [145]:
from sklearn.metrics import confusion_matrix
Confusion_Matrix = confusion_matrix(y_test, Y_pred)
print(Confusion_Matrix)

[[1978    0]
 [ 283    0]]
