In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt


In [8]:
data = pd.read_csv("/content/bank-additional-full (1).csv", sep=";")
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [9]:
x = data.drop('y', axis=1)
y = data['y']

In [10]:
y

0         no
1         no
2         no
3         no
4         no
        ... 
41183    yes
41184     no
41185     no
41186    yes
41187     no
Name: y, Length: 41188, dtype: object

In [11]:
numerical_columns = ['age', 'duration', 'campaign', 'pdays', 'previous',
                    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                    'euribor3m', 'nr.employed']
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan',
                       'contact', 'month', 'day_of_week', 'poutcome']

In [12]:
scaler = StandardScaler()
numerical_features_scaled = scaler.fit_transform(x[numerical_columns])

In [13]:
label_encoder = LabelEncoder()
categorical_features_encoded = x[categorical_columns].apply(label_encoder.fit_transform)
x_tot = pd.concat([pd.DataFrame(numerical_features_scaled, columns=numerical_columns),
                   categorical_features_encoded], axis=1)


In [14]:
imputer = SimpleImputer(strategy='mean')
x_tot_imputed = imputer.fit_transform(x_tot)
pd.DataFrame(x_tot_imputed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.533034,0.010471,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680,3.0,1.0,0.0,0.0,0.0,0.0,1.0,6.0,1.0,1.0
1,1.628993,-0.421501,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680,7.0,1.0,3.0,1.0,0.0,0.0,1.0,6.0,1.0,1.0
2,-0.290186,-0.124520,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680,7.0,1.0,3.0,0.0,2.0,0.0,1.0,6.0,1.0,1.0
3,-0.002309,-0.413787,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680,0.0,1.0,1.0,0.0,0.0,0.0,1.0,6.0,1.0,1.0
4,1.533034,0.187888,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680,7.0,1.0,3.0,0.0,0.0,2.0,1.0,6.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,3.164336,0.292025,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697,5.0,1.0,5.0,0.0,2.0,0.0,0.0,7.0,0.0,1.0
41184,0.573445,0.481012,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697,1.0,1.0,5.0,0.0,0.0,0.0,0.0,7.0,0.0,1.0
41185,1.533034,-0.267225,-0.204909,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697,5.0,1.0,6.0,0.0,2.0,0.0,0.0,7.0,0.0,1.0
41186,0.381527,0.708569,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697,9.0,1.0,5.0,0.0,0.0,0.0,0.0,7.0,0.0,1.0


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_tot_imputed,y,test_size=0.2,random_state=30)
model = LogisticRegression(max_iter = 4000)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test,y_pred)
spec = cm[0][0]/(cm[0][0]+cm[0][1])
print('accuracy score :-',accuracy_score(y_test,y_pred))
print('recall score :-',recall_score(y_test,y_pred,average="micro"))
print('specification:- ',spec)
print('precision score:-',precision_score(y_test,y_pred,average="micro"))
print('f1 score:-',f1_score(y_test,y_pred,average="micro"))
print('kappa score:-',cohen_kappa_score(y_test,y_pred))

accuracy score :- 0.9093226511289147
recall score :- 0.9093226511289147
specification:-  0.9728729963008631
precision score:- 0.9093226511289147
f1 score:- 0.9093226511289147
kappa score:- 0.4637299238128403


In [16]:
pca = PCA(0.95)

In [17]:
x_pca = pca.fit_transform(x_tot_imputed)
x_pca_df=pd.DataFrame(x_pca)

In [18]:
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:\n", explained_variance_ratio)

Explained Variance Ratio:
 [0.35113001 0.15414334 0.12240711 0.09623991 0.05262172 0.03714219
 0.0299222  0.02816635 0.02589237 0.02493749 0.02269086 0.01400312]


In [19]:
final_data = pd.concat([x_pca_df, y], axis=1)
final_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,y
0,-1.261554,2.031108,-3.761506,-0.425514,1.077260,1.083170,-1.237474,0.503916,0.907486,-0.314432,-0.192019,-0.232762,no
1,3.068707,1.622807,-1.605798,-1.476298,1.102544,1.043597,-1.485091,0.159944,0.953563,-0.341964,-0.607735,-0.232115,no
2,3.081881,1.625340,-1.385487,-1.441261,1.032108,0.298056,-0.167309,0.388389,-1.049293,-0.802946,0.363611,-0.391677,no
3,-4.107952,1.628024,-2.426528,-0.926828,1.055133,0.638066,-0.249816,0.325734,0.916287,-1.062247,0.275540,-0.255134,no
4,3.068493,1.628289,-1.543446,-1.442051,1.081797,1.019646,-1.298132,0.619761,0.779434,-0.019197,-0.404948,1.761504,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,1.274910,3.044397,1.666003,0.369772,2.230490,0.027101,-1.348103,0.109248,-0.803568,1.987461,-3.224815,-0.454285,yes
41184,-2.690449,2.802821,2.252156,0.102519,2.186571,-0.486266,0.605479,0.679527,1.116282,1.129898,-2.220394,-0.341017,no
41185,1.401811,2.806474,2.649709,-0.106945,2.245445,-0.495194,-0.227507,-0.385908,-0.755545,1.366337,-2.612255,-0.472303,no
41186,5.238781,3.294055,1.347853,0.258705,2.198644,-0.631790,0.822915,0.871601,1.110561,1.232805,-2.133153,-0.324106,yes


In [20]:
xf=final_data.iloc[:,:-1]
xf
yf=final_data.iloc[:,-1]
yf
x_train,x_test,y_train,y_test=train_test_split(xf,yf,test_size=0.2,random_state=0)
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
29321,3.315594,-2.947717,-0.603504,3.668085,1.898978,-1.053086,-0.039959,0.559545,-0.659909,1.080287,-0.251014,-0.416926
23925,6.644135,-3.574196,0.671213,-0.516067,1.853784,0.019235,-0.197734,-0.147311,-0.918862,-1.174664,0.572002,-0.384047
39148,-3.512435,1.249835,2.910116,1.582119,-1.171263,0.348074,-0.252047,-0.079209,-0.678300,-0.297740,0.860870,-0.437780
12078,-2.925483,-0.555460,-1.903509,-0.858396,2.037307,-0.099706,1.308233,0.090633,-1.104330,-1.320304,-0.181350,-0.439839
41021,5.965023,4.685242,-0.257545,0.684734,2.238893,-1.388865,1.791482,-0.164981,-0.802210,0.080971,-0.949346,-0.462824
...,...,...,...,...,...,...,...,...,...,...,...,...
20757,-3.291857,-4.087914,1.513038,-0.866082,-2.165985,0.838995,-1.489090,1.009535,1.031049,0.821534,-0.093618,-0.226600
32103,0.471388,1.758614,2.858552,0.530530,2.091817,-0.741604,-1.667397,0.010814,1.256132,0.569527,-1.614866,-0.236000
30403,-3.362815,-3.798015,2.101484,2.727823,-0.133053,-1.471609,1.103104,0.545059,1.334830,0.440075,0.166847,-0.283361
21243,-0.416285,-3.755867,0.608531,-0.281379,0.822408,0.025601,-0.136049,0.152031,-1.096100,-1.357055,0.641997,1.605178


In [21]:
model = LogisticRegression(max_iter = 4000)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test,y_pred)
spec = cm[0][0]/(cm[0][0]+cm[0][1])
print('accuracy score :-',accuracy_score(y_test,y_pred))
print('recall score :-',recall_score(y_test,y_pred,average="micro"))
print('specification:- ',spec)
print('precision score:-',precision_score(y_test,y_pred,average="micro"))
print('f1 score:-',f1_score(y_test,y_pred,average="micro"))
print('kappa score:-',cohen_kappa_score(y_test,y_pred))
# cm

accuracy score :- 0.9116290361738286
recall score :- 0.9116290361738286
specification:-  0.974723322858314
precision score:- 0.9116290361738286
f1 score:- 0.9116290361738286
kappa score:- 0.4626647751005679


In [25]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(15)

In [26]:
x_svd = svd.fit_transform(x_tot_imputed)
x_train_svd,x_test_svd,y_train,y_test = train_test_split(x_svd,yf,test_size=0.2,random_state=30)

In [27]:
model.fit(x_train_svd,y_train)
y_svd_pred = model.predict(x_test_svd)

In [28]:
cm = confusion_matrix(y_test,y_pred)
spec = cm[0][0]/cm[0][0]+cm[0][1]
print('accuracy score :-',accuracy_score(y_test,y_svd_pred))
print('recall score :-',recall_score(y_test,y_svd_pred,average="micro"))
print('specification:- ',spec)
print('precision score:-',precision_score(y_test,y_svd_pred,average="micro"))
print('f1 score:-',f1_score(y_test,y_svd_pred,average="micro"))
print('kappa score:-',cohen_kappa_score(y_test,y_svd_pred))

accuracy score :- 0.9093226511289147
recall score :- 0.9093226511289147
specification:-  494.0
precision score:- 0.9093226511289147
f1 score:- 0.9093226511289147
kappa score:- 0.4607411325460967


In [30]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# n_components = min(19,0)
n_classes = len(np.unique(y))
n_components = min(19, n_classes - 1, x_tot_imputed.shape[1])
lda = LDA(n_components=n_components)


In [31]:
x_lda = lda.fit_transform(x_tot_imputed,y)
x_train_lda,x_test_lda,y_train,y_test = train_test_split(x_lda,y,test_size=0.2,random_state=30)


In [32]:
model.fit(x_train_lda,y_train)
y_pred_lda = model.predict(x_test_lda)

In [33]:
cm = confusion_matrix(y_test,y_pred)
spec = cm[0][0]/(cm[0][0]+cm[0][1])
print('accuracy score :-',accuracy_score(y_test,y_pred_lda))
print('recall score :-',recall_score(y_test,y_pred_lda,average="micro"))
print('specification:- ',spec)
print('precision score:-',precision_score(y_test,y_pred_lda,average="micro"))
print('f1 score:-',f1_score(y_test,y_pred_lda,average="micro"))
print('kappa score:-',cohen_kappa_score(y_test,y_pred_lda))

accuracy score :- 0.9068948773974266
recall score :- 0.9068948773974266
specification:-  0.932456500890533
precision score:- 0.9068948773974266
f1 score:- 0.9068948773974266
kappa score:- 0.43688797605724106
