## Importing the essential libraries over here

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Importing the dataset over here

In [4]:
data=pd.read_csv("bank.csv")

In [5]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


## Taking care of duplicate observations if present over here

In [6]:
data.duplicated().sum()

0

## Taking care of missing values if present over here

In [7]:
data.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

## Filtering all the numerical features over here

In [8]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!='O']
for feature in numerical_features:
    print(feature)

age
balance
day
duration
campaign
pdays
previous


In [9]:
data[numerical_features]

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,59,2343,5,1042,1,-1,0
1,56,45,5,1467,1,-1,0
2,41,1270,5,1389,1,-1,0
3,55,2476,5,579,1,-1,0
4,54,184,5,673,2,-1,0
...,...,...,...,...,...,...,...
11157,33,1,20,257,1,-1,0
11158,39,733,16,83,4,-1,0
11159,32,29,19,156,2,-1,0
11160,43,0,8,9,2,172,5


## Filtering all the categorial features over here

In [10]:
cat_features=[feature for feature in data.columns if data[feature].dtype=='O']
for feature in cat_features:
  print(feature)

job
marital
education
default
housing
loan
contact
month
poutcome
deposit


In [11]:
data[cat_features]

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,deposit
0,admin.,married,secondary,no,yes,no,unknown,may,unknown,yes
1,admin.,married,secondary,no,no,no,unknown,may,unknown,yes
2,technician,married,secondary,no,yes,no,unknown,may,unknown,yes
3,services,married,secondary,no,yes,no,unknown,may,unknown,yes
4,admin.,married,tertiary,no,no,no,unknown,may,unknown,yes
...,...,...,...,...,...,...,...,...,...,...
11157,blue-collar,single,primary,no,yes,no,cellular,apr,unknown,no
11158,services,married,secondary,no,no,no,unknown,jun,unknown,no
11159,technician,single,secondary,no,no,no,cellular,aug,unknown,no
11160,technician,married,secondary,no,no,yes,cellular,may,failure,no


## Encoding the categorial features over here

In [12]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [13]:
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,0,0,0,0,2343,0,0,0,5,0,1042,1,-1,0,0,0
1,56,0,0,0,0,45,1,0,0,5,0,1467,1,-1,0,0,0
2,41,1,0,0,0,1270,0,0,0,5,0,1389,1,-1,0,0,0
3,55,2,0,0,0,2476,0,0,0,5,0,579,1,-1,0,0,0
4,54,0,0,1,0,184,1,0,0,5,0,673,2,-1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,5,1,2,0,1,0,0,1,20,10,257,1,-1,0,0,1
11158,39,2,0,0,0,733,1,0,0,16,1,83,4,-1,0,0,1
11159,32,1,1,0,0,29,1,0,1,19,3,156,2,-1,0,0,1
11160,43,1,0,0,0,0,1,1,1,8,0,9,2,172,5,2,1


## Creating the features and labels over here

In [14]:
data['Loan']=data['loan']

In [15]:
data.drop("loan",axis=1,inplace=True)

In [16]:
data['Loan'].value_counts()

Loan
0    9702
1    1460
Name: count, dtype: int64

In [17]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

## Splitting the dataset into training set and testing over here to avoid the problem of overfitting over here

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Experinment with different machine learning algorithms and selecting the one giving more accuracy to us with respect to performance metrics

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0),
    # "Support Vector Machine": SVC(kernel='linear'),
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    # "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB()
}

results = {}
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

best_classifier = max(results, key=results.get)
best_accuracy = results[best_classifier]

print("Best Classifier:", best_classifier)
print("Accuracy:", best_accuracy)

Best Classifier: Logistic Regression
Accuracy: 0.8741603224361845


In [20]:
classifier=LogisticRegression()
classifier.fit(X_train,y_train)

## Evaluating the performance of the model on the testing dataset over here

In [21]:
y_pred=classifier.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 1]
 [0 1]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Checking the performance of the model using the following metrics

In [22]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm=confusion_matrix(y_test,y_pred)
print(cm)
accuracy_score(y_test,y_pred)

[[1952    0]
 [ 281    0]]


0.8741603224361845