1. Data Preparation

In [1]:
# Loading the libraries
from pandas import read_csv, get_dummies, DataFrame
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


In [2]:
my_dataset=read_csv('/content/bank.csv') #reading the working csv file

In [3]:
my_dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [4]:
my_dataset.info() #this code is defining data type of column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [5]:
#categorical feature encoding using map function
# converting categorical data to numerical data

my_dataset['default']=my_dataset['default'].map({'yes':1,'no':0})
my_dataset['housing']=my_dataset['housing'].map({'yes':1,'no':0})
my_dataset['loan']=my_dataset['loan'].map({'yes':1,'no':0})
my_dataset['y']=my_dataset['y'].map({'yes':1,'no':0})

In [6]:
# now we need to use get dummies function to all objects (Data type) - this will covert all unique rows in diffirent column and than covert categorical data to numerical data
# we will not use get dummies function to int64 data type as int is already in numeric format and float
data10=get_dummies(my_dataset,['job','marital','education','contact','month','poutcome'],dtype=int)

In [7]:
data10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 49 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  4521 non-null   int64
 1   default              4521 non-null   int64
 2   balance              4521 non-null   int64
 3   housing              4521 non-null   int64
 4   loan                 4521 non-null   int64
 5   day                  4521 non-null   int64
 6   duration             4521 non-null   int64
 7   campaign             4521 non-null   int64
 8   pdays                4521 non-null   int64
 9   previous             4521 non-null   int64
 10  y                    4521 non-null   int64
 11  job_admin.           4521 non-null   int64
 12  job_blue-collar      4521 non-null   int64
 13  job_entrepreneur     4521 non-null   int64
 14  job_housemaid        4521 non-null   int64
 15  job_management       4521 non-null   int64
 16  job_retired          452

In [8]:
y=data10['y']   #"y" is dependent variable
x=data10.drop('y',axis=1) # we will drop "y" targets value and define x as independent(to all other column)
print(y.shape)
print(x.shape)

(4521,)
(4521, 48)


## Data Scaling



In [9]:
# Scale the features using StandardScaler and convert to a DataFrame.
x_scaled=StandardScaler().fit_transform(x)
DataFrame(x_scaled)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
0,-1.056270,-0.130759,0.121072,-1.142051,-0.424756,0.374052,-0.711861,-0.576829,-0.407218,-0.320413,...,-0.364805,-0.104676,-0.669064,-0.306828,7.450671,-0.107869,-0.348652,-0.213447,-0.171381,0.469300
1,-0.772583,-0.130759,1.118644,0.875617,2.354292,-0.596026,-0.169194,-0.576829,2.989044,2.041734,...,-0.364805,-0.104676,1.494626,-0.306828,-0.134216,-0.107869,2.868193,-0.213447,-0.171381,-2.130831
2,-0.583458,-0.130759,-0.024144,0.875617,-0.424756,0.010273,-0.303898,-0.576829,2.899143,0.270124,...,-0.364805,-0.104676,-0.669064,-0.306828,-0.134216,-0.107869,2.868193,-0.213447,-0.171381,-2.130831
3,-1.056270,-0.130759,0.017726,0.875617,2.354292,-1.566105,-0.250017,0.387967,-0.407218,-0.320413,...,2.741190,-0.104676,-0.669064,-0.306828,-0.134216,-0.107869,-0.348652,-0.213447,-0.171381,0.469300
4,1.686036,-0.130759,-0.472753,0.875617,-0.424756,-1.323585,-0.146102,-0.576829,-0.407218,-0.320413,...,-0.364805,-0.104676,1.494626,-0.306828,-0.134216,-0.107869,-0.348652,-0.213447,-0.171381,0.469300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,-0.772583,-0.130759,-0.583410,0.875617,-0.424756,1.707910,0.250315,0.709566,-0.407218,-0.320413,...,-0.364805,-0.104676,-0.669064,-0.306828,-0.134216,-0.107869,-0.348652,-0.213447,-0.171381,0.469300
4517,1.496912,7.647669,-1.573671,0.875617,2.354292,-0.838546,-0.427057,-0.576829,-0.407218,-0.320413,...,-0.364805,-0.104676,1.494626,-0.306828,-0.134216,-0.107869,-0.348652,-0.213447,-0.171381,0.469300
4518,1.496912,-0.130759,-0.374724,-1.142051,-0.424756,0.374052,-0.434754,2.639160,-0.407218,-0.320413,...,-0.364805,-0.104676,-0.669064,-0.306828,-0.134216,-0.107869,-0.348652,-0.213447,-0.171381,0.469300
4519,-1.245394,-0.130759,-0.094925,-1.142051,-0.424756,-1.202326,-0.519426,0.387967,1.710451,1.451197,...,-0.364805,-0.104676,-0.669064,-0.306828,-0.134216,-0.107869,-0.348652,4.685001,-0.171381,-2.130831


## Data Splitting

In [10]:
# Split the scaled data into training and testing sets, then print their shapes. (30% testing set and 70% trainings set)
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y, test_size=0.30, random_state=40)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(3164, 48)
(1357, 48)
(3164,)
(1357,)


## Data Balancing



In [11]:
 # Apply SMOTE to balance the training data and then display the class counts.
x_train,y_train =SMOTE (random_state = 40).fit_resample(x_train,y_train)
y_train.value_counts()

Unnamed: 0_level_0,count
y,Unnamed: 1_level_1
1,2811
0,2811


## Random Forest Classifier

In [12]:

from sklearn import ensemble
from sklearn import metrics
RF_classifier11 =ensemble.RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=40)  # building model
RF_classifier11.fit(x_train,y_train)#training
y_pred1=RF_classifier11.predict(x_test)# testing (predictions on the test set)

accuracy = metrics.accuracy_score(y_test, y_pred1)  # Calculating accuracy
print("Accuracy: ", round(accuracy, 2))

recall = metrics.recall_score(y_test, y_pred1) # Calculate recall.
print("Recal: ", round(recall, 2))

precision = metrics.precision_score(y_test, y_pred1) # Calculate precision.
print("Precision: ", round(precision, 2))

Accuracy:  0.88
Recal:  0.32
Precision:  0.52


**Hyper parameter tunning**

In [13]:
from sklearn.model_selection import GridSearchCV
RF_classifier2 = ensemble.RandomForestClassifier(criterion='entropy', random_state=40) # building model
no_trees = {'n_estimators': [50,60,65,70,77,98,106],
            'max_features': ['sqrt', 'log2']}
grid_search1 = GridSearchCV(estimator=RF_classifier2, param_grid=no_trees, scoring='recall', cv=4) # using GridSearchCV to find the best hyperparameters.
grid_search1.fit(x_scaled, y)## training, testing , evaluation, ranking.
best_parameters = grid_search1.best_params_
print(best_parameters)

{'max_features': 'sqrt', 'n_estimators': 65}


In [14]:
from sklearn import ensemble
import pandas as pd
from sklearn import metrics

RF_classifier11 =ensemble.RandomForestClassifier(n_estimators=65, criterion='entropy', max_features='sqrt', random_state=40)  # building model with best parameters
RF_classifier11.fit(x_train,y_train) # fitting random forest to training data
y_pred1=RF_classifier11.predict(x_test)# making predictions on testing set

imp_features = pd.Series(RF_classifier11.feature_importances_, index=list(x)).sort_values(ascending=False)
print(imp_features)

accuracy_RF = metrics.accuracy_score(y_test, y_pred1)  # Calculating accuracy
print("Accuracy: ", accuracy_RF)

recall_RF = metrics.recall_score(y_test, y_pred1) # Calculate recall
print("Recall:", recall_RF)

precision_RF = metrics.precision_score(y_test, y_pred1) # Calculate precision
print("Precision:", precision_RF)

duration               0.254100
balance                0.060712
campaign               0.053936
housing                0.052416
day                    0.046884
age                    0.046875
marital_married        0.036181
contact_cellular       0.035437
poutcome_success       0.025720
contact_unknown        0.024668
marital_single         0.024020
education_secondary    0.022177
education_tertiary     0.019885
pdays                  0.019432
previous               0.019015
month_may              0.017848
loan                   0.017486
job_management         0.017131
job_blue-collar        0.015411
education_primary      0.014914
month_jul              0.013370
month_aug              0.013222
poutcome_unknown       0.012957
marital_divorced       0.012535
job_technician         0.012434
month_jun              0.011055
month_apr              0.009188
job_admin.             0.009001
month_oct              0.008847
month_mar              0.008831
month_nov              0.008149
poutcome

In [15]:
num_top_features = int(len(imp_features) * 0.8) #consider top 80% of the features
top_features = imp_features.index[:num_top_features]

x_train = pd.DataFrame(x_train, columns=x.columns)
x_test = pd.DataFrame(x_test, columns=x.columns) # converting x_train and x_test to data frame with original name.

x_train_top = x_train[top_features]
x_test_top = x_test[top_features] # selecting only top 80% of the feature

RF_classifier12 = ensemble.RandomForestClassifier(n_estimators=65, criterion='entropy', max_features='sqrt', random_state=40) # bulding model with best parameter
RF_classifier12.fit(x_train_top, y_train) #fitting new model
y_pred2 = RF_classifier12.predict(x_test_top) # doing prediction on test set

print("Original Model with all feature:")   # this will print and round Accuracy, recall and precision with 100% of the features
print("Accuracy:", round(metrics.accuracy_score(y_test, y_pred1), 2))
print("Recall:", round(metrics.recall_score(y_test, y_pred1), 2))
print("Precision:", round(metrics.precision_score(y_test, y_pred1), 2))

print("\nModel with important feature:") # this will print and round Accuracy, recall and precision with only 80% of the features
print("Accuracy:", round(metrics.accuracy_score(y_test, y_pred2), 2))
print("Recall:", round(metrics.recall_score(y_test, y_pred2), 2))
print("Precision:", round(metrics.precision_score(y_test, y_pred2), 2))

Original Model with all feature:
Accuracy: 0.88
Recall: 0.34
Precision: 0.53

Model with important feature:
Accuracy: 0.88
Recall: 0.39
Precision: 0.53


# Support Vector Machine (SVM)

In [16]:
# Now we will use SVM model for our prediction
from sklearn import svm
SV_classifier1 = svm.SVC(random_state = 40) # building SVM classifier
SV_classifier1.fit(x_train, y_train) # training
Y_pred1= SV_classifier1.predict(x_test)  #testing

In [None]:
from sklearn import metrics
Accuracy=metrics.accuracy_score(y_test, y_pred1) # Calculating accuracy
print("Accuracy: ", round(Accuracy, 2))

recall = metrics.recall_score(y_test, y_pred1) # Calculate recall
print ("Recall: ", round(recall,2))

precision = metrics.precision_score(y_test, y_pred1) # Calculate precision
print ("Precision: ", round(precision,2))

Accuracy:  0.88
Recall:  0.34
Precision:  0.53


**hyper parameter tunning**

In [18]:
from imblearn.pipeline import Pipeline
from sklearn.svm import SVC
SVM_classifier2 = Pipeline([('balancing', SMOTE(random_state = 40)),('classification', SVC(random_state = 40) ) ]) # bulilding SVM classifier with pipeline and SVC

kernels_c = {'classification__kernel': ['linear','poly','rbf','sigmoid'], 'classification__C': [.001,0.01,0.1,0.5,1,2]} # Defining the parameter grid for GridSearchCV.
grid_search1 = GridSearchCV(estimator=SVM_classifier2, param_grid=kernels_c, scoring='recall', cv=4) #applying GridSearchCV
grid_search1.fit(x_train,y_train)

best_parameters = grid_search1.best_params_ #getting the best parameters from GridSearchCV.
print(best_parameters)
best_result = grid_search1.best_score_ #Getting the best score
print(best_result)

{'classification__C': 2, 'classification__kernel': 'rbf'}
0.9708321682005892


In [19]:
SV_classifier3 = svm.SVC(kernel='rbf', C= 2)  # building SVM with best parameters
SV_classifier3.fit(x_train, y_train) # training
y_pred2= SV_classifier3.predict(x_test)  #testing

Accuracy=metrics.accuracy_score(y_test, y_pred2) # Calculating accuracy
print("Accuracy: ", round(Accuracy,2))

recall = metrics.recall_score(y_test, y_pred2) # Calculate recall
print ("Recall: ", round(recall,2))

precision = metrics.precision_score(y_test, y_pred2) # Calculate precision
print ("Precision: ", round(precision,2))

Accuracy:  0.86
Recall:  0.49
Precision:  0.44
