In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

In [2]:
#lets import dataset
bank_df=pd.read_csv('Marketing_Analysis.csv')
bank_df.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


In [3]:
yes_df=bank_df[bank_df['y']=='yes']
yes_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
83,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
86,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
87,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
129,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
168,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [9]:
success_df=bank_df[bank_df['poutcome']=='success']
success_df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
24080,56,technician,married,secondary,no,589,yes,no,unknown,23,oct,518,1,147,2,success,yes
24165,30,admin.,married,secondary,no,873,yes,no,telephone,12,nov,119,1,167,3,success,no
24239,48,admin.,divorced,secondary,no,295,yes,no,cellular,17,nov,123,1,164,2,success,no
24264,49,management,married,tertiary,no,64,no,no,cellular,17,nov,208,1,159,1,success,no
24435,42,technician,married,tertiary,no,14282,yes,no,cellular,17,nov,77,1,103,4,success,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45190,32,blue-collar,married,secondary,no,136,no,no,cellular,16,nov,206,1,188,3,success,yes
45193,28,self-employed,single,tertiary,no,159,no,no,cellular,16,nov,449,2,33,4,success,yes
45195,68,retired,married,secondary,no,1146,no,no,cellular,16,nov,212,1,187,6,success,yes
45201,53,management,married,tertiary,no,583,no,no,cellular,17,nov,226,1,184,4,success,yes


#### Dataset Description:
<br>age : age of a customer
<br>job : type of job
<br>marital : marital status 
<br>education: education of customer
<br>default : has credit in default? (yes or no)
<br>balance : balance amount
<br>housing : has housing loan? (yes or no)
<br>loan : has a personal loan? (yes or no)
<br>contact : contact communication type (cellular or telephone)
<br>day : day of last contact
<br>month : month of last contact
<br>duration : last contact duration in seconds
<br>campaign : number of times a customer was contacted during the campaign
<br>pdays : number of days passed after the customer was last contacted from a previous campaign 
<br>previous : number of times the customer was contacted prior to (or before) this campaign
<br>poutcome : outcome of the previous marketing campaign
<br>y(target variable) : will the customer subscribe a term deposit?

In [4]:
# lets check the shape of the dataset
bank_df.shape

(45211, 17)

In [5]:
# lets check for null values
bank_df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

Hence, No null values present in the dataset

In [6]:
#lets check the datatypes of df
bank_df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [7]:
# categorical features
categorical_features=['job','marital','education','default','housing','loan','contact','month','poutcome']

In [8]:
# lets check the value counts for each categorical feature
for col in bank_df[categorical_features]:
    print(bank_df[col].value_counts())
    print('-'*75)

blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64
---------------------------------------------------------------------------
married     27214
single      12790
divorced     5207
Name: marital, dtype: int64
---------------------------------------------------------------------------
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: education, dtype: int64
---------------------------------------------------------------------------
no     44396
yes      815
Name: default, dtype: int64
---------------------------------------------------------------------------
yes    25130
no     20081
Name: housing, dtype: int64
---------------------------------------------------------------------------
no     37967
yes     7244
Name: loan

In [9]:
# function to encode categorical variables
def Encode(df,variable):
    encoded_Variable = df[variable].value_counts().to_dict()
    df[variable] = df[variable].map(encoded_Variable)

In [10]:
for col in bank_df[categorical_features]:
    Encode(bank_df,col)

In [11]:
# lets check the dataset to check if categorical encoding happened or not
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,9458,27214,13301,44396,2143,25130,37967,13020,5,13766,261,1,-1,0,36959,no
1,44,7597,12790,23202,44396,29,25130,37967,13020,5,13766,151,1,-1,0,36959,no
2,33,1487,27214,23202,44396,2,25130,7244,13020,5,13766,76,1,-1,0,36959,no
3,47,9732,27214,1857,44396,1506,25130,37967,13020,5,13766,92,1,-1,0,36959,no
4,33,288,12790,1857,44396,1,20081,37967,13020,5,13766,198,1,-1,0,36959,no


In [12]:
#lets check the value count of y
bank_df['y'].value_counts()

no     39922
yes     5289
Name: y, dtype: int64

In [13]:
# now lets label encode y output
label_map={'yes': 1, 'no': 0}
bank_df['y']=bank_df['y'].map(label_map)

In [14]:
#lets recheck the value count of y
bank_df['y'].value_counts()

0    39922
1     5289
Name: y, dtype: int64

In [15]:
# then the final processed dataset:
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,9458,27214,13301,44396,2143,25130,37967,13020,5,13766,261,1,-1,0,36959,0
1,44,7597,12790,23202,44396,29,25130,37967,13020,5,13766,151,1,-1,0,36959,0
2,33,1487,27214,23202,44396,2,25130,7244,13020,5,13766,76,1,-1,0,36959,0
3,47,9732,27214,1857,44396,1506,25130,37967,13020,5,13766,92,1,-1,0,36959,0
4,33,288,12790,1857,44396,1,20081,37967,13020,5,13766,198,1,-1,0,36959,0


In [16]:
bank_df.to_csv('processed_data.csv',index=False)

In [17]:
final_df=pd.read_csv('processed_data2.csv')

### Lets check with some EDA libraries and do some comparision to find the suitable model

### Pycaret:

In [18]:
from pycaret.classification import *
clf1=setup(data=final_df,target='y')

Unnamed: 0,Description,Value
0,session_id,1619
1,Target,y
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(45211, 16)"
5,Missing Values,False
6,Numeric Features,6
7,Categorical Features,9
8,Ordinal Features,False
9,High Cardinality Features,False


AttributeError: 'Simple_Imputer' object has no attribute 'fill_value_categorical'

In [19]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.906,0.9332,0.4857,0.6362,0.5507,0.4992,0.505,0.199
gbc,Gradient Boosting Classifier,0.9049,0.924,0.4096,0.6599,0.5051,0.4557,0.4717,1.859
rf,Random Forest Classifier,0.9013,0.9266,0.3692,0.6482,0.4702,0.4203,0.4407,1.18
lr,Logistic Regression,0.8991,0.9032,0.3388,0.6434,0.4435,0.3939,0.4187,3.222
lda,Linear Discriminant Analysis,0.899,0.9058,0.4384,0.6026,0.5072,0.4525,0.4598,0.203
ada,Ada Boost Classifier,0.8983,0.9094,0.3748,0.6186,0.4664,0.414,0.43,0.494
ridge,Ridge Classifier,0.8978,0.0,0.2805,0.6664,0.3945,0.3487,0.3881,0.036
et,Extra Trees Classifier,0.8974,0.9064,0.3338,0.6274,0.4354,0.3848,0.4082,1.388
dt,Decision Tree Classifier,0.8743,0.7026,0.4775,0.4712,0.4742,0.4028,0.4029,0.178
nb,Naive Bayes,0.856,0.8206,0.5222,0.4157,0.4627,0.3809,0.3843,0.737


In [20]:
print(best_model)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=1619, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


In [21]:
#lets create model object for the best model
#lgbm_model=create_model('lightgbm')

In [22]:
#lets check feature importance using this model
#plot_model(lgbm_model,plot='feature')

In [23]:
# lets check confusion matrix
#plot_model(lgbm_model, plot = 'confusion_matrix')

In [24]:
# another way to evaluate model is:
#evaluate_model(lgbm_model)

#### we can check prediction on new data using this approach
<br>unseen_predictions = predict_model(lgbm_model, data=data_unseen)
<br>unseen_predictions.head()

### saving the model:
<br>save_model( lgbm_model,'lgbm_model')
<br>here we are not saving the model, as the model pickle file comes with pyret dependencies, i faced issue during flask deployment.

In [25]:
top12=compare_models(n_select=13)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.906,0.9332,0.4857,0.6362,0.5507,0.4992,0.505,0.185
gbc,Gradient Boosting Classifier,0.9049,0.924,0.4096,0.6599,0.5051,0.4557,0.4717,1.666
rf,Random Forest Classifier,0.9013,0.9266,0.3692,0.6482,0.4702,0.4203,0.4407,1.102
lr,Logistic Regression,0.8991,0.9032,0.3388,0.6434,0.4435,0.3939,0.4187,2.634
lda,Linear Discriminant Analysis,0.899,0.9058,0.4384,0.6026,0.5072,0.4525,0.4598,0.191
ada,Ada Boost Classifier,0.8983,0.9094,0.3748,0.6186,0.4664,0.414,0.43,0.467
ridge,Ridge Classifier,0.8978,0.0,0.2805,0.6664,0.3945,0.3487,0.3881,0.037
et,Extra Trees Classifier,0.8974,0.9064,0.3338,0.6274,0.4354,0.3848,0.4082,1.327
dt,Decision Tree Classifier,0.8743,0.7026,0.4775,0.4712,0.4742,0.4028,0.4029,0.123
nb,Naive Bayes,0.856,0.8206,0.5222,0.4157,0.4627,0.3809,0.3843,0.051


In [26]:
print(top12)

[LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=1619, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0), GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, random_state=1619,
                          