In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline #Magic command to include plots in the notebook

import statsmodels.api as sm
from scipy import stats
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
#Banking Data Frame
banking_df = pd.read_csv("../Data/bank-full.csv")

-----------------------------------------Explorartory Data Analysis----------------------------------------------------

In [None]:
#Start EDA - Exploratory Data Analysis
number_records = banking_df.shape[0]
number_columns = banking_df.shape[1]

print ("Number of Records: ",number_records)
print ("Number of Columns: ",number_columns)

In [None]:
print(banking_df.head())

In [None]:
#Data Type of Each Column
print (banking_df.dtypes)


INSIGHT:
--Multiple features are of string data type, so we will have to perform transformation into appropriate data type.


In [None]:
# Missing Value Exploration
print (banking_df.info()) #Count of Null Object

# Converting string into categorical
for feature in banking_df.columns: 
    if banking_df[feature].dtype == 'object': 
        banking_df[feature] = pd.Categorical(banking_df[feature])

print (banking_df.info())

In [None]:
#Generating the Value Count
print(banking_df.job.value_counts())
print('\n',banking_df.marital.value_counts())
print('\n',banking_df.education.value_counts())
print('\n',banking_df.default.value_counts())
print('\n',banking_df.housing.value_counts())
print('\n',banking_df.loan.value_counts())
print('\n',banking_df.contact.value_counts())
print('\n',banking_df.month.value_counts())
print('\n',banking_df.poutcome.value_counts())

In [None]:
print (banking_df.isnull().sum())
print (banking_df.isnull().values.any())
print (banking_df.isna().any())

for column in banking_df.columns:
    print (column,": ",sum(banking_df[column] == "none"))

In [None]:
# Generating Descriptive Statistical Report 
banking_df_transpose = banking_df.describe().T
print (banking_df_transpose)

INSIGHT:
    - Spread is Very High
    - We might have outliers in the data
    - We shoudnt go with mean as missing value replacement technique.
    - Columns are on different scale, so we might have to perform scaling (Standarization/Normalization)

In [None]:
# Detecting Outliers
sns.boxplot(data=banking_df, orient="h", palette="Set2")

In [None]:
banking_df.boxplot(return_type='axes',figsize=(30,10))

In [None]:
column_list = []
iqr_list = []
out_low = []
out_up = []
tot_ou = []
for column in banking_df.describe().columns:
    QTR1 = banking_df.describe().at['25%', column]
    QTR3 = banking_df.describe().at['75%', column]
    IQR = QTR3-QTR1
    LTV = QTR1 - 1.5 * IQR # lower bound 
    UTV = QTR3 + 1.5 * IQR # upper bound
    current_column = column
    current_iqr = IQR
    outliers_bl_low_bount = banking_df[banking_df[column] < LTV][column].count()
    outliers_bl_up_bount = banking_df[banking_df[column] > UTV][column].count()
    total_num_of_outliers = outliers_bl_low_bount + outliers_bl_up_bount
    
    column_list.append(current_column)
    iqr_list.append(current_iqr)
    out_low.append(outliers_bl_low_bount)
    out_up.append(outliers_bl_up_bount)
    tot_ou.append(total_num_of_outliers)

outlier_report = {"Column Name":column_list,"IQR":iqr_list,"Below Outliers":out_low,"Above Outliers":out_up,"Total No Of Outliers":tot_ou}
outlier_report = pd.DataFrame(outlier_report)

print (outlier_report)
    

In [None]:
"""----------------------------------------Visualization-------------------------------------------------------------"""

In [None]:
sns.pairplot(banking_df)

In [None]:
print (banking_df.Target.value_counts())

In [None]:
# Impact of Age on Target
fig,ax1 = plt.subplots()

#Age
bins = range(0,100,10)
sns.distplot(banking_df.age[banking_df.Target=='yes'],color='r',bins=bins,label="Subscribed",ax=ax1,kde=False)
sns.distplot(banking_df.age[banking_df.Target=='no'],color='b',bins=bins,label="Not Subscribed",ax=ax1,kde=False)
plt.legend()

INSIGHT: Age might be one important parameter, especially in range of 20-60.

In [None]:
# Impact of Jobs on Target
fig,ax2 = plt.subplots()
sns.countplot(banking_df['job'], data = banking_df, hue = 'Target', ax = ax2)
sns.despine(ax = ax2)
ax2.set_xlabel('Job', fontsize=5)
ax2.set_ylabel('Occurence', fontsize=5)
ax2.set_title('Job x Ocucurence', fontsize=5)
ax2.tick_params(labelsize=15)
ax2.set_xticklabels(banking_df['job'], rotation=90)

plt.subplots_adjust(wspace=0.5)
plt.tight_layout() 
plt.legend(title="Subscribers",labels=["Not Subscribed","Subscribed"])

INSIGHT: Few profiles are helpful for classification

--------------------------------------------------Start The Modellig Process----------------------------------------------------

In [None]:
# model_report_tracker = {"Algo Version":[],"Precision_Yes":[],"Precision_No":[],"F1_Yes":[],"F1_No":[],"Recall_Yes":[],"Recall_No":[]}

In [None]:
# print (model_report_tracker)

In [58]:
banking_sub_df = banking_df.iloc[:,[0,1,2,3,4,5,6,7,8,16]]
print (banking_sub_df.head())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact Target  
0  unknown     no  
1  unknown     no  
2  unknown     no  
3  unknown     no  
4  unknown     no  


In [59]:
# Dummy Variable Creation
categorical_column = ['job','marital','education','default','housing','loan','contact']
banking_sub_df = pd.get_dummies(banking_sub_df,columns=categorical_column)

print (banking_sub_df.shape)
print (banking_sub_df.columns)

(45211, 31)
Index(['age', 'balance', 'Target', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'marital_divorced', 'marital_married',
       'marital_single', 'education_primary', 'education_secondary',
       'education_tertiary', 'education_unknown', 'default_no', 'default_yes',
       'housing_no', 'housing_yes', 'loan_no', 'loan_yes', 'contact_cellular',
       'contact_telephone', 'contact_unknown'],
      dtype='object')


=================================Class Imbalance Treatment =====================================

In [None]:
"""
Down Sampling: 
    -- Reducing number of rows for majority class
"""
from sklearn.utils import resample


banking_df_majority = banking_sub_df[banking_sub_df.Target == "no"]
banking_df_minority = banking_sub_df[banking_sub_df.Target == "yes"]

# Downsampling Majority Class
# Downsample majority class
majority_down = resample(banking_df_majority, replace=False,
                                   n_samples=7000, random_state=123)
banking_down_sample_df = pd.concat([majority_down,banking_df_minority])

print (banking_down_sample_df.Target.value_counts())

Splitting the Data Set

In [49]:
#Block for undersampling & raw distribution

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

X = banking_down_sample_df.drop('Target',axis=1) #Input Data Set
Y = banking_down_sample_df[["Target"]] #Label or Outcome Column

# X = banking_sub_df.drop('Target',axis=1) #Input Data Set
# Y = banking_sub_df[["Target"]] #Label or Outcome Column

In [62]:
# Code for Oversampling

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

predictors = banking_df.iloc[:,0:16]
predictors = predictors.drop(['pdays'],axis=1)
Y = banking_df.iloc[:,16]
X = pd.get_dummies(predictors)





# Random Oversampling
from imblearn.over_sampling import RandomOverSampler, SMOTE

# over_sampler = RandomOverSampler(random_state=0)
# x_over, y_over = over_sampler.fit_resample(X, Y)

# X = x_over
# Y = y_over


smote = SMOTE(random_state=0)
x_smote, y_smote = smote.fit_resample(X, Y)
# pd.Series(y_SMOTE).value_counts()

print (pd.Series(y_smote).value_counts())

X = x_smote
Y = y_smote



yes    39922
no     39922
Name: Target, dtype: int64


In [63]:



x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=7)
# print (y_train.Target.value_counts())
# print (y_test.Target.value_counts())
# print('x train data: ',x_train.shape)
# print('y train data:',y_train.shape)
# print('x test data : ',x_test.shape)
# print('y test data :',y_test.shape)

In [51]:

# pd.Series(y_Osampled).value_counts()

In [64]:
#Lets apply scaling (Standarization or Normalization)
x_train_scaled = preprocessing.scale(x_train)
x_test_scaled = preprocessing.scale(x_test)

x_train = x_train_scaled
x_test  = x_test_scaled

In [65]:
print (x_train)
print (x_test)

[[-0.35079058 -0.43600533 -1.30611109 ... -0.15581397 -0.19992672
   0.73318721]
 [-1.35172555 -0.40018505  0.73060211 ... -0.15581397 -0.19992672
   0.73318721]
 [-0.35079058 -0.00736606  0.60330753 ... -0.15581397 -0.19992672
   0.73318721]
 ...
 [ 0.01318577 -0.4239649  -0.66963822 ... -0.15581397  5.00183266
  -1.36390813]
 [-0.53277875 -0.47363167 -0.79693279 ... -0.15581397 -0.19992672
  -1.36390813]
 [-0.71476693 -0.26473023 -0.79693279 ... -0.15581397 -0.19992672
   0.73318721]]
[[ 0.37723135  0.0308381  -0.03590778 ... -0.1615163  -0.20449668
   0.74370862]
 [-0.79864361 -0.5116049  -1.18298538 ... -0.1615163  -0.20449668
  -1.34461263]
 [-0.61773977 -0.43814183  2.00334128 ... -0.1615163  -0.20449668
   0.74370862]
 ...
 [ 0.55813519  0.94860432 -0.29081391 ... -0.1615163  -0.20449668
   0.74370862]
 [ 3.81440431 -0.50707874  1.23862288 ... -0.1615163  -0.20449668
   0.74370862]
 [-1.25090321 -0.41585919  0.85626368 ... -0.1615163  -0.20449668
  -1.34461263]]


-----------------------------------------------------Logistic Regression-----------------------------------------------------

In [66]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB
# from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn import preprocessing
#Prepare for cross validation
seed = 10
kfold = model_selection.KFold(n_splits=10, random_state=seed)
LogReg = LogisticRegression(solver = 'lbfgs')
LogReg.fit(x_train, y_train)

# Predicting for test set
y_pred               = LogReg.predict(x_test)
# train_accuracy = accuracy_score(y_train,train_pred)
# test_accuracy = accuracy_score(y_test,y_pred)

# print (train_accuracy,test_accuracy)

cross_validation_result = model_selection.cross_val_score(LogReg, x_train, y_train, cv=kfold, scoring='accuracy')
print(cross_validation_result)

cls_report = classification_report(y_test, y_pred,output_dict=True)
print (cls_report)

print (classification_report(y_test,y_pred))



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[0.94160927 0.94693175 0.94646212 0.94082655 0.93988729 0.94880225
 0.94003444 0.94066072 0.94770628 0.94817598]
{'no': {'precision': 0.9141575156452946, 'recall': 0.9767852636891244, 'f1-score': 0.9444342787435193, 'support': 7926}, 'yes': {'precision': 0.9754666666666667, 'recall': 0.9096108417257243, 'f1-score': 0.9413884063565592, 'support': 8043}, 'accuracy': 0.9429519694407915, 'macro avg': {'precision': 0.9448120911559806, 'recall': 0.9431980527074244, 'f1-score': 0.9429113425500393, 'support': 15969}, 'weighted avg': {'precision': 0.9450366878955856, 'recall': 0.9429519694407915, 'f1-score': 0.9429001844603255, 'support': 15969}}
              precision    recall  f1-score   support

          no       0.91      0.98      0.94      7926
         yes       0.98      0.91      0.94      8043

    accuracy                           0.94     15969
   macro avg       0.94      0.94      0.94     15969
weighted avg       0.95      0.94      0.94     15969



In [None]:
def insert_report(model_report_tracker,cls_report,algo_version):
    print (model_report_tracker)
    no_precision = cls_report["no"]["precision"]
    no_recall = cls_report["no"]["recall"]
    no_f1 = cls_report["no"]["f1-score"]

    yes_precision = cls_report["yes"]["precision"]
    yes_recall = cls_report["yes"]["recall"]
    yes_f1 = cls_report["yes"]["f1-score"]

    # model_report_tracker = {"Algo Version":[],"Precision_Yes":[],"Precision_No":[],"F1_Yes":[],"F1_No":[]}

    current_algo = model_report_tracker["Algo Version"]
    current_p_yes = model_report_tracker["Precision_Yes"]
    current_p_no = model_report_tracker["Precision_No"]
    current_r_yes = model_report_tracker["Recall_Yes"]
    current_r_no = model_report_tracker["Recall_No"]
    current_f_yes  = model_report_tracker["F1_Yes"]
    current_f_no = model_report_tracker["F1_No"]
    
    print (current_algo)
    print (current_p_yes)
    print (current_p_no)

    model_report_tracker["Algo Version"] = current_algo.append(algo_version)
    model_report_tracker["Precision_Yes"] = current_p_yes.append(yes_precision)
    model_report_tracker["Precision_No"] = current_p_no.append(no_precision)
    model_report_tracker["Recall_Yes"] = current_r_yes.append(yes_recall)
    model_report_tracker["Recall_No"] = current_r_no.append(no_recall)
    model_report_tracker["F1_Yes"] = current_f_yes.append(yes_f1)
    model_report_tracker["F1_No"] = current_f_no.append(no_f1)
    
    return model_report_tracker

In [None]:
# print (model_report_tracker)
# report = insert_report(model_report_tracker,cls_report,"Logistic Raw Data")



In [None]:
# print (report)

==================================================Decision Tree===============================

In [67]:
from sklearn.tree import DecisionTreeClassifier
dTree = DecisionTreeClassifier(criterion = 'entropy', random_state=1)
dTree.fit(x_train, y_train)

# Predicting for test set
y_pred               = dTree.predict(x_test)

# dTree_ScoreAccuracy        = accuracy_score(y_test, dTree_y_pred)
# dTree_PrecisonScore        = precision_score(y_test, dTree_y_pred)
# dTree_RecollScore          = recall_score(y_test, dTree_y_pred)
# dTree_F1                   = f1_score(y_test, dTree_y_pred)

cross_validation_result = model_selection.cross_val_score(dTree, x_train, y_train, cv=kfold, scoring='accuracy')
print (cross_validation_result)
# dTree_models_results = pd.DataFrame([['Decision Tree ', dTree_ScoreAccuracy, dTree_PrecisonScore,
#                                 dTree_RecollScore, dTree_F1, cross_validation_result.mean(), cross_validation_result.std()]], 
#                               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Mean', 'Std Deviation'])
# base_model_results = base_model_results.append(dTree_models_results, ignore_index = True)
# print(dTree.score(X_train, y_train))
# print(dTree.score(X_test, y_test))

print (classification_report(y_test, y_pred,output_dict = True))
print (classification_report(y_test,y_pred))

[0.92454602 0.92924233 0.92924233 0.93409518 0.92579837 0.9304838
 0.93126664 0.93111007 0.93220604 0.93251918]
{'no': {'precision': 0.9259259259259259, 'recall': 0.9336361342417361, 'f1-score': 0.9297650458600327, 'support': 7926}, 'yes': {'precision': 0.9340604237181898, 'recall': 0.9263956235235609, 'f1-score': 0.9302122347066167, 'support': 8043}, 'accuracy': 0.9299893543740998, 'macro avg': {'precision': 0.9299931748220578, 'recall': 0.9300158788826485, 'f1-score': 0.9299886402833247, 'support': 15969}, 'weighted avg': {'precision': 0.9300229743161306, 'recall': 0.9299893543740998, 'f1-score': 0.9299902784915735, 'support': 15969}}
              precision    recall  f1-score   support

          no       0.93      0.93      0.93      7926
         yes       0.93      0.93      0.93      8043

    accuracy                           0.93     15969
   macro avg       0.93      0.93      0.93     15969
weighted avg       0.93      0.93      0.93     15969



In [69]:
from sklearn.ensemble import RandomForestClassifier
randomForest = RandomForestClassifier(n_estimators = 50, random_state=1, max_features=12)
randomForest.fit(x_train, y_train)

# Predicting for test set
randomForest_y_pred               = randomForest.predict(x_test)

rdf_cross_validation_result = model_selection.cross_val_score(randomForest, x_train, y_train, cv=kfold, scoring='accuracy')
print (rdf_cross_validation_result)

# randomForest_results = pd.DataFrame([['Random Forest', randomForest_ScoreAccuracy, randomForest_PrecisonScore,
#                                 randomForest_RecollScore, randomForest_F1, rdf_cross_validation_result.mean(), rdf_cross_validation_result.std()]], 
#                               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Mean', 'Std Deviation'])
# print (randomForest_results)
# ensemble_results = ensemble_results.append(randomForest_results, ignore_index = True)

print (classification_report(y_test, y_pred,output_dict = True))
print (classification_report(y_test,y_pred))

[0.9456794  0.94802755 0.94787101 0.94630557 0.94395742 0.95115078
 0.94598403 0.94488805 0.95209018 0.95287302]
{'no': {'precision': 0.9259259259259259, 'recall': 0.9336361342417361, 'f1-score': 0.9297650458600327, 'support': 7926}, 'yes': {'precision': 0.9340604237181898, 'recall': 0.9263956235235609, 'f1-score': 0.9302122347066167, 'support': 8043}, 'accuracy': 0.9299893543740998, 'macro avg': {'precision': 0.9299931748220578, 'recall': 0.9300158788826485, 'f1-score': 0.9299886402833247, 'support': 15969}, 'weighted avg': {'precision': 0.9300229743161306, 'recall': 0.9299893543740998, 'f1-score': 0.9299902784915735, 'support': 15969}}
              precision    recall  f1-score   support

          no       0.93      0.93      0.93      7926
         yes       0.93      0.93      0.93      8043

    accuracy                           0.93     15969
   macro avg       0.93      0.93      0.93     15969
weighted avg       0.93      0.93      0.93     15969

