In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline #Magic command to include plots in the notebook

import statsmodels.api as sm
from scipy import stats
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
#Banking Data Frame
banking_df = pd.read_csv("../Data/bank-full.csv")

-----------------------------------------Explorartory Data Analysis----------------------------------------------------

In [None]:
#Start EDA - Exploratory Data Analysis
number_records = banking_df.shape[0]
number_columns = banking_df.shape[1]

print ("Number of Records: ",number_records)
print ("Number of Columns: ",number_columns)

In [None]:
print(banking_df.head())

In [None]:
#Data Type of Each Column
print (banking_df.dtypes)


INSIGHT:
--Multiple features are of string data type, so we will have to perform transformation into appropriate data type.


In [None]:
# Missing Value Exploration
print (banking_df.info()) #Count of Null Object

# Converting string into categorical
for feature in banking_df.columns: 
    if banking_df[feature].dtype == 'object': 
        banking_df[feature] = pd.Categorical(banking_df[feature])

print (banking_df.info())

In [None]:
#Generating the Value Count
print(banking_df.job.value_counts())
print('\n',banking_df.marital.value_counts())
print('\n',banking_df.education.value_counts())
print('\n',banking_df.default.value_counts())
print('\n',banking_df.housing.value_counts())
print('\n',banking_df.loan.value_counts())
print('\n',banking_df.contact.value_counts())
print('\n',banking_df.month.value_counts())
print('\n',banking_df.poutcome.value_counts())

In [None]:
print (banking_df.isnull().sum())
print (banking_df.isnull().values.any())
print (banking_df.isna().any())

for column in banking_df.columns:
    print (column,": ",sum(banking_df[column] == "none"))

In [None]:
# Generating Descriptive Statistical Report 
banking_df_transpose = banking_df.describe().T
print (banking_df_transpose)

INSIGHT:
    - Spread is Very High
    - We might have outliers in the data
    - We shoudnt go with mean as missing value replacement technique.
    - Columns are on different scale, so we might have to perform scaling (Standarization/Normalization)

In [None]:
# Detecting Outliers
sns.boxplot(data=banking_df, orient="h", palette="Set2")

In [None]:
banking_df.boxplot(return_type='axes',figsize=(30,10))

In [None]:
column_list = []
iqr_list = []
out_low = []
out_up = []
tot_ou = []
for column in banking_df.describe().columns:
    QTR1 = banking_df.describe().at['25%', column]
    QTR3 = banking_df.describe().at['75%', column]
    IQR = QTR3-QTR1
    LTV = QTR1 - 1.5 * IQR # lower bound 
    UTV = QTR3 + 1.5 * IQR # upper bound
    current_column = column
    current_iqr = IQR
    outliers_bl_low_bount = banking_df[banking_df[column] < LTV][column].count()
    outliers_bl_up_bount = banking_df[banking_df[column] > UTV][column].count()
    total_num_of_outliers = outliers_bl_low_bount + outliers_bl_up_bount
    
    column_list.append(current_column)
    iqr_list.append(current_iqr)
    out_low.append(outliers_bl_low_bount)
    out_up.append(outliers_bl_up_bount)
    tot_ou.append(total_num_of_outliers)

outlier_report = {"Column Name":column_list,"IQR":iqr_list,"Below Outliers":out_low,"Above Outliers":out_up,"Total No Of Outliers":tot_ou}
outlier_report = pd.DataFrame(outlier_report)

print (outlier_report)
    

In [None]:
"""----------------------------------------Visualization-------------------------------------------------------------"""

In [None]:
sns.pairplot(banking_df)

In [None]:
print (banking_df.Target.value_counts())

In [None]:
# Impact of Age on Target
fig,ax1 = plt.subplots()

#Age
bins = range(0,100,10)
sns.distplot(banking_df.age[banking_df.Target=='yes'],color='r',bins=bins,label="Subscribed",ax=ax1,kde=False)
sns.distplot(banking_df.age[banking_df.Target=='no'],color='b',bins=bins,label="Not Subscribed",ax=ax1,kde=False)
plt.legend()

INSIGHT: Age might be one important parameter, especially in range of 20-60.

In [None]:
# Impact of Jobs on Target
fig,ax2 = plt.subplots()
sns.countplot(banking_df['job'], data = banking_df, hue = 'Target', ax = ax2)
sns.despine(ax = ax2)
ax2.set_xlabel('Job', fontsize=5)
ax2.set_ylabel('Occurence', fontsize=5)
ax2.set_title('Job x Ocucurence', fontsize=5)
ax2.tick_params(labelsize=15)
ax2.set_xticklabels(banking_df['job'], rotation=90)

plt.subplots_adjust(wspace=0.5)
plt.tight_layout() 
plt.legend(title="Subscribers",labels=["Not Subscribed","Subscribed"])

INSIGHT: Few profiles are helpful for classification

--------------------------------------------------Start The Modellig Process----------------------------------------------------

In [None]:
banking_sub_df = banking_df.iloc[:,[0,1,2,3,4,5,6,7,8,16]]
print (banking_sub_df.head())

In [None]:
# Dummy Variable Creation
categorical_column = ['job','marital','education','default','housing','loan','contact']
banking_sub_df = pd.get_dummies(banking_sub_df,columns=categorical_column)

print (banking_sub_df.shape)
print (banking_sub_df.columns)

Splitting the Data Set

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
X = banking_sub_df.drop('Target',axis=1) #Input Data Set
Y = banking_sub_df[["Target"]] #Label or Outcome Column


x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=7)
print (y_train.Target.value_counts())
print (y_test.Target.value_counts())
print('x train data: ',x_train.shape)
print('y train data:',y_train.shape)
print('x test data : ',x_test.shape)
print('y test data :',y_test.shape)

In [None]:
#Lets apply scaling (Standarization or Normalization)
x_train_scaled = preprocessing.scale(x_train)
x_test_scaled = preprocessing.scale(x_test)

x_train = x_train_scaled
x_test  = x_test_scaled

In [None]:
print (x_train)
print (x_test)

In [40]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB
# from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn import preprocessing
#Prepare for cross validation
seed = 10
kfold = model_selection.KFold(n_splits=10, random_state=seed)
LogReg = LogisticRegression(solver = 'lbfgs')
LogReg.fit(x_train, y_train)

# Predicting for test set
LogReg_y_pred               = LogReg.predict(x_test)
LogReg_Score                = LogReg.score(x_test, y_test)
print (LogReg_Score)

# LogReg_ScoreAccuracy        = accuracy_score(y_test, LogReg_y_pred)

# LogReg_PrecisonScore        = precision_score(y_test, LogReg_y_pred)
# LogReg_RecollScore          = recall_score(y_test, LogReg_y_pred)
# LogReg_F1                   = f1_score(y_test, LogReg_y_pred)

cross_validation_result = model_selection.cross_val_score(LogReg, x_train, y_train, cv=kfold, scoring='accuracy')
print(cross_validation_result)

# base_model_results = pd.DataFrame([['Logistic Regression', LogReg_ScoreAccuracy, LogReg_PrecisonScore,
#                                 LogReg_RecollScore, LogReg_F1, cross_validation_result.mean(), cross_validation_result.std()]], 
#                               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Mean', 'Std Deviation'])

print(classification_report(y_test, LogReg_y_pred))

  y = column_or_1d(y, warn=True)


0.8875373216852814


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[0.88692286 0.86895217 0.88471109 0.87918164 0.87973459 0.88388167
 0.88028753 0.88277578 0.88855088 0.88274336]
              precision    recall  f1-score   support

          no       0.89      1.00      0.94      8027
         yes       0.33      0.00      0.00      1016

    accuracy                           0.89      9043
   macro avg       0.61      0.50      0.47      9043
weighted avg       0.83      0.89      0.83      9043

