In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file that Kaggle mounted under the input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Bank Marketing Dataset

The goal of this dataset is to explore whether a client will subscribe to a term deposit. The data comes from a Portuguese banking institution and was collected during a direct marketing campaign (phone calls).

## UCI-Bank-Marketing-Dataset

The following information is drawn from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/bank+marketing

**Abstract**
The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).

**Data Set Information**
The data is related to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (bank term deposit) would be ('yes') or would not be ('no') subscribed.

The classification goal is to predict if the client will subscribe (yes/no) a term deposit (variable y).

## Attribute information
### Input variables

### Bank client data
* age (numeric)
* job : type of job (categorical)
* marital : marital status (categorical)
* education (categorical)
* default: has credit in default? (categorical)
* housing: has housing loan? (categorical)
* loan: has personal loan? (categorical)

## Related with the last contact of the current campaign
* contact: contact communication type (categorical)
* month: last contact month of year (categorical)
* day_of_week: last contact day of the week (categorical)
* duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

### Other attributes
* campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
* pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
* previous: number of contacts performed before this campaign and for this client (numeric)
* poutcome: outcome of the previous marketing campaign (categorical)


### Social and economic context attributes
* emp.var.rate: employment variation rate - quarterly indicator (numeric)
* cons.price.idx: consumer price index - monthly indicator (numeric)
* cons.conf.idx: consumer confidence index - monthly indicator (numeric)
* euribor3m: euribor 3 month rate - daily indicator (numeric)
* nr.employed: number of employees - quarterly indicator (numeric)

### Output variable (desired target)
* y - has the client subscribed a term deposit? (binary: 'yes','no')


### Relevant Papers
S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014

S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimaraes, Portugal, October, 2011. EUROSIS. [bank.zip]

In [None]:
# Load in the data wrangling Libraries
import pandas as pd
import numpy as np

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

Load the dataset and explore what the data looks like.

In [None]:
# Load data
#----------
# The file is read with ';' as the field separator
bank = pd.read_csv(
    '/kaggle/input/bank-marketing-data-set/bank-additional-full.csv',
    sep=';',
)

# Explore the top of the dataset
#------------------------------
bank.head()

**Additional information about the dataset.**

In [None]:
# Explore Additional Information about this dataset
#--------------------------------------------------
# The context manager closes the file handle as soon as the text has been read
with open('/kaggle/input/bank-marketing-data-set/bank-additional-names.txt', "r") as f:
    print(f.read())

In [None]:
# Check for null values and the datatypes
#----------------------------------------
bank.info()

In [None]:
# Double check for null values
#-----------------------------
bank.isnull().sum()

Checked for Null values two ways and this dataset appears not to have null values.

In [None]:
# How many rows and columns does the dataset have
#------------------------------------------------
bank.shape

In [None]:
# Quick statistical run down of the numerical columns
#----------------------------------------------------
bank.describe()

The features differ widely in their ranges of values, so they need to be standardized before modeling.

In [None]:
# Some of the column names have periods. Get rid of the periods in the column names
bank.columns

In [None]:
# Replace periods (.) in the column names with underscores (_).
# regex=False treats '.' as a literal period; interpreted as a regular expression
# it would match any character, and the default for `regex` differs between pandas versions.
new_column_names = bank.columns

bank.columns = new_column_names.str.replace('.', '_', regex=False)

bank.head()

# Exploratory Data Analysis (EDA)
Our goals for exploratory data analysis are the following:

* Assemble.
* Clean
* Explore
* Summarize

More specifically, we'll look at the following topics.

### Exploratory Data Analysis (EDA)
* Exploratory data analysis (EDA) - the process of going through a dataset and finding out more about it.

### Model Building
* Model training - create model(s) to learn to predict a target variable based on other variables.
* Model evaluation - evaluating a models predictions using problem-specific evaluation metrics.
* Model comparison - comparing several different models to find the best one.
* Model fine-tuning - once we've found a good model, how can we improve it?
* Feature importance - since we're predicting whether a client will subscribe to a term deposit, are there some features which are more important for prediction?
* Cross-validation - if we do build a good model, can we be sure it will work on unseen data?
* Reporting what we've found - if we had to present our work, what would we show someone?

To work through our analysis, we'll use pandas, Matplotlib and NumPy for data analysis, then we'll use Scikit-Learn for machine learning and modelling tasks.

In [None]:
# Target Variable (Look at the counts)
#------------------------------------
bank['y'].value_counts(dropna=False)

In [None]:
# Target Variable (Look at the percentages)
#------------------------------------------
bank['y'].value_counts(normalize=True, dropna=False)

There are more no's than yes's in this dataset, which is probably what is expected in a bank marketing setting. This is also a case of an imbalanced dataset, so care must be taken when interpreting accuracy, precision and recall.

In [None]:
# Count plot of the target variable to see how imbalanced the two classes are
plt.style.use('fivethirtyeight')
plt.figure(figsize=(8, 6))

sns.countplot(x='y', data=bank)

plt.title('Has The Client Subscribed a Term Deposit?')
plt.xlabel('')
plt.tight_layout()
plt.show()

In [None]:
# Same chart as a bar plot, with the class count written above each bar
class_counts = bank['y'].value_counts()
labels = class_counts.index
values = class_counts.values

plt.figure(figsize=(8, 6))
ax = sns.barplot(x=labels, y=values)
for bar_idx, bar in enumerate(ax.patches):
    x_center = bar.get_x() + bar.get_width() / 2.
    # Small vertical offset so the text sits just above the bar
    ax.text(x_center, bar.get_height() + 0.1, values[bar_idx], ha="center")

plt.title('Has The Client Subscribed a Term Deposit?')
plt.xlabel('')
plt.tight_layout()
plt.show()

The plot shows that there is an imbalance between the number of no's and yes's in the label (target) column.

Let's look at the age distribution of those who subscribed to the term deposit and those who did not.

In [None]:
plt.style.use('default')

# sns.displot is a figure-level function that always creates its own figure, so a
# preceding plt.figure(...) call only leaves a stray empty figure in the output.
g = sns.displot(data=bank, x='age', hue='y', bins=30, kde=False, legend=False)

plt.title('Age Distribution')
plt.xlabel('Age')

# NOTE(review): these labels are attached by position to the drawn artists; confirm
# that 'yes'/'no' line up with the plotted colours on the installed seaborn version.
plt.legend(title='Subscribed Term', loc='upper right', labels=['yes', 'no'])
plt.show()

In [None]:
# Side-by-side age histograms: subscribers ('yes') on the left, non-subscribers ('no') on the right.
# The size is passed to plt.subplots directly: changing the "figure.figsize" rcParam
# after the figure has been created does not resize an existing figure.
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
plt.style.use('default')

# Kept so that figures created by later cells still pick up the seaborn theme and default size
sns.set(rc={"figure.figsize":(8, 4)})

# .loc[rows, column] selects in a single step instead of chained indexing
sns.histplot(bank.loc[bank['y']=='yes', 'age'], bins=30, kde=True, color='#ffa54c', ax=axes[0])
axes[0].set_xlabel("Age", fontsize = 10)
axes[0].set_title('Age Distribution (Yes)')

sns.histplot(bank.loc[bank['y']=='no', 'age'], bins=30, kde=True, color='#539100', ax=axes[1])
axes[1].set_xlabel("Age", fontsize = 10)
axes[1].set_ylabel('')
axes[1].set_title('Age Distribution (No)')

plt.show()

The age distribution between both the no's and the yes's is very similar

In [None]:
# Boxplot of Yes's for Age
#-------------------------
yes_data = bank.loc[bank['y']=='yes']

sns.set_style('darkgrid')
g = sns.boxplot(data=yes_data,y='y',x='age',orient = 'h', color = '#bb1587')
g.set_title('Age Distribution (Yes)')

# plt.show() displays the open figures and does not take an Axes argument
plt.show()

**The majority of clients who subscribed to a term deposit are between the ages of 30 and 50, which are also the prime income-earning years.**

In [None]:
#=======================================
# Create "Wrapped" Small Multiple Chart
#=======================================
# One age histogram per job, coloured by the target variable
grid_layout = sns.FacetGrid(bank, col = 'job', hue='y', col_wrap = 3)
grid_layout.map(plt.hist, 'age')

# Without a legend the two hue colours cannot be told apart
grid_layout.add_legend(title='Subscribed')

# plt.title() would only overwrite the title of the last facet (the current axes);
# a figure-level title labels the whole grid and keeps every facet's own title.
plt.suptitle('Job Distribution', y=1.02)
plt.show()

In [None]:
# Age histograms split by education level and coloured by the target variable
grid_layout = sns.FacetGrid(
    bank,
    col='education',
    hue='y',
    col_wrap=4,
).map(plt.hist, 'age');

In [None]:
# Age histograms split by marital status and coloured by the target variable
grid_layout = sns.FacetGrid(
    bank,
    col='marital',
    hue='y',
    col_wrap=4,
).map(plt.hist, 'age');

**More married couples applied and subscribed a term deposit.**

In [None]:
# Check the top of the dataset again
#-----------------------------------
bank.head()

In [None]:
# Give the education categories human-readable labels
#----------------------------------------------------
# Raw value -> display label; values that are not listed are left untouched by replace()
educ_dict_mapping = {
    'illiterate': 'Illiterate',
    'basic.4y': 'Basic 4year',
    'basic.6y': 'Basic 6year',
    'basic.9y': 'Basic 9year',
    'high.school': 'High School',
    'professional.course': 'Professional Course',
    'university.degree': 'University Degree',
    'unknown': 'Unknown',
}

bank['education'] = bank['education'].replace(to_replace=educ_dict_mapping)
bank['education'].unique()

In [None]:
# Horizontal count plot of education levels, most frequent level first
sns.countplot(
    y='education',
    data=bank,
    order=bank['education'].value_counts().index,
)
plt.title('Education Level')
plt.ylabel('Education')
plt.show()

In [None]:
# Age distribution per education level, restricted to clients with y == 'yes'
yes_data = bank.loc[bank['y']=='yes']

sns.set_style('darkgrid')
g = sns.boxplot(data=yes_data,y='education',x='age',orient = 'h')

g.set_xlabel('Age')
g.set_ylabel('Education Level')
g.set_title('Age Distribution of Education of those Subscribed Loan Deposit')

# plt.show() displays the open figures and does not take an Axes argument
plt.show()

This graph suggests that the more education a client has, the earlier in life they tend to subscribe to a term deposit. This is probably due to a number of factors, such as income level and savings. Let's also explore the distribution of each education category among those who subscribed.

In [None]:
yes_data['education'].value_counts(normalize=True).sort_values(ascending=False)

In [None]:
# Share (in percent) of each education level among the clients who subscribed.
# The index is named explicitly before reset_index(): the column that reset_index()
# produces is called 'index' on pandas < 2 but 'education' on pandas >= 2, so renaming
# a column called 'index' afterwards is not portable.
education_percentages = (
    yes_data['education']
    .value_counts(normalize=True)
    .mul(100)
    .rename('percentage')
    .rename_axis('education_level')
    .reset_index()
    .sort_values('percentage', ascending=False)
)

# Print out the results before plotting
education_percentages

Lets explore the educational level of those members who were subscribed a loan.

In [None]:
# Bar chart of the percentage of subscribers per education level
ax = sns.barplot(x='education_level', y='percentage', data=education_percentages)

# Axis labels and title in one call
ax.set(
    xlabel='Education Level',
    ylabel='Percentage',
    title='Percentage of Clients Subscribed Loan Deposit (Education)',
)

# Slant the category labels so they do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# Age distribution per job, restricted to clients with y == 'yes'
#----------------------------------------------------------------
sns.set_style('darkgrid')
g = sns.boxplot(data=yes_data,y='job',x='age',orient = 'h')

g.set_xlabel('Age')
g.set_ylabel('Job Occupation')
g.set_title('Job Occupation of those Subscribed Loan Deposit')

# plt.show() displays the open figures and does not take an Axes argument
plt.show()

In [None]:
# Count of subscribers per job, most frequent job first
#------------------------------------------------------
ax = sns.countplot(x='job', data=yes_data, order=yes_data['job'].value_counts().index)

# Style, axis labels and title
sns.set_style("whitegrid")
ax.set(
    ylabel='Count',
    xlabel='Job Occupation',
    title='Job Occupation of those Subscribed Loan Deposit',
)

# Slant the category labels so they do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

For job occupation, the data shows that clients who subscribed to a term deposit are more likely to hold standard occupations than occupations such as entrepreneur or housemaid, which may reflect a steady income.

In [None]:
# Number of last contacts per weekday (it is unclear how informative this feature is)
plt.style.use('fivethirtyeight')
plt.figure(figsize=(8, 6))

sns.countplot(x='day_of_week', data=bank)
plt.title('Day of Week')
plt.xlabel('Day')
plt.show()

# Modeling I: Feature Engineering and Selection

In [None]:
# Before we start with feature engineering, lets review our dataset again
bank.info()

In [None]:
# Collect the names of all numerical columns
#-------------------------------------------
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_features = list(bank.select_dtypes(include=numerics).columns)

# Display the numerical feature names
num_features

In [None]:
# Collect the names of all categorical columns
#---------------------------------------------
objects = ['object','category', 'bool']
cat_features = list(bank.select_dtypes(include=objects).columns)

# 'y' is the target, not a predictor
cat_features.remove('y')

# Display the categorical feature names
cat_features

## Numerical Feature Selection

In [None]:
# employment variation rate - quarterly indicator (numeric)   # Maybe this value affects clients getting approved for a loan deposit
bank['emp_var_rate'].value_counts()

In [None]:
# duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output 
# target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after 
# the end of the call y is obviously known. 
# Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have 
# a realistic predictive model.

sns.catplot(data=bank, x='y',y='duration')

In [None]:
# Distribution of the last call's duration for each value of the target variable
g = sns.boxplot(data=bank, x='duration', y='y',orient = 'h')

g.set_xlabel('Duration')
g.set_ylabel('Subscribed Loan Deposit')
g.set_title('Duration of Last Phone Call')

# plt.show() displays the open figures and does not take an Axes argument
plt.show()

In [None]:
# Look at the numberical features correlation
bank[num_features].corr()

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(bank[num_features].corr())

In [None]:
# Create a correlation matrix
corr_metrics = bank[num_features].corr(method='pearson')
corr_metrics.style.background_gradient()

### Definition of some numerical features
* euribor3m: euribor 3 month rate - daily indicator (numeric)
* nr.employed: number of employees - quarterly indicator (numeric)
* emp.var.rate: employment variation rate - quarterly indicator (numeric)

The correlation between euribor3m and nr_employed is extremely high 94.51%

So is:
* euribor3m - emp_var_rate: 97.23%
* nr_employed - emp_var_rate: 90.60%
* emp_var_rate - cons_price_idx: 77.53%
* cons.price.idx: consumer price index - monthly indicator (numeric)

Which features to keep and which to disregard because of high correlation

In [None]:
# If the correlation between two features is extremely high we discard one of them
# to avoid multicollinearity. For every pair (i, j) with i < j above the threshold,
# the later column j is dropped and the earlier column i is kept.
columns = np.full((corr_metrics.shape[0],), True, dtype=bool)
for i in range(corr_metrics.shape[0]):
    for j in range(i+1, corr_metrics.shape[0]):
        # Compare the magnitude: a correlation of -0.95 is just as redundant as +0.95,
        # but the signed comparison would never flag it.
        if abs(corr_metrics.iloc[i,j]) >= 0.9:
            columns[j] = False


numerical_selected_columns = corr_metrics.columns[columns]

In [None]:
numerical_selected_columns

In [None]:
bank[numerical_selected_columns].corr(method='pearson').style.background_gradient()

The correlation between the cons_price_idx and the emp_var_rate is still fairly high but we will watch out for it during modeling

In [None]:
bank[numerical_selected_columns].corr(method='pearson').style.background_gradient()

## Categorical Features Pre-Processing

In [None]:
# Normalise the job labels: '-' becomes '_' and periods are removed (e.g. 'admin.' -> 'admin').
# Both patterns must be matched literally. With regex=True the pattern '.' means "any
# character" on pandas >= 2.0, which would turn every job value into an empty string.
bank['job'] = (
    bank['job']
    .str.replace('-', '_', regex=False)
    .str.replace('.', '', regex=False)
)

bank['job'].value_counts()

In [None]:
#poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
bank['poutcome'].value_counts()

In [None]:
# Cross-tabulate the target (rows) against job (columns), including row/column totals
#------------------------------------------------------------------------------------
contingency_table = pd.crosstab(index=bank['y'], columns=bank['job'], margins=True)
contingency_table

In [None]:
# Counts for the first six job columns of the table: first row and second row
#----------------------------------------------------------------------------
no_count = contingency_table.iloc[0, 0:6].values
yes_count = contingency_table.iloc[1, 0:6].values

# Stacked bar chart: the second row's counts are stacked on top of the first row's
fig = plt.figure(figsize=(10, 5))
sns.set(font_scale=1.8)
# NOTE(review): these display labels are hard-coded and must match the order of the
# first six table columns -- confirm against the printed contingency table.
categories = ["admin","Blue Collar","Entrepreneur","Housemaid","Management","Retired"]
p1 = plt.bar(categories, no_count, width=0.55, color='#d62728')
p2 = plt.bar(categories, yes_count, width=0.55, bottom=no_count)
plt.legend((p2[0], p1[0]), ('Yes', 'No'))

plt.xticks(rotation=40)
plt.xlabel('Job Occupation')
#plt.ylabel('Count')
plt.show()

## Load Machine Learning Tools

In [None]:
# Machine Learning Imports
#--------------------------------
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# Preprocessing
#--------------------------------
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


# Machine Learning Algorithms
#--------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import ExtraTreesClassifier


# Model Selection
#------------------------------------
from sklearn.model_selection import GridSearchCV


# Metrics
#-------------------------------
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

### Preprocess the Data
The next step is to set up a pipeline to preprocess the features. We will impute all missing values with a constant, and one-hot encode all categorical features.

In [None]:
# Numerical features: the correlation-filtered columns minus 'duration'
num_features = numerical_selected_columns.tolist()
num_features.remove('duration')

# Numerical branch: fill missing values with a constant, then standardise
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant")),
    ('std_scaler', StandardScaler()),
])

# Categorical features are used as previously collected
cat_features = cat_features

# Categorical branch: one-hot encode; categories unseen during fit are ignored at transform time
cat_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

## Fit the Models and Evaluate Performance

In [None]:
# Feature matrix (selected numerical + categorical columns) and target vector
features = [*num_features, *cat_features]
X = bank[features]
y = bank["y"]

## Logistic Regression (Base Model)

In [None]:
# Import train_test_split function
#---------------------------------
from sklearn.model_selection import train_test_split

# Preprocessing + logistic regression in a single pipeline
model = Pipeline(steps=[("preprocessor",preprocessor),
                        ("model", LogisticRegression(solver='liblinear'))   # ‘liblinear’ - [‘l1’, ‘l2’]
                       ])


# Train Test Split
# A fixed random_state makes the split, and every metric computed from it, reproducible
# when the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and score the model (accuracy on the held-out test set)
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Use the model to make predictions on the test data (further evaluation)
y_preds = model.predict(X_test)
y_preds

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_preds)

In [None]:
# Probably easier to visualize
#-----------------------------

# One way to do it is using pd.crosstab()
pd.crosstab(y_test, 
            y_preds, 
            rownames=["Actual Label"], 
            colnames=["Predicted Label"])

In [None]:
# Visualise the confusion matrix with Seaborn's heatmap()
#--------------------------------------------------------
import seaborn as sns

# Larger fonts for readability
sns.set(font_scale=1.5)

# Confusion matrix of the test labels against the predictions
conf_mat = confusion_matrix(y_test, y_preds)

# Draw it as a heatmap
sns.heatmap(data=conf_mat)
plt.show()

In [None]:
def plot_conf_mat(conf_mat):
    """
    Plot a confusion matrix as an annotated Seaborn heatmap.

    Parameters
    ----------
    conf_mat : array-like
        Square matrix of counts, e.g. the output of
        sklearn.metrics.confusion_matrix. Rows are labelled as true labels
        and columns as predicted labels.
    """
    fig, ax = plt.subplots(figsize=(8,6))
    sns.heatmap(conf_mat,
                annot=True,   # Annotate the boxes
                fmt="g",      # Plain numbers; the default '.2g' renders large counts as e.g. 7.1e+03
                cbar=False,
                ax=ax)        # Draw on the axes created above rather than on whatever is current
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")


plot_conf_mat(conf_mat)

In [None]:
# Classification Report

print(classification_report(y_test, y_preds))

Precision is high for 'no' but low for 'yes', while recall is also very high. Since we also know that the dataset is unbalanced, we will stratify the train/test split on the target variable.

### Unbalanced dataset: let's stratify the train and test sets and fit the model again

In [None]:
# Preprocessing + logistic regression pipeline, this time trained on a stratified split
#-------------------------------------------------------------------------------------
model = Pipeline(steps=[("preprocessor",preprocessor),
                        ("clf", LogisticRegression(solver='liblinear'))   # ‘liblinear’ - [‘l1’, ‘l2’]
                       ])


# Train Test Split
# stratify=y keeps the yes/no proportions the same in both sets (the dataset is unbalanced);
# random_state makes the split reproducible when the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train set class weights
pd.Series(y_train).value_counts(normalize=True)

In [None]:
# Test set class weights
pd.Series(y_test).value_counts(normalize=True)

### Notes:
We commonly use train_test_split function of Sklearn to divide the data and Sklearn provides handy argument - stratify to generate stratified splits

Setting stratify to the target (y) yielded identical distributions in both the train and test sets.
Altered class weights are a serious problem that might make a model more biased towards a particular class. Forgetting to generate stratified splits might result in a more favorable train or test sets 

When using scikit-learn's cross-validation helpers (e.g. `cross_val_score` or `GridSearchCV` with an integer `cv`) on a classifier, you don’t have to worry about this problem because they split the data with StratifiedKFold under the hood.

In [None]:
# Fit and score the model
#------------------------
model.fit(X_train, y_train)
# Only the last expression of a cell is displayed, so the accuracy is printed
# explicitly instead of being computed and silently discarded.
print(f"Test accuracy: {model.score(X_test, y_test):.4f}")

# Use the model to make predictions on the test data (further evaluation)
y_preds = model.predict(X_test)

# Create a confusion matrix
conf_mat = confusion_matrix(y_test, y_preds)

plot_conf_mat(conf_mat)

In [None]:
# Classification Report
#----------------------

print(classification_report(y_test, y_preds))

In [None]:
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


def evaluate_preds(y_true, y_preds, pos_label="yes"):
    """
    Compare true labels with predicted labels, print the metrics and return them.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_preds : array-like
        Predicted labels.
    pos_label : default "yes"
        Label treated as the positive class when computing precision,
        recall and F1.

    Returns
    -------
    dict
        Keys "accuracy", "precision", "recall" and "f1", each rounded to
        two decimals.
    """
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds, pos_label=pos_label)
    recall = recall_score(y_true, y_preds, pos_label=pos_label)
    f1 = f1_score(y_true, y_preds, pos_label=pos_label)
    metric_dict = {"accuracy": round(accuracy, 2),
                   "precision": round(precision, 2), 
                   "recall": round(recall, 2),
                   "f1": round(f1, 2)}
    
    # The printed values keep more precision than the rounded dictionary entries
    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 score: {f1:.2f}")

    return metric_dict

In [None]:
precision_score(y_test, y_preds, pos_label="yes")

In [None]:
# Evaluate the classifier on validation set
baseline_metrics = evaluate_preds(y_test, y_preds)
baseline_metrics

In [None]:
model.named_steps

In [None]:
import sklearn
print(sklearn.__version__)

In [None]:
num_features

In [None]:
#model['preprocessor'].transformers_[0][1][0].get_feature_names(num__features)#[0][0]#['imputer']#.get_feature_names(num__features)

In [None]:
model['preprocessor'].transformers_[1][1]

In [None]:
# Names of the one-hot encoded columns produced by the fitted categorical branch
onehot_encoder = model['preprocessor'].transformers_[1][1]['onehot']

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(onehot_encoder, 'get_feature_names_out'):
    onehot_feature_names = onehot_encoder.get_feature_names_out(cat_features)
else:
    onehot_feature_names = onehot_encoder.get_feature_names(cat_features)

onehot_feature_names

In [None]:
# Feature names in the order the ColumnTransformer emits them:
# the numerical columns first, then the one-hot encoded categorical columns.
_onehot = model['preprocessor'].transformers_[1][1]['onehot']

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out() when available
if hasattr(_onehot, 'get_feature_names_out'):
    _onehot_names = _onehot.get_feature_names_out(cat_features)
else:
    _onehot_names = _onehot.get_feature_names(cat_features)

logistic_feature_names = num_features + list(_onehot_names)

logistic_feature_names

In [None]:
# Get the names of each feature
#feature_names = model.named_steps["preprocessor"]#.get_feature_names()
X.columns#feature_names

In [None]:
# Get the coefficients of each feature
#model['clf']
# Get the coefficients of each feature
coefs = model.named_steps["clf"].coef_.flatten()
coefs

In [None]:
# Feature names side by side with their coefficients
col_names_coef = pd.concat(
    [pd.Series(logistic_feature_names), pd.Series(coefs)],
    axis=1,
)

# One row per feature: its name and its fitted coefficient
data = dict(feature_name=logistic_feature_names, coefficients=coefs)

logistic_df = pd.DataFrame(data)

In [None]:
# Rank the features by coefficient magnitude; colour positive ones green and the rest red
logistic_df["abs_value"] = logistic_df["coefficients"].abs()
logistic_df["colors"] = (logistic_df["coefficients"] > 0).map({True: "green", False: "red"})
logistic_df = logistic_df.sort_values("abs_value", ascending=False)


# Bar chart of the 20 largest coefficients by magnitude
fig, ax = plt.subplots(1, 1, figsize=(12, 7))
sns.barplot(data=logistic_df.head(20),
            x="feature_name",
            y="coefficients",
            palette=logistic_df.head(20)["colors"])

ax.set_title("Top 20 Features", fontsize=25)
ax.set_xlabel("Feature Name", fontsize=22)
ax.set_ylabel("Coef", fontsize=22)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=20)

plt.savefig("top_20_features.jpg") #save as jpg
plt.show()

**TODO:** Explain what the feature coefficients represent, as well as the odds ratio.

### The dataset is imbalanced, with more No's than Yes's

Lets shuffle the dataset and resample to make sure they have the same number of each sample

In [None]:
# Split the rows by target value: clients with y == 'no' and clients with y == 'yes'
deposit_no = bank.loc[bank['y']=='no']
deposit_yes = bank.loc[bank['y']=='yes']

# Randomly down-sample the 'no' rows to the number of 'yes' rows
# (random_state is fixed so the same rows are drawn on every run)
deposit_no = deposit_no.sample(n = len(deposit_yes),random_state=10)


# Stack the sampled 'no' rows and all 'yes' rows into one balanced dataframe
# (rows are not shuffled here: all 'no' rows come first, then all 'yes' rows)
bank_sampled = pd.concat([deposit_no, deposit_yes])


# Feature matrix and target vector of the balanced sample
features = num_features + cat_features
X_sample = bank_sampled[features]
y_sample = bank_sampled["y"]

In [None]:
X_sample.shape

In [None]:
# From preprocessed numerical features remove duration 
num_features = numerical_selected_columns.tolist()

num_features.remove('duration')

# Numerical branch: fill missing values with a constant, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="constant")),
    ('std_scaler', StandardScaler()),
])

# Categorical Features
cat_features = cat_features

# Categorical branch: one-hot encode; categories unseen during fit are ignored at transform time
cat_pipeline = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns through its own branch
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

# Preprocessing + logistic regression in a single pipeline
model = Pipeline(steps=[("preprocessor",preprocessor),
                        ("clf", LogisticRegression(solver='liblinear'))   # ‘liblinear’ - [‘l1’, ‘l2’]
                       ])


# Train Test Split
# stratify keeps the yes/no proportions of the sample identical in the train and test sets
# (consistent with the earlier stratified split); random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, test_size=0.2, stratify=y_sample, random_state=42
)


# Train set class weights
pd.Series(y_train).value_counts(normalize=True)

In [None]:
# Test set class weights
pd.Series(y_test).value_counts(normalize=True)

In [None]:
# Fit and score the model
model.fit(X_train, y_train)
# Only the last expression of a cell is displayed, so the accuracy is printed
# explicitly instead of being computed and silently discarded.
print(f"Test accuracy: {model.score(X_test, y_test):.4f}")

# Use the model to make predictions on the test data (further evaluation)
y_preds = model.predict(X_test)

# Create a confusion matrix
conf_mat = confusion_matrix(y_test, y_preds)

plot_conf_mat(conf_mat)

In [None]:
# Classification Report

print(classification_report(y_test, y_preds))

In [None]:
# Evaluate the classifier on validation set
baseline_metrics = evaluate_preds(y_test, y_preds)
baseline_metrics

In [None]:
from sklearn.metrics import RocCurveDisplay

# sklearn.metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement. The old helper is only used as a
# fallback, under an alias, because this notebook later defines its own function called
# plot_roc_curve, which would otherwise shadow it when this cell is re-run.
if hasattr(RocCurveDisplay, "from_estimator"):
    RocCurveDisplay.from_estimator(model, X_test, y_test)
else:
    from sklearn.metrics import plot_roc_curve as sklearn_plot_roc_curve
    sklearn_plot_roc_curve(model, X_test, y_test)


#y_score = model.predict(X_test)
#y_score = model.decision_function(X_test)

In [None]:
from sklearn.metrics import roc_curve

# Make predictions with probabilities (one column per class, in the order of model.classes_)
y_probs = model.predict_proba(X_test)

# Keep the probabilities of the positive class only.
# The column is looked up from the fitted classes rather than assumed to be column 1.
positive_class_index = list(model.classes_).index('yes')
y_probs_positive = y_probs[:, positive_class_index]

# Calculate fpr, tpr and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive, pos_label='yes')

# Quick sanity check: first ten probabilities and the number of predictions
y_probs_positive[:10], len(y_probs_positive)

In [None]:
# Create a function for plotting ROC curves
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Draw a ROC curve on the current axes and show the figure.

    fpr: false positive rates (x-axis values).
    tpr: true positive rates (y-axis values), paired with fpr.

    Returns nothing. The 'fivethirtyeight' style is switched on with
    plt.style.use, so it also applies to plots drawn afterwards.
    """
    plt.style.use('fivethirtyeight')

    ax = plt.gca()

    # Model curve first, then the diagonal of a classifier with no predictive power
    ax.plot(fpr, tpr, color="orange", label="ROC")
    ax.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Labels, title and legend
    ax.set_xlabel("False positive rate (fpr)")
    ax.set_ylabel("True positive rate (tpr)")
    ax.set_title("Receiver Operating Characteristic (ROC) Curve")
    ax.legend()
    plt.show()

plot_roc_curve(fpr, tpr)

## Logistic Regression (Taking Into Account Ordinal Features)

In [None]:
# Load the full "bank-additional" data set (semicolon-separated)
bank2 = pd.read_csv('/kaggle/input/bank-marketing-data-set/bank-additional-full.csv', sep=';')

# Replace periods (.) in column names with underscores (_).
# regex=False makes '.' a literal character; without it the result depends on
# the installed pandas version's default for `regex` ('.' as a regular
# expression matches every character).
new_column_names2 = bank2.columns

bank2.columns = new_column_names2.str.replace('.', '_', regex=False)

bank2.head()

In [None]:
numerical_selected_columns

In [None]:
# Nominal categorical columns (no inherent order)
cat_features = [
    'job', 'marital', 'default', 'housing', 'loan', 'contact', 'poutcome',
]

# Explicit category rankings for the ordered columns: list position is the rank.
# 'unknown' is placed at the bottom of the education scale.
education_order = [
    'unknown',
    'illiterate',
    'basic.4y',
    'basic.6y',
    'basic.9y',
    'high.school',
    'professional.course',
    'university.degree',
]

# Calendar order; 'jan' and 'feb' are not listed
month_order = [
    'mar', 'apr', 'may', 'jun', 'jul',
    'aug', 'sep', 'oct', 'nov', 'dec',
]

# Working days only
day_order = ['mon', 'tue', 'wed', 'thu', 'fri']

In [None]:
# Model inputs: numeric columns, nominal columns, and the three columns that
# are encoded ordinally (education, month, day_of_week).
# NOTE(review): num_features is not defined in this section before this line;
# the value comes from an earlier cell — confirm it already excludes 'duration'.
features = num_features + cat_features + ['education', 'month', 'day_of_week']

# Feature matrix and target taken from the freshly loaded bank2 frame
X = bank2[features]
y = bank2["y"]

In [None]:
X.head()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Numeric inputs: every pre-selected numeric column except 'duration'
num_features = numerical_selected_columns.tolist()
num_features.remove('duration')

# Nominal inputs: re-bound unchanged, the list from the earlier cell is used as-is
cat_features = cat_features

# Single-column groups that get a rank-preserving (ordinal) encoding
education_feature = ['education']
month_feature = ['month']
day_feature = ['day_of_week']

# Numeric branch: constant imputation (no explicit fill_value, so the library
# default constant is used), followed by standardisation
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant")),
    ('std_scaler', StandardScaler()),
])

# Nominal branch: one-hot encoding; categories unseen during fit are ignored
cat_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Ordinal branches: each encoder is given its category ranking explicitly
educ_pipeline = Pipeline([
    ("ordinal_educ", OrdinalEncoder(categories=[education_order])),
])
month_pipeline = Pipeline([
    ("ordinal_month", OrdinalEncoder(categories=[month_order])),
])
day_pipeline = Pipeline([
    ("ordinal_day", OrdinalEncoder(categories=[day_order])),
])

# Send every column group through its branch and concatenate the outputs
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
    ("educ", educ_pipeline, education_feature),
    ("month", month_pipeline, month_feature),
    ("day", day_pipeline, day_feature),
])



In [None]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Full estimator: the ordinal-aware preprocessor followed by logistic regression
model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(solver='liblinear')),  # liblinear handles both 'l1' and 'l2' penalties
])

# 80/20 split that keeps the class proportions of y in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

In [None]:
y_train.value_counts(normalize=True)

In [None]:
y_test.value_counts(normalize=True)

In [None]:
X_train.shape, y_train.shape

In [None]:
# Fit and score the model
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Use the model to make predictions on the test data (further evaluation)
y_preds = model.predict(X_test)
y_preds

In [None]:
# Hard class predictions for the held-out rows
y_preds = model.predict(X_test)

# Tabulate actual vs. predicted labels and draw the matrix
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_preds)
plot_conf_mat(conf_mat)

In [None]:
# Classification Report

print(classification_report(y_test, y_preds))

In [None]:
# Evaluate the classifier on validation set
baseline_metrics = evaluate_preds(y_test, y_preds)
baseline_metrics

In [None]:
from sklearn.metrics import roc_curve

# Class-membership probabilities from the fitted pipeline
y_probs = model.predict_proba(X_test)

# Column 1 is used as the score of the positive class
y_probs_positive = y_probs[:, 1]

# ROC operating points, with the label 'yes' treated as the positive outcome
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_probs_positive, pos_label='yes')

# Preview: the first ten positive-class scores and the total count
(y_probs_positive[:10], len(y_probs_positive))

In [None]:
# Create a function for plotting ROC curves
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Draw a ROC curve on the current axes and show the figure.

    fpr: false positive rates (x-axis values).
    tpr: true positive rates (y-axis values), paired with fpr.

    Returns nothing. The 'fivethirtyeight' style is switched on with
    plt.style.use, so it also applies to plots drawn afterwards.
    """
    plt.style.use('fivethirtyeight')

    ax = plt.gca()

    # Model curve first, then the diagonal of a classifier with no predictive power
    ax.plot(fpr, tpr, color="orange", label="ROC")
    ax.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Labels, title and legend
    ax.set_xlabel("False positive rate (fpr)")
    ax.set_ylabel("True positive rate (tpr)")
    ax.set_title("Receiver Operating Characteristic (ROC) Curve")
    ax.legend()
    plt.show()

plot_roc_curve(fpr, tpr)

## Decision Tree Classifier

In [None]:
# Numeric inputs: the pre-selected numeric columns minus 'duration'.
# Computed before X is built so the feature matrix comes from this list and
# not from whatever value an earlier cell left in num_features.
num_features = numerical_selected_columns.tolist()
num_features.remove('duration')

# Create the cleaned up dataset (cat_features comes from an earlier cell)
features = num_features + cat_features
X = bank[features]
y = bank["y"]

# Numeric branch: constant imputation (no explicit fill_value, so the library
# default constant is used), followed by standardisation
num_pipeline = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="constant")),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: one-hot encoding; categories unseen during fit are ignored
cat_pipeline = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its branch and concatenate the results
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

# Preprocessing + decision tree in a single estimator
model = Pipeline(steps=[("preprocessor",preprocessor),
                        ("clf", DecisionTreeClassifier())
                       ])


# 80/20 split that keeps the class proportions of y in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Class proportions in the training split (displayed as the cell output)
pd.Series(y_train).value_counts(normalize=True)

In [None]:
# Train set class weights
pd.Series(y_test).value_counts(normalize=True)

In [None]:
# Fit the pipeline on the training split
model.fit(X_train, y_train)

# Report held-out accuracy. The score has to be printed explicitly: it is not
# the last expression of the cell, so Jupyter would otherwise drop it silently.
print("Test accuracy:", model.score(X_test, y_test))

# Hard class predictions on the test data (used by the evaluation cells below)
y_preds = model.predict(X_test)

# Confusion matrix of actual vs. predicted labels
conf_mat = confusion_matrix(y_test, y_preds)

plot_conf_mat(conf_mat)

In [None]:
# Classification Report

print(classification_report(y_test, y_preds))

In [None]:
from sklearn import metrics

# Predictions wrapped in a Series, then scored against the held-out labels
y_pred = pd.Series(model.predict(X_test))
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
#from sklearn.tree import plot_tree
#
#fig = plt.figure(figsize=(25,20))
#_ = plot_tree(model.named_steps["clf"], feature_names=X_train.columns),  
#                #class_names=class_types, filled=True)

In [None]:
model.named_steps['clf']

In [None]:
#from sklearn import tree
#import graphviz 

#dot_data = tree.export_graphviz(model.named_steps['clf'], out_file=None)#, 
                                #feature_names=X_train.columns,  
                                #class_names=y_train,  
                                #filled=True, rounded=True,  
                                #special_characters=True)

#graph = graphviz.Source(dot_data) 

#graph 

In [None]:
#split = KFold(n_splits=4, shuffle=True, random_state=1234)

## Choose a class of models and specify hyperparameters

The next step is to choose a class of models and specify hyperparameters. This is just for starters and we will see later how we can specify a range of values for hyperparameters and tune the model for optimal performance! We will pick the simple, yet very effective Decision Tree and Random Forest models. We will use scikit-learn to fit the models and evaluate their performance.

## Fit Model and Evaluate Performance

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# Candidate estimators as (label, unfitted estimator) pairs.
# GaussianNB and SVC(gamma='auto') are currently commented out of the comparison.
models = [
    ("LR", LogisticRegression(solver="liblinear")),
    ("KNN", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier(n_jobs=-1, random_state=1234)),
]

In [None]:
# Numeric inputs: the pre-selected numeric columns minus 'duration'.
# Computed before X is built so the feature matrix comes from this list and
# not from whatever value an earlier cell left in num_features.
num_features = numerical_selected_columns.tolist()
num_features.remove('duration')

# Create the cleaned up dataset (cat_features comes from an earlier cell)
features = num_features + cat_features
X = bank[features]
y = bank["y"]

# Numeric branch: constant imputation (no explicit fill_value, so the library
# default constant is used), followed by standardisation
num_pipeline = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="constant")),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: one-hot encoding; categories unseen during fit are ignored
cat_pipeline = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its branch and concatenate the results
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

In [None]:
# Per-model arrays of fold scores, and the matching labels (used by the boxplot)
results = []
names = []


num_folds = 10
seed = 7
scoring = 'accuracy'


# Shuffled K-fold splitter. With a fixed integer random_state every call to
# split() yields the same folds, so one instance is shared by all models.
split = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

for name, model in models:

    # Compose data preprocessing and model into a single pipeline so the
    # preprocessing is re-fitted inside every fold
    steps = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Cross-validated scores for this model
    cv_results = cross_val_score(steps, X, y, cv=split, scoring=scoring, n_jobs=-1)
    results.append(cv_results)
    names.append(name)

    # Summary statistics for THIS model only: they are computed from
    # cv_results, not from `results`, which accumulates the scores of every
    # model evaluated so far.
    min_score = round(np.min(cv_results), 4)
    max_score = round(np.max(cv_results), 4)
    mean_score = round(np.mean(cv_results), 4)
    std_dev = round(np.std(cv_results), 4)
    print(f"[{name}] Cross Validation Accuarcy Score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")

In [None]:
steps.fit(X, y)

In [None]:
#features_1 = features[0:10]
#importances = steps[1].feature_importances_[0:10]
#indices = np.argsort(importances)
#
#plt.title('Feature importances')
#plt.barh(range(len(indices)), importances[indices], color='b', align='center')
#plt.yticks(range(len(indices)), [features[i] for i in indices])
#plt.xlabel('relative importance')
#plt.show()

In [None]:
# Compare Algorithms: one box per model, built from its fold scores
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

### Use StratifiedKFold instead of KFold since the classes are imbalanced

In [None]:
from sklearn.model_selection import StratifiedKFold

# Per-model arrays of fold scores, and the matching labels (used by the boxplot)
results = []
names = []


num_folds = 10
seed = 7
scoring = 'accuracy'


# Stratified K-fold: folds preserve the percentage of samples of each class.
# With a fixed integer random_state every call to split() yields the same
# folds, so one instance is shared by all models.
split = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

for name, model in models:

    # Compose data preprocessing and model into a single pipeline so the
    # preprocessing is re-fitted inside every fold
    steps = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Cross-validated scores for this model
    cv_results = cross_val_score(steps, X, y, cv=split, scoring=scoring, n_jobs=-1)
    results.append(cv_results)
    names.append(name)

    # Summary statistics for THIS model only: they are computed from
    # cv_results, not from `results`, which accumulates the scores of every
    # model evaluated so far.
    min_score = round(np.min(cv_results), 4)
    max_score = round(np.max(cv_results), 4)
    mean_score = round(np.mean(cv_results), 4)
    std_dev = round(np.std(cv_results), 4)
    print(f"[{name}] Cross Validation Accuarcy Score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")

In [None]:
# Compare Algorithms: one box per model, built from its fold scores
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

### Logistic Regression seems to be the best model, so the next step is to search its hyperparameter space for the best settings

## Create Hyperparameter Search Space

In [None]:
from scipy.stats import uniform

# Regularisation penalties to try
penalty = ['l1', 'l2']

# Search grid. The "model__" prefix has to match the name of the classifier
# step in the pipeline; C covers 20 log-spaced values from 1e-4 to 1e4.
hyperparameters = dict(
    model__penalty=penalty,
    model__C=np.logspace(start=-4, stop=4, num=20),
)

#{'penalty':['l1', 'l2'],
#                   '"logistic__C": np.logspace(-4, 4, 4)':C}
#print(C)

In [None]:
# Numeric inputs: the pre-selected numeric columns minus 'duration'.
# Computed before X is built so the feature matrix comes from this list and
# not from whatever value an earlier cell left in num_features.
num_features = numerical_selected_columns.tolist()
num_features.remove('duration')

# Create the cleaned up dataset (cat_features comes from an earlier cell)
features = num_features + cat_features
X = bank[features]
y = bank["y"]

# Numeric branch: constant imputation (no explicit fill_value, so the library
# default constant is used), followed by standardisation
num_pipeline = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="constant")),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: one-hot encoding; categories unseen during fit are ignored
cat_pipeline = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its branch and concatenate the results
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

# Preprocessing + classifier. The step is named "model" so that the
# "model__penalty" / "model__C" keys of the search grid resolve to it.
model = Pipeline(steps=[("preprocessor",preprocessor),
                        ("model", LogisticRegression(solver='liblinear'))   # liblinear handles both 'l1' and 'l2' penalties
                       ])


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# 80/20 split that keeps the class proportions of y in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

# Exhaustive grid search over `hyperparameters`, scored with 5-fold cross validation
clf = GridSearchCV(estimator=model, param_grid=hyperparameters, cv=5, verbose=0)

# Run the search on the training split; fit() returns the fitted search object
best_model = clf.fit(X_train, y_train)

In [None]:
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['model__penalty'])
print('Best C:', best_model.best_estimator_.get_params()['model__C'])

In [None]:
# Hard class predictions for the held-out rows, using the fitted search object
y_preds = best_model.predict(X_test)

# Tabulate actual vs. predicted labels and draw the matrix
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_preds)
plot_conf_mat(conf_mat)

In [None]:
# Classification Report

print(classification_report(y_test, y_preds))

In [None]:
from sklearn.metrics import roc_curve

# Class-membership probabilities from the fitted search object
y_probs = best_model.predict_proba(X_test)

# Column 1 is used as the score of the positive class
y_probs_positive = y_probs[:, 1]

# ROC operating points, with the label 'yes' treated as the positive outcome
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_probs_positive, pos_label='yes')

# Preview: the first ten positive-class scores and the total count
(y_probs_positive[:10], len(y_probs_positive))

In [None]:
# Create a function for plotting ROC curves
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Draw a ROC curve on the current axes and show the figure.

    fpr: false positive rates (x-axis values).
    tpr: true positive rates (y-axis values), paired with fpr.

    Returns nothing. The 'fivethirtyeight' style is switched on with
    plt.style.use, so it also applies to plots drawn afterwards.
    """
    plt.style.use('fivethirtyeight')

    ax = plt.gca()

    # Model curve first, then the diagonal of a classifier with no predictive power
    ax.plot(fpr, tpr, color="orange", label="ROC")
    ax.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Labels, title and legend
    ax.set_xlabel("False positive rate (fpr)")
    ax.set_ylabel("True positive rate (tpr)")
    ax.set_title("Receiver Operating Characteristic (ROC) Curve")
    ax.legend()
    plt.show()

plot_roc_curve(fpr, tpr)

## Create Hyperparameter Grid Search II

In [None]:
#num_features
num_features = ['age', 'campaign', 'pdays', 'previous', 'emp_var_rate', 'cons_price_idx']

In [None]:
bank2.head()

In [None]:
# Define the Dataset
# Nominal categorical columns (no inherent order)
cat_features = [
    'job', 'marital', 'default', 'housing', 'loan', 'contact', 'poutcome',
]

# Explicit category rankings for the ordered columns: list position is the rank
education_order = [
    'unknown',
    'illiterate',
    'basic.4y',
    'basic.6y',
    'basic.9y',
    'high.school',
    'professional.course',
    'university.degree',
]
month_order = [
    'mar', 'apr', 'may', 'jun', 'jul',
    'aug', 'sep', 'oct', 'nov', 'dec',
]
day_order = ['mon', 'tue', 'wed', 'thu', 'fri']

# Model inputs: numeric + nominal columns, then the three ordered columns
features = num_features + cat_features + ['education', 'month', 'day_of_week']

# Feature matrix and target taken from bank2
X = bank2[features]
y = bank2["y"]

In [None]:
X.head()

In [None]:
# Hyperparameter Space
#------------------------
from scipy.stats import uniform

# Regularisation penalties to try
penalty = ['l1', 'l2']

# Inverse regularisation strengths: 20 log-spaced values from 1e-4 to 1e4
C = np.logspace(start=-4, stop=4, num=20)

# Search grid. The "model__" prefix has to match the name of the classifier
# step in the pipeline.
hyperparameters = dict(model__penalty=penalty, model__C=C)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Numeric and nominal inputs: re-bound unchanged, the lists defined in the
# earlier cells of this section are used as-is
num_features = num_features
cat_features = cat_features

# Single-column groups that get a rank-preserving (ordinal) encoding
education_feature = ['education']
month_feature = ['month']
day_feature = ['day_of_week']

# Numeric branch: constant imputation (no explicit fill_value, so the library
# default constant is used), followed by standardisation
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant")),
    ('std_scaler', StandardScaler()),
])

# Nominal branch: one-hot encoding; categories unseen during fit are ignored
cat_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Ordinal branches: each encoder is given its category ranking explicitly
educ_pipeline = Pipeline([
    ("ordinal_educ", OrdinalEncoder(categories=[education_order])),
])
month_pipeline = Pipeline([
    ("ordinal_month", OrdinalEncoder(categories=[month_order])),
])
day_pipeline = Pipeline([
    ("ordinal_day", OrdinalEncoder(categories=[day_order])),
])

# Send every column group through its branch and concatenate the outputs
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
    ("educ", educ_pipeline, education_feature),
    ("month", month_pipeline, month_feature),
    ("day", day_pipeline, day_feature),
])

# Full estimator. The classifier step is named "model" so that the
# "model__..." keys of the search grid resolve to it.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(solver='liblinear')),  # liblinear handles both 'l1' and 'l2' penalties
])


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# 70/30 split that keeps the class proportions of y in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3)

# Exhaustive grid search over `hyperparameters`, scored with 5-fold cross validation
clf = GridSearchCV(estimator=model, param_grid=hyperparameters, cv=5, verbose=0)

# Run the search on the training split; fit() returns the fitted search object
best_model = clf.fit(X_train, y_train)

In [None]:
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['model__penalty'])
print('Best C:', best_model.best_estimator_.get_params()['model__C'])

In [None]:
# Hard class predictions for the held-out rows, using the fitted search object
y_preds = best_model.predict(X_test)

# Tabulate actual vs. predicted labels and draw the matrix
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_preds)
plot_conf_mat(conf_mat)

In [None]:
# Classification Report

print(classification_report(y_test, y_preds))

In [None]:
from sklearn.metrics import roc_curve

# Class-membership probabilities from the fitted search object
y_probs = best_model.predict_proba(X_test)

# Column 1 is used as the score of the positive class
y_probs_positive = y_probs[:, 1]

# ROC operating points, with the label 'yes' treated as the positive outcome
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_probs_positive, pos_label='yes')

# Preview: the first ten positive-class scores and the total count
(y_probs_positive[:10], len(y_probs_positive))

In [None]:
# Create a function for plotting ROC curves
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Draw a ROC curve on the current axes and show the figure.

    fpr: false positive rates (x-axis values).
    tpr: true positive rates (y-axis values), paired with fpr.

    Returns nothing. The 'fivethirtyeight' style is switched on with
    plt.style.use, so it also applies to plots drawn afterwards.
    """
    plt.style.use('fivethirtyeight')

    ax = plt.gca()

    # Model curve first, then the diagonal of a classifier with no predictive power
    ax.plot(fpr, tpr, color="orange", label="ROC")
    ax.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Labels, title and legend
    ax.set_xlabel("False positive rate (fpr)")
    ax.set_ylabel("True positive rate (tpr)")
    ax.set_title("Receiver Operating Characteristic (ROC) Curve")
    ax.legend()
    plt.show()

plot_roc_curve(fpr, tpr)

## Hyperparameter Space For RandomForestClassifier

In [None]:
# Hyperparameter Space
#------------------------
from scipy.stats import uniform

# Random-forest search grid. The "classifier__" prefix targets the pipeline
# step named "classifier": 10..100 trees in steps of 10, and 6..31 candidate
# features per split in steps of 5.
param_grid = [
    dict(
        classifier__n_estimators=[*range(10, 101, 10)],
        classifier__max_features=[*range(6, 32, 5)],
    )
]

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Numeric and nominal inputs: re-bound unchanged, the lists defined in
# earlier cells are used as-is
num_features = num_features
cat_features = cat_features

# Single-column groups that get a rank-preserving (ordinal) encoding
education_feature = ['education']
month_feature = ['month']
day_feature = ['day_of_week']

# Numeric branch: constant imputation (no explicit fill_value, so the library
# default constant is used), followed by standardisation
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant")),
    ('std_scaler', StandardScaler()),
])

# Nominal branch: one-hot encoding; categories unseen during fit are ignored
cat_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Ordinal branches: each encoder is given its category ranking explicitly
educ_pipeline = Pipeline([
    ("ordinal_educ", OrdinalEncoder(categories=[education_order])),
])
month_pipeline = Pipeline([
    ("ordinal_month", OrdinalEncoder(categories=[month_order])),
])
day_pipeline = Pipeline([
    ("ordinal_day", OrdinalEncoder(categories=[day_order])),
])

# Send every column group through its branch and concatenate the outputs
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
    ("educ", educ_pipeline, education_feature),
    ("month", month_pipeline, month_feature),
    ("day", day_pipeline, day_feature),
])

# Full estimator. The step is named "classifier" so that the
# "classifier__..." keys of the search grid resolve to the random forest.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier()),
])

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# 80/20 split that keeps the class proportions of y in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

# Exhaustive grid search over `param_grid`, scored with 5-fold cross validation
clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=0)

# Run the search on the training split; fit() returns the fitted search object
best_model = clf.fit(X_train, y_train)

In [None]:
# View best hyperparameters
print('Best Number of n estimators:', best_model.best_estimator_.get_params()['classifier__n_estimators'])
print('Best Number of maximum features:', best_model.best_estimator_.get_params()['classifier__max_features'])

In [None]:
# Hard class predictions for the held-out rows, using the fitted search object
y_preds = best_model.predict(X_test)

# Tabulate actual vs. predicted labels and draw the matrix
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_preds)
plot_conf_mat(conf_mat)

In [None]:
# Classification Report

print(classification_report(y_test, y_preds))

In [None]:
from sklearn.metrics import roc_curve

# Class-membership probabilities from the fitted search object
y_probs = best_model.predict_proba(X_test)

# Column 1 is used as the score of the positive class
y_probs_positive = y_probs[:, 1]

# ROC operating points, with the label 'yes' treated as the positive outcome
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_probs_positive, pos_label='yes')

# Preview: the first ten positive-class scores and the total count
(y_probs_positive[:10], len(y_probs_positive))


In [None]:
roc_auc = metrics.auc(fpr, tpr)
print(roc_auc)

In [None]:
# Create a function for plotting ROC curves
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Draw a ROC curve on the current axes and show the figure.

    fpr: false positive rates (x-axis values).
    tpr: true positive rates (y-axis values), paired with fpr.

    Returns nothing. The 'fivethirtyeight' style is switched on with
    plt.style.use, so it also applies to plots drawn afterwards.
    """
    plt.style.use('fivethirtyeight')

    ax = plt.gca()

    # Model curve first, then the diagonal of a classifier with no predictive power
    ax.plot(fpr, tpr, color="orange", label="ROC")
    ax.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Labels, title and legend
    ax.set_xlabel("False positive rate (fpr)")
    ax.set_ylabel("True positive rate (tpr)")
    ax.set_title("Receiver Operating Characteristic (ROC) Curve")
    ax.legend()
    plt.show()

plot_roc_curve(fpr, tpr)

**Notes:**

**OneHotEncoder** can be used for transforming your independent variables according to how one-hot-encoding works. It is not really intended to be used on your dependent variables.

The **OrdinalEncoder** can be used if you can order / rank your independent variables, e.g., small, medium, large, very large. This is also not intended to be used on your dependent variables.

The third one, **LabelEncoder**, is used when you want to transform your dependent variables into classes, e.g.:
[1, 1, 2, 6] -> [0, 0, 1, 2]. This is only intended to be used with your LABELS, i.e., your dependent variables, and not your independent variables.

## Explore Another Way to Address the Imbalance of Classes (SMOTE)

In [None]:
# Hyperparameter Space
#------------------------
from scipy.stats import uniform

# Regularisation penalties to try
penalty = ['l1', 'l2']

# Inverse regularisation strengths: 20 log-spaced values from 1e-4 to 1e4
C = np.logspace(start=-4, stop=4, num=20)

# Search space. The "model__" prefix has to match the name of the classifier
# step in the pipeline.
hyperparameters = dict(model__penalty=penalty, model__C=C)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Numeric and nominal inputs: re-bound unchanged, the lists defined in
# earlier cells are used as-is
num_features = num_features
cat_features = cat_features

# Single-column groups that get a rank-preserving (ordinal) encoding
education_feature = ['education']
month_feature = ['month']
day_feature = ['day_of_week']

# Numeric branch: constant imputation (no explicit fill_value, so the library
# default constant is used), followed by standardisation
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant")),
    ('std_scaler', StandardScaler()),
])

# Nominal branch: one-hot encoding; categories unseen during fit are ignored
cat_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Ordinal branches: each encoder is given its category ranking explicitly
educ_pipeline = Pipeline([
    ("ordinal_educ", OrdinalEncoder(categories=[education_order])),
])
month_pipeline = Pipeline([
    ("ordinal_month", OrdinalEncoder(categories=[month_order])),
])
day_pipeline = Pipeline([
    ("ordinal_day", OrdinalEncoder(categories=[day_order])),
])

# Send every column group through its branch and concatenate the outputs
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
    ("educ", educ_pipeline, education_feature),
    ("month", month_pipeline, month_feature),
    ("day", day_pipeline, day_feature),
])

In [None]:
# Build a preprocessing + SMOTE oversampling + logistic-regression pipeline.
# NOTE: the second import below rebinds the name `Pipeline` to imblearn's class,
# so every later cell that calls `Pipeline(...)` gets the imblearn version until
# sklearn's Pipeline is imported again.
import imblearn
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as imbPipeline


# Oversampler for the minority class.
# NOTE(review): no random_state is set, so the synthetic samples differ on every run.
sampler = SMOTE()

# The sampler sits between preprocessing and the classifier, so it receives the
# already-encoded feature matrix.
# NOTE(review): imblearn pipelines are documented to apply samplers only while
# fitting, not when predicting — confirm against the installed version.
smp_pipeline = imbPipeline(steps=[("preprocessor",preprocessor),
                               ("sampler", sampler),
                        ("model", LogisticRegression(solver='liblinear'))   # liblinear handles both 'l1' and 'l2' penalties
                       ])


# 70/30 split that keeps the class proportions of y in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [None]:
# Randomized hyperparameter search scored with repeated stratified K-fold CV
from sklearn.model_selection import RepeatedStratifiedKFold

# 10 stratified folds repeated 3 times, i.e. 30 fits per sampled candidate
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# n_iter is left at its default. random_state is fixed (same seed as the CV
# splitter) so the same candidates are sampled from `hyperparameters` on every run.
# NOTE(review): the SMOTE step of smp_pipeline is built elsewhere without a
# seed, so scores can still vary slightly between runs.
clf = RandomizedSearchCV(smp_pipeline, hyperparameters, cv=cv, verbose=0, random_state=1)

# Run the search on the training split; fit() returns the fitted search object
best_model = clf.fit(X_train, y_train)

In [None]:
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['model__penalty'])
print('Best C:', best_model.best_estimator_.get_params()['model__C'])

In [None]:
# Hard class predictions for the held-out rows, using the fitted search object
y_preds = best_model.predict(X_test)

# Tabulate actual vs. predicted labels and draw the matrix
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_preds)
plot_conf_mat(conf_mat)

In [None]:
# Classification Report

print(classification_report(y_test, y_preds))

In [None]:
from sklearn.metrics import roc_curve

# Class-membership probabilities from the fitted search object
y_probs = best_model.predict_proba(X_test)

# Column 1 is used as the score of the positive class
y_probs_positive = y_probs[:, 1]

# ROC operating points, with the label 'yes' treated as the positive outcome
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_probs_positive, pos_label='yes')

# Preview: the first ten positive-class scores and the total count
(y_probs_positive[:10], len(y_probs_positive))

In [None]:
# Helper for drawing ROC curves
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Draw a ROC curve from a model's false positive rates and true positive rates.

    Parameters
    ----------
    fpr : false positive rates (x-axis values)
    tpr : true positive rates (y-axis values)

    Switches matplotlib to the 'fivethirtyeight' style, draws on the current
    axes and shows the figure. Returns nothing.
    """
    # Apply the style before the axes are fetched (or created)
    plt.style.use('fivethirtyeight')
    axes = plt.gca()

    # The model's curve, plus the diagonal of a classifier with no predictive power
    axes.plot(fpr, tpr, color="orange", label="ROC")
    axes.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Labels, title and legend
    axes.set_xlabel("False positive rate (fpr)")
    axes.set_ylabel("True positive rate (tpr)")
    axes.set_title("Receiver Operating Characteristic (ROC) Curve")
    axes.legend()
    plt.show()

plot_roc_curve(fpr, tpr)

## Add Class weights to Logistic Regression Model

In [None]:
# Machine Learning Imports
#--------------------------------
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score


# Preprocessing
#--------------------------------
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer


# Machine Learning Algorithms
#--------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import ExtraTreesClassifier

# Model Selection
#------------------------------------
from sklearn.model_selection import GridSearchCV


# Metrics
#-------------------------------
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve

In [None]:
# Load the full bank-marketing dataset (semicolon-separated)
bank2 = pd.read_csv('/kaggle/input/bank-marketing-data-set/bank-additional-full.csv', sep=';')

# Replace periods (.) in column names with underscores (_).
# regex=False forces a literal match: as a regular expression '.' matches any
# character, and the default value of `regex` differs between pandas versions.
new_column_names2 = bank2.columns

bank2.columns = new_column_names2.str.replace('.', '_', regex=False)

bank2.head()

In [None]:
# Column names after the period-to-underscore rename
bank2.columns

In [None]:
# Collect every row that exactly repeats an earlier one, then report how many there are
duplicate = bank2.loc[bank2.duplicated()]

print(len(duplicate))

In [None]:
# (rows, columns) of the frame at this point in the notebook
bank2.shape

In [None]:
# Drop exact duplicate rows, then show the (rows, columns) shape that remains
bank2 = bank2.drop_duplicates()
bank2.shape

In [None]:
# Numerical features
num_features = [
    'age', 'campaign', 'pdays', 'previous',
    'emp_var_rate', 'cons_price_idx', 'nr_employed', 'euribor3m',
]

# Nominal categorical features
cat_features = ['job', 'marital', 'default', 'housing', 'loan', 'contact', 'poutcome']

# Explicit category orderings for the three ordered categorical columns
education_order = ['unknown', 'illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'professional.course', 'university.degree']
month_order = ['mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
day_order = ['mon', 'tue', 'wed', 'thu', 'fri']

# Full feature list: numerical + nominal + the three ordered categoricals
features = [*num_features, *cat_features, 'education', 'month', 'day_of_week']

# Feature matrix used for training and testing
X = bank2[features]

# Target: fit a label encoder on the string labels and replace them with integer codes
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(bank2["y"])

In [None]:
# Preview the first rows of the feature matrix
X.head()

In [None]:
# How often each distinct nr_employed value occurs
bank2['nr_employed'].value_counts()

In [None]:
# How often each distinct euribor3m value occurs
bank2['euribor3m'].value_counts()

In [None]:
# How often each distinct emp_var_rate value occurs
bank2['emp_var_rate'].value_counts()

In [None]:
# Data type of every column in bank2
bank2.dtypes

In [None]:
# Hyperparameter Space
#------------------------
from scipy.stats import uniform

# Regularization penalties to try
penalty = ['l1', 'l2']

# Candidate values for C: 20 log-spaced points from 1e-4 to 1e4
C = np.logspace(-4, 4, num=20)

# Search space; the "model__" prefix has to match the name of the classifier step in the pipeline
hyperparameters = dict(model__penalty=penalty, model__C=C)


In [None]:
#bank2[num_features].head()

In [None]:
# Numerical features: impute, then standardize.
# With strategy="constant" the imputer replaces missing values with fill_value.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant")),
    ('std_scaler', StandardScaler()),
])

# Nominal categorical features: one-hot encode
cat_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Ordered categorical features: ordinal-encode each with its explicit category ordering
education_feature = ['education']
educ_pipeline = Pipeline([
    ("ordinal_educ", OrdinalEncoder(categories=[education_order])),
])

month_feature = ['month']
month_pipeline = Pipeline([
    ("ordinal_month", OrdinalEncoder(categories=[month_order])),
])

day_feature = ['day_of_week']
day_pipeline = Pipeline([
    ("ordinal_day", OrdinalEncoder(categories=[day_order])),
])

# Route each group of columns through its own pipeline
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
    ("educ", educ_pipeline, education_feature),
    ("month", month_pipeline, month_feature),
    ("day", day_pipeline, day_feature),
])

In [None]:
# Train/test split: 23% held out, stratified on y so both parts keep the class ratio.
# random_state pins the shuffle so the hold-out set is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.23, stratify=y, random_state=1
)

In [None]:
# Columns present in the training features
X_train.columns

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score



# Preprocessing + logistic regression pipeline, with class_weight='balanced' set on the classifier
model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(solver='liblinear', class_weight='balanced')),
])


In [None]:

num_folds = 10
seed = 7   # NOTE: not used below; the splitter is seeded with 1
scoring = 'roc_auc'


# Evaluation procedure: stratified, shuffled k-fold
split = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1)

# Cross-validate the pipeline on the training data
scores = cross_val_score(model, X_train, y_train, scoring=scoring, cv=split, n_jobs=-1)

# Summarize performance
print('Mean ROC AUC: %.3f' % scores.mean())

In [None]:
# Per-fold ROC AUC scores from the cross-validation above
scores

In [None]:
# Display the training feature frame
X_train

In [None]:
split = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Randomized search over `hyperparameters` with stratified 10-fold cross validation.
# n_iter is left at the RandomizedSearchCV default; random_state pins which
# candidates are drawn, so a re-run samples the same ones.
clf = RandomizedSearchCV(model, hyperparameters, cv=split, verbose=0, random_state=1)


# Fit randomized search
best_model = clf.fit(X_train, y_train)


# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['model__penalty'])
print('Best C:', best_model.best_estimator_.get_params()['model__C'])

In [None]:
# Preprocessing + logistic regression pipeline; class_weight is not fixed here because it is tuned below
model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(solver='liblinear')),
])

# Candidate class weightings, keyed by encoded class label
balance = [
    {0: 100, 1: 1},
    {0: 10, 1: 1},
    {0: 1, 1: 1},
    {0: 1, 1: 10},
    {0: 1, 1: 100},
]

# Search space: penalty x C x class weighting
hyperparameters = dict(
    model__penalty=penalty,
    model__C=C,
    model__class_weight=balance,
)

# define evaluation procedure
split = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Exhaustive grid search over the space above, scored by ROC AUC
clf = GridSearchCV(model, hyperparameters, scoring='roc_auc', cv=split, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X_train, y_train)

# View best hyperparameters
print('Best Penalty:', best_model.best_params_['model__penalty'])
print('Best C:', best_model.best_params_['model__C'])

In [None]:
# Report the best configuration found by the search
print("Best: %f using %s" % (best_model.best_score_, best_model.best_params_))

# Report every evaluated configuration: mean test score, its standard deviation, and the parameters
means, stds, params = (
    best_model.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
def plot_conf_mat(conf_mat):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    conf_mat : array-like
        The confusion matrix to draw, e.g. the output of confusion_matrix().

    Each cell is annotated with its value. Integer matrices are annotated as
    whole numbers; any other dtype keeps heatmap()'s default number format.
    Draws on a new 8x6 figure and returns nothing.
    """
    fig, ax = plt.subplots(figsize=(8,6))
    # heatmap()'s default annotation format ('.2g') abbreviates large counts to
    # scientific notation (e.g. 1.1e+04), so integer counts are formatted with 'd'.
    fmt = 'd' if np.issubdtype(np.asarray(conf_mat).dtype, np.integer) else '.2g'
    ax = sns.heatmap(conf_mat,
                     annot=True,      # Annotate the boxes
                     fmt=fmt,
                     cbar=False,
                     ax=ax)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")


In [None]:
# Predict class labels for the held-out test data with the fitted search object
y_preds = best_model.predict(X_test)

# Build the confusion matrix of true vs. predicted labels
conf_mat = confusion_matrix(y_test, y_preds)

# Draw it with the plot_conf_mat helper
plot_conf_mat(conf_mat) 

In [None]:
# Raw confusion-matrix counts for the same test-set predictions
confusion_matrix(y_test, y_preds)

In [None]:
# Classification report for the test-set predictions (printed so the text table keeps its layout)

print(classification_report(y_test, y_preds))

In [None]:
from sklearn.metrics import roc_curve

# Predicted class probabilities for the test set
y_probs = best_model.predict_proba(X_test)

# Keep the probabilities of the positive class only (second column)
y_probs_positive = y_probs[:, 1]

# Calculate fpr, tpr and thresholds.
# y was label-encoded to integers earlier in this section, so no pos_label is passed here.
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

# Preview the first ten positive-class probabilities and how many there are in total
y_probs_positive[:10], len(y_probs_positive)

In [None]:
# Helper for drawing ROC curves.
# NOTE: this redefines plot_roc_curve with the same behavior as the definition earlier in the notebook.
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Draw a ROC curve from a model's false positive rates and true positive rates.

    Parameters
    ----------
    fpr : false positive rates (x-axis values)
    tpr : true positive rates (y-axis values)

    Switches matplotlib to the 'fivethirtyeight' style, draws on the current
    axes and shows the figure. Returns nothing.
    """
    # Apply the style before the axes are fetched (or created)
    plt.style.use('fivethirtyeight')
    axes = plt.gca()

    # The model's curve, plus the diagonal of a classifier with no predictive power
    axes.plot(fpr, tpr, color="orange", label="ROC")
    axes.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Labels, title and legend
    axes.set_xlabel("False positive rate (fpr)")
    axes.set_ylabel("True positive rate (tpr)")
    axes.set_title("Receiver Operating Characteristic (ROC) Curve")
    axes.legend()
    plt.show()

plot_roc_curve(fpr, tpr)

In [None]:
from sklearn import metrics
# Area under the ROC curve, computed from the fpr/tpr points obtained above
roc_auc = metrics.auc(fpr, tpr)
print(roc_auc)

## References

* [Classification Models a Cautionary Tale](https://towardsdatascience.com/imbalanced-class-sizes-and-classification-models-a-cautionary-tale-part-2-cf371500d1b3)
* [Cost-sensitive-logistic-regression](https://machinelearningmastery.com/cost-sensitive-logistic-regression/)
* [machine-learning-case-study-a-data-driven-approach-to-predict-the-success-of-bank-telemarketing](https://towardsdatascience.com/machine-learning-case-study-a-data-driven-approach-to-predict-the-success-of-bank-telemarketing-20e37d46c31c)
* [Generating Synthetic Classification Data using Scikit](https://towardsdatascience.com/https-medium-com-faizanahemad-generating-synthetic-classification-data-using-scikit-1590c1632922)

In [None]:
import xgboost as xgb

#xgb_cl = xgb.XGBClassifier()

In [None]:
# Numerical features: impute, then standardize.
# With strategy="constant" the imputer replaces missing values with fill_value.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant")),
    ('std_scaler', StandardScaler()),
])

# Nominal categorical features: one-hot encode
cat_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Ordered categorical features: ordinal-encode each with its explicit category ordering
education_feature = ['education']
educ_pipeline = Pipeline([
    ("ordinal_educ", OrdinalEncoder(categories=[education_order])),
])

month_feature = ['month']
month_pipeline = Pipeline([
    ("ordinal_month", OrdinalEncoder(categories=[month_order])),
])

day_feature = ['day_of_week']
day_pipeline = Pipeline([
    ("ordinal_day", OrdinalEncoder(categories=[day_order])),
])

# Route each group of columns through its own pipeline
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
    ("educ", educ_pipeline, education_feature),
    ("month", month_pipeline, month_feature),
    ("day", day_pipeline, day_feature),
])

In [None]:
# Preprocessing + XGBoost pipeline; the classifier is constructed with no arguments
model_xgb = Pipeline([
    ("preprocessor", preprocessor),
    ("xgb_cl", xgb.XGBClassifier()),
])

In [None]:
num_folds = 10
seed = 7   # NOTE: not used below; the splitter is seeded with 1
scoring = 'roc_auc'


# Evaluation procedure: stratified, shuffled k-fold
split = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1)

# Cross-validate the XGBoost pipeline on the training data
scores = cross_val_score(model_xgb, X_train, y_train, scoring=scoring, cv=split, n_jobs=-1)

# Summarize performance
print('Mean ROC AUC: %.3f' % scores.mean())