# 1. Setup

In [1]:
# Import all required libraries
import pandas as pd
from sklearn.svm import SVC
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


np.random.seed(1)

# 2. Load the Data 

In [2]:
import os
print(os.getcwd())

C:\Users\Gopi Chand\OneDrive - University of South Florida\Desktop\DSP\W4 DT


In [3]:
#Loding the data file
bank_df = pd.read_csv('UniversalBank.csv')
bank_df.head(20)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
5,6,37,13,29,92121,4,0.4,2,155,0,0,0,1,0
6,7,53,27,72,91711,2,1.5,2,0,0,0,0,1,0
7,8,50,24,22,93943,1,0.3,3,0,0,0,0,0,1
8,9,35,10,81,90089,3,0.6,2,104,0,0,0,1,0
9,10,34,9,180,93023,1,8.9,3,0,1,0,0,0,0


# 3. Explore the given data

In [4]:
#see the column name
bank_df.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [5]:
#Coverting all column name to Upper alphabtes and replacing space with "_"
bank_df.columns = [s.strip().upper().replace(' ', '_') for s in bank_df.columns] 
bank_df.columns

Index(['ID', 'AGE', 'EXPERIENCE', 'INCOME', 'ZIP_CODE', 'FAMILY', 'CCAVG',
       'EDUCATION', 'MORTGAGE', 'PERSONAL_LOAN', 'SECURITIES_ACCOUNT',
       'CD_ACCOUNT', 'ONLINE', 'CREDITCARD'],
      dtype='object')

In [6]:
#Some of the variables aren't predictors; therefore we drop them
# drop ID, and Zip Code as predictors
bank_df = bank_df.drop(columns=['ID', 'ZIP_CODE'])

In [7]:
# check for if there is any missing values in remaining columns
bank_df.isnull().sum()

AGE                   0
EXPERIENCE            0
INCOME                0
FAMILY                0
CCAVG                 0
EDUCATION             0
MORTGAGE              0
PERSONAL_LOAN         0
SECURITIES_ACCOUNT    0
CD_ACCOUNT            0
ONLINE                0
CREDITCARD            0
dtype: int64

In [8]:
#checking data type of each cloumns
bank_df.dtypes

AGE                     int64
EXPERIENCE              int64
INCOME                  int64
FAMILY                  int64
CCAVG                 float64
EDUCATION               int64
MORTGAGE                int64
PERSONAL_LOAN           int64
SECURITIES_ACCOUNT      int64
CD_ACCOUNT              int64
ONLINE                  int64
CREDITCARD              int64
dtype: object

only CCAVG column is float and remaining all are integers, we need to change fAMILY,EDUCATION,PERSONAL_LOAN, SECURITIES_ACCOUNT,CD_ACCOUNT,ONLINE and CREDITCARD as category data type

In [9]:
bank_df.FAMILY = bank_df.FAMILY.astype('category')
bank_df.SECURITIES_ACCOUNT = bank_df.SECURITIES_ACCOUNT.astype('category')
bank_df.EDUCATION = bank_df.EDUCATION.astype('category')
bank_df.CD_ACCOUNT = bank_df.CD_ACCOUNT.astype('category')
bank_df.ONLINE = bank_df.ONLINE.astype('category')
bank_df.CD_ACCOUNT = bank_df.CD_ACCOUNT.astype('category')
bank_df.CREDITCARD = bank_df.CREDITCARD.astype('category')
bank_df.PERSONAL_LOAN = bank_df.PERSONAL_LOAN.astype('category')

In [10]:
bank_df.dtypes

AGE                      int64
EXPERIENCE               int64
INCOME                   int64
FAMILY                category
CCAVG                  float64
EDUCATION             category
MORTGAGE                 int64
PERSONAL_LOAN         category
SECURITIES_ACCOUNT    category
CD_ACCOUNT            category
ONLINE                category
CREDITCARD            category
dtype: object

In [11]:
bank_df.describe()

Unnamed: 0,AGE,EXPERIENCE,INCOME,CCAVG,MORTGAGE
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,45.3384,20.1046,73.7742,1.937938,56.4988
std,11.463166,11.467954,46.033729,1.747659,101.713802
min,23.0,-3.0,8.0,0.0,0.0
25%,35.0,10.0,39.0,0.7,0.0
50%,45.0,20.0,64.0,1.5,0.0
75%,55.0,30.0,98.0,2.5,101.0
max,67.0,43.0,224.0,10.0,635.0


In [12]:
bank_df.CD_ACCOUNT.value_counts()

0    4698
1     302
Name: CD_ACCOUNT, dtype: int64

By seeing above we can say that our data is imblanced

# 4. Split data (train/test)

In [13]:
# split the data into validation and training set
df_train, df_test = train_test_split(bank_df, test_size=0.3, random_state=1)

# to reduce repetition in later code, create variables to represent the columns
# that are our predictors and target
target = 'CD_ACCOUNT'
predictors = list(bank_df.columns)
predictors.remove(target)

In [14]:
print(predictors)

['AGE', 'EXPERIENCE', 'INCOME', 'FAMILY', 'CCAVG', 'EDUCATION', 'MORTGAGE', 'PERSONAL_LOAN', 'SECURITIES_ACCOUNT', 'ONLINE', 'CREDITCARD']


In [15]:
print(target)

CD_ACCOUNT


# 5. Standardize numeric values
Now, let's create a common scale between the numberic columns by standardizing each numeric column

In [16]:
# create a standard scaler and fit it to the training set of predictors
scaler = preprocessing.StandardScaler()
scaler.fit(df_train[predictors])

# Transform the predictors of training and test sets
train_predictors = scaler.transform(df_train[predictors]) # train_predictors is not a numpy array
train_target = df_train[target] # train_target is now a series object

validation_predictors = scaler.transform(df_test[predictors]) # validation_target is now a series object
validation_target = df_test[target] # validation_target is now a series object

In [17]:
X_train = df_train[predictors]
y_train = df_train[target]
X_test = df_test[predictors]
y_test = df_test[target]

#Address any data imbalances

In [18]:
#We will utilize an oversamplying technique to address any necessary date balancing.
from imblearn.over_sampling import RandomOverSampler
target = 'CD_ACCOUNT'
predictors = list(bank_df.columns)
predictors.remove(target)
RanOverSample=RandomOverSampler(sampling_strategy=1)
pred,tar = RanOverSample.fit_resample(df_train[predictors],df_train[target])
tar.value_counts()

0    3281
1    3281
Name: CD_ACCOUNT, dtype: int64

# 6.0 Model the data
First, we will create a dataframe to hold all the results of our models.

In [19]:
#performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

# 6.1 Fit a LogisticRegression model with Random Search

In [20]:
score_measure = "recall"
kfolds = 10
param_grid = { 'solver': [ 'liblinear', 'saga'],
                      'penalty': ['l1', 'l2'], # NOTE: 'elasticnet' is only supported by 'saga' solver
                      'C': [100, 10, 1.0, 0.1, 0.01],
                      # number of iterations to converge (sometimes the default is not enough - and sometimes, it will never converge)
                     }
logi_reg = LogisticRegression()
rand_search = RandomizedSearchCV(estimator = logi_reg, param_distributions=param_grid, cv=kfolds, n_iter=100,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_logi_reg_rand = rand_search.best_estimator_



Fitting 10 folds for each of 20 candidates, totalling 200 fits
The best recall score is 0.6662337662337662
... with parameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 100}


In [21]:
y_pred = rand_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6024096385542169
************************************
Accuracy Score:   0.978
Precision Score:  1.0
F1 Score:         0.7518796992481204
************************************
Confusion Matrix: [[1417    0]
 [  33   50]]


# 6.2 Fit a LogisticRegression model with Grid Search

In [22]:
score_measure = "recall"
kfolds = 10

penalty= rand_search.best_params_['penalty']
solver =rand_search.best_params_['solver']

param_grid = {
    'C': [100, 10, 1.0, 0.1, 0.01],
    'penalty': [penalty],
    'solver': [solver]
}

logi_reg = LogisticRegression()
grid_search = GridSearchCV(estimator = logi_reg, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_linear = grid_search.best_estimator_

Fitting 10 folds for each of 5 candidates, totalling 50 fits
The best recall score is 0.6662337662337662
... with parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}


In [23]:
y_pred = grid_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6024096385542169
************************************
Accuracy Score:   0.978
Precision Score:  1.0
F1 Score:         0.7518796992481204
************************************
Confusion Matrix: [[1417    0]
 [  33   50]]


# 6.1 Fit a SVM classification model using linear kernal with Random Search

In [24]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'C': np.arange(5,15)
}

svm_linear_model = SVC(kernel="linear")
rand_search = RandomizedSearchCV(estimator = svm_linear_model, param_distributions=param_grid, cv=kfolds, n_iter=15,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_SVM_linear = rand_search.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits




The best recall score is 0.6662790697674419
... with parameters: {'C': 100}


In [25]:
y_pred = rand_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6024096385542169
************************************
Accuracy Score:   0.978
Precision Score:  1.0
F1 Score:         0.7518796992481204
************************************
Confusion Matrix: [[1417    0]
 [  33   50]]


# 6.2 Fit a SVM classification model using linear kernal with Grid Search

In [26]:
score_measure = "recall"
kfolds = 2

C = rand_search.best_params_['C']

param_grid = {
    'C': [C+1,C,C-1]
}

svm_linear_model = SVC(kernel="linear")
grid_search = GridSearchCV(estimator = svm_linear_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_linear = grid_search.best_estimator_

Fitting 2 folds for each of 3 candidates, totalling 6 fits
The best recall score is 0.6665971643035863
... with parameters: {'C': 110}


In [27]:
y_pred = grid_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6024096385542169
************************************
Accuracy Score:   0.978
Precision Score:  1.0
F1 Score:         0.7518796992481204
************************************
Confusion Matrix: [[1417    0]
 [  33   50]]


# # 6.5 Fit a SVM classification model using Polynomial kernal with Random Search

In [None]:
score_measure = "recall"
kfolds = 5

param_rand = {
    'C': np.arange(5,15),
    'degree': [3, 4,5],
    'gamma': ['scale', 'auto'],
    'coef0': np.arange(1,5)
}

svm_poly_model = SVC(kernel="poly")
rand_search = RandomizedSearchCV(estimator = svm_poly_model, param_distributions=param_rand, cv=kfolds, n_iter=50,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train,np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_SVM_poly = rand_search.best_estimator_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
y_pred = rand_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

# # 6.5 Fit a SVM classification model using Polynomial kernal with Grid Search

In [None]:
score_measure = "recall"
kfolds = 5

degree = rand_search.best_params_['degree']
gamma = rand_search.best_params_['gamma']
coef0 = rand_search.best_params_['coef0']
C = rand_search.best_params_['C']
param_grid = {
    'C': [C-1,C,C+1],
    'degree': np.arange(degree-2,degree+2),
    'gamma': [gamma],
    'coef0': np.arange(coef0-2,coef0+2),
}

svm_poly_model = SVC(kernel="poly")
grid_search = GridSearchCV(estimator = svm_poly_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_poly = grid_search.best_estimator_

In [None]:
y_pred = grid_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

# 6.6 Fit a Decision Tree Classifier model with Random Search

In [None]:
# Criterion used to guide data splits
criterion = ['gini', 'entropy']

# Maximum number of levels in tree. If None, then nodes are expanded until all leaves are pure or until all 
# leaves contain less than min_samples_split samples.
# default = None
max_depth = [int(x) for x in np.linspace(1, 40000, 50)]
max_depth.append(None)

# Minimum number of samples required to split a node
# default is 2
min_samples_split = [int(x) for x in np.linspace(2, 5000, 50)]

# Minimum number of samples required at each leaf node
# default = 1 
min_samples_leaf = [int(x) for x in np.linspace(1, 10000, 50)]

# max_leaf_nodes  - Grow trees with max_leaf_nodes in best-first fashion.
# If None then unlimited number of leaf nodes.
# default=None 
max_leaf_nodes = [int(x) for x in np.linspace(2, len(y_test), 50)]
max_leaf_nodes.append(None)

# min_impurity_decrease - A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
# default=0.0
min_impurity_decrease = [x for x in np.arange(0.0, 0.01, 0.0001).round(5)]

# Create the random grid
param_grid_random = { 'criterion': criterion,
                      'max_depth': max_depth,
                      'min_samples_split': min_samples_split,
                      'min_samples_leaf' : min_samples_leaf,
                      'max_leaf_nodes' : max_leaf_nodes,
                      'min_impurity_decrease' : min_impurity_decrease,
                     }



In [None]:
dtree_default = DecisionTreeClassifier()

best_random_search_model = RandomizedSearchCV(
        estimator=DecisionTreeClassifier(), 
        scoring='recall', 
        param_distributions=param_grid_random, 
        n_iter = 300, 
        cv=10, 
        verbose=1, 
        n_jobs = -1
    )
_ = best_random_search_model.fit(X_train, np.ravel(y_train))

In [None]:
random_search_best_params = best_random_search_model.best_params_
print('Best parameters found: ', random_search_best_params)

In [None]:
y_pred = best_random_search_model.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

In [None]:
#The best parameters found using RandomizedSearchCV were:
random_search_best_params

Let's now use these current best parameters as a starting point for a more refined grid search. We'll use the same parameters as before, but we'll use a much smaller range of values for each parameter.

# 6.7 Fit a Decision Tree Classifier model with Grid Search

In [None]:
plus_minus = 10 
increment = 2

param_grid = { 'min_samples_split': [x for x in range(random_search_best_params['min_samples_split']-plus_minus, random_search_best_params['min_samples_split']+plus_minus,2) if x >= 2],       
              'min_samples_leaf': [x for x in range(random_search_best_params['min_samples_leaf']-plus_minus , random_search_best_params['min_samples_leaf']+plus_minus,2) if x > 0],
              'min_impurity_decrease': [x for x in np.arange(random_search_best_params['min_impurity_decrease']-0.001, random_search_best_params['min_impurity_decrease']+0.001,.0001).round(5) if x >= 0.000],
              'max_leaf_nodes':[x for x in range(random_search_best_params['max_leaf_nodes']-plus_minus , random_search_best_params['max_leaf_nodes']+plus_minus, 2) if x > 1],  
              'max_depth': [x for x in range(random_search_best_params['max_depth']-plus_minus , random_search_best_params['max_depth']+plus_minus, 2) if x > 1],
              'criterion': [random_search_best_params['criterion']]
              }

best_grid_search_model = GridSearchCV(estimator=DecisionTreeClassifier(), 
                                    scoring='recall', param_grid=param_grid, cv=5, verbose=1,  n_jobs = -1)
_ = best_grid_search_model.fit(X_train, y_train)

In [None]:
print('Best parameters found: ', best_grid_search_model.best_params_)

In [None]:
y_pred = best_grid_search_model.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

In [None]:
print("************************************")
print("************************************")
print("************************************")
print("*******Enter the Recall score of all models*****************************")