In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler

from typing import List, Dict, Tuple, Any
import itertools


In [3]:
df = pd.read_csv("Dataset/bank-additional/bank-additional-full.csv", sep=';')

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [5]:
month_replace_dict = {'jan' : 1, 'feb' : 2, 'mar' : 3, 'apr' : 4, 'may' : 5, 'jun' : 6, 
                      'jul' : 7, 'aug' : 8, 'sep' : 9,  'oct' : 10, 'nov' : 11, 'dec' : 12}
day_replace_dict = {'mon' : 1, 'tue' : 2, 'wed' : 3, 'thu' : 4, 'fri' : 5}

In [6]:
df.replace('unknown', None, inplace=True)
df.dropna(axis=0, how='any', inplace=True)

In [7]:
# encoding ordinal data
df.month.replace(month_replace_dict, None, inplace=True)
df.day_of_week.replace(day_replace_dict, None, inplace=True)

education_category = ['illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'professional.course', 'university.degree']
boolean_category = ['no', 'yes']
encoder_ordinal = OrdinalEncoder(dtype=np.int64)
encoder_ordinal.categories_ = [education_category, boolean_category, boolean_category, boolean_category]
df[['education', 'default', 'housing', 'loan']] = encoder_ordinal.transform(df[['education', 'default', 'housing', 'loan']])


# encoding categorial data
encoder_categorical = OrdinalEncoder(dtype=np.int64)
df[['job', 'marital', 'contact', 'poutcome']] = encoder_categorical.fit_transform(df[['job', 'marital', 'contact', 'poutcome']])

In [8]:
df.drop_duplicates(inplace=True)
print(df.shape)
df.head()

(41174, 21)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,3,1,1,0,0,0,1,5,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,no
1,57,7,1,4,0,0,0,1,5,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,no
2,37,7,1,4,0,1,0,1,5,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,no
3,40,0,1,2,0,0,0,1,5,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,no
4,56,7,1,4,0,0,1,1,5,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,no


In [9]:
print(f'There are {len(df[df.y == "yes"])} yes in y')
print(f'There are {len(df[df.y == "no"])} no in y')

There are 4639 yes in y
There are 36535 no in y


In [10]:
def find_correlated_params(wide_df: pd.DataFrame) -> List[str]:
    corr_df = wide_df.corr()
    param_corr_d = {row: [column for column in corr_df.columns if column != row and abs(corr_df.loc[row, column]) >= 0.9]
                    for row in corr_df.index}

    for_del = set()
    param_corr_d = {key: value for key, value in param_corr_d.items() if len(value) > 0}
    while len(param_corr_d) > 0:
        maximum, index = 0, list(param_corr_d.keys())[0]
        for row, columns in param_corr_d.items():
            number = len(set(columns) - for_del)
            if number > maximum:
                maximum = number
                index = row
        for_del.add(index)
        param_corr_d = {key: list(set(value) - for_del) for key, value in param_corr_d.items() if
                        key not in for_del if len(list(set(value) - for_del)) > 0}
    return list(for_del)


In [11]:
y_column = 'y'

In [12]:
for_del_params = find_correlated_params(df)
for_del_params

['euribor3m', 'emp.var.rate']

In [13]:
x_columns = list(set(df.columns) - set(for_del_params+[y_column]))

In [14]:
X, Y = df[x_columns], df[y_column]

In [15]:
scaler = StandardScaler()
scaler_column = list({'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'} -set(for_del_params))
X.loc[:, scaler_column] = scaler.fit_transform(X.loc[:, scaler_column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [16]:
X.head()

Unnamed: 0,previous,contact,poutcome,loan,month,education,pdays,marital,cons.price.idx,duration,campaign,job,cons.conf.idx,default,nr.employed,day_of_week,housing,age
0,0,1,1,0,5,1,999,1,0.722627,261,1,3,0.886609,0,0.331719,1,0,56
1,0,1,1,0,5,4,999,1,0.722627,149,1,7,0.886609,0,0.331719,1,0,57
2,0,1,1,0,5,4,999,1,0.722627,226,1,7,0.886609,0,0.331719,1,1,37
3,0,1,1,0,5,2,999,1,0.722627,151,1,0,0.886609,0,0.331719,1,0,40
4,0,1,1,1,5,4,999,1,0.722627,307,1,7,0.886609,0,0.331719,1,0,56


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=4)

In [18]:
# upsampling

# combined_df = pd.concat([X_train, y_train], axis=1)
# df_yes = combined_df[combined_df[y_column] == 'yes']
# df_no = combined_df[combined_df[y_column] == 'no']

# df_yes_upsampled = resample(df_yes,
#                           replace=True, 
#                           n_samples=len(df_no),
#                           random_state=27) 
# upsampled = pd.concat([df_yes_upsampled, df_no])
# X_train = upsampled.drop(y_column, axis=1)
# y_train = upsampled[y_column]

# del combined_df, df_yes, df_no, upsampled 

In [19]:
# downsampling

combined_df = pd.concat([X_train, y_train], axis=1)
df_yes = combined_df[combined_df[y_column] == 'yes']
df_no = combined_df[combined_df[y_column] == 'no']

df_no_downsampled = resample(df_no,
                          replace=False, 
                          n_samples=len(df_yes),
                          random_state=27) 
upsampled = pd.concat([df_yes, df_no_downsampled])
X_train = upsampled.drop(y_column, axis=1)
y_train = upsampled[y_column]

del combined_df, df_yes, df_no, upsampled 

In [20]:
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
scores = cross_val_score(knn, X, Y, cv=10, scoring='accuracy', n_jobs=6)
scoreKNN = scores.mean()

scoreKNN

0.8853146544203702

In [21]:
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8151350301146298
              precision    recall  f1-score   support

          no       0.97      0.82      0.89      9142
         yes       0.35      0.78      0.48      1152

    accuracy                           0.82     10294
   macro avg       0.66      0.80      0.69     10294
weighted avg       0.90      0.82      0.84     10294



In [22]:
logreg = LogisticRegression()
scores = cross_val_score(logreg, X, Y, cv=10, scoring='accuracy', n_jobs=6)
scoreLogReg = scores.mean()

scoreLogReg

0.8618996701979483

In [23]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8501068583640956
              precision    recall  f1-score   support

          no       0.98      0.85      0.91      9142
         yes       0.42      0.84      0.56      1152

    accuracy                           0.85     10294
   macro avg       0.70      0.85      0.73     10294
weighted avg       0.91      0.85      0.87     10294



In [24]:
gausian = GaussianNB()
scores = cross_val_score(gausian, X, Y, cv=10, scoring='accuracy', n_jobs=6)
scoreGausian = scores.mean()

scoreGausian

0.809700718528925

In [25]:
gausian.fit(X_train, y_train)
y_pred = gausian.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8291237614144161
              precision    recall  f1-score   support

          no       0.96      0.84      0.90      9142
         yes       0.36      0.70      0.48      1152

    accuracy                           0.83     10294
   macro avg       0.66      0.77      0.69     10294
weighted avg       0.89      0.83      0.85     10294



In [26]:
decision_tree = tree.DecisionTreeClassifier()
scores = cross_val_score(decision_tree, X, Y, cv=10, scoring='accuracy', n_jobs=6)
scoreTree = scores.mean()

scoreTree

0.6904924003495145

In [27]:
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8315523605984069
              precision    recall  f1-score   support

          no       0.97      0.83      0.90      9142
         yes       0.38      0.82      0.52      1152

    accuracy                           0.83     10294
   macro avg       0.68      0.83      0.71     10294
weighted avg       0.91      0.83      0.86     10294



In [28]:
def find_best_arguments(classifier, arguments: Dict[str, Any], X_text: pd.DataFrame, y_test:pd.Series) -> Tuple[Dict[str, Any],float]: 
    maximum_score = 0
    best_combo = None
    names = list(arguments.keys())
    for combo in itertools.product(*[arguments[name] for name in names ]):
        combo_dict = dict(zip(names, list(combo)))

        model = classifier(**combo_dict)
        model.fit(X_test, y_test)
        y_pred = model.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        if score > maximum_score:
            best_combo = combo_dict
            maximum_score = score
    return best_combo, maximum_score

In [29]:
# def find_best_arguments(classifier, arguments: Dict[str, Any], X: pd.DataFrame, Y:pd.Series) -> Tuple[Dict[str, Any],float]: 
#     maximum_score = 0
#     best_combo = None
#     names = list(arguments.keys())
#     for combo in itertools.product(*[arguments[name] for name in names ]):
#         combo_dict = dict(zip(names, list(combo)))

#         model = classifier(**combo_dict)
#         scores = cross_val_score(model, X, Y, cv=10, scoring='accuracy', n_jobs=6)
#         score = scores.mean()
#         if score > maximum_score:
#             best_combo = combo_dict
#             maximum_score = score
#     return best_combo, maximum_score

In [30]:
%%time
arguments = {
            'n_neighbors':[3, 5, 10],
            'weights':['uniform', 'distance'],
            'algorithm': ['ball_tree', 'kd_tree', 'brute'],
            'leaf_size': [20, 30, 50],
            'p': [1, 2, 3],
            'n_jobs': [-1]}

best_combo_knn, best_score_knn = find_best_arguments(KNeighborsClassifier, arguments, X_test, y_test)
# best_combo_knn, best_score_knn = find_best_arguments(KNeighborsClassifier, arguments, X, Y)
best_score_knn

Wall time: 6min 47s


1.0

In [31]:
best_combo_knn

{'n_neighbors': 3,
 'weights': 'distance',
 'algorithm': 'ball_tree',
 'leaf_size': 20,
 'p': 1,
 'n_jobs': -1}

In [32]:
model = KNeighborsClassifier(**best_combo_knn)
scores = cross_val_score(model, X, Y, cv=10, scoring='accuracy', n_jobs=6)
score = scores.mean()
score

0.8629934305016821

In [33]:
%%time
arguments = {
#     'penalty': ['l1', 'l2', 'elasticnet', 'none'],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
            'n_jobs':[-1]}


best_combo_lreg, best_score_lreg = find_best_arguments(LogisticRegression, arguments, X_test, y_test)
# best_combo_lreg, best_score_lreg = find_best_arguments(LogisticRegression, arguments, X, Y)
best_score_lreg



Wall time: 3.08 s




0.909364678453468

In [34]:
best_combo_lreg

{'solver': 'newton-cg', 'n_jobs': -1}

In [35]:
model = LogisticRegression(**best_combo_lreg)
scores = cross_val_score(model, X, Y, cv=10, scoring='accuracy', n_jobs=6)
score = scores.mean()
score

0.8521353022442277