In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
from sklearn.feature_selection import RFE, RFECV,mutual_info_classif
import numpy as np
from imblearn.under_sampling import NearMiss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import make_scorer

In [4]:
data_frame=pd.read_csv('../data/raw/term-deposit-marketing-2020.csv', encoding = 'unicode_escape', sep=',', header=0)
data_frame.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no


In [5]:
data_frame.describe(include='all')

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
count,40000.0,40000,40000,40000,40000,40000.0,40000,40000,40000,40000.0,40000,40000.0,40000.0,40000
unique,,12,3,4,2,,2,2,3,,11,,,2
top,,blue-collar,married,secondary,no,,yes,no,cellular,,may,,,no
freq,,9383,24386,20993,39191,,24031,33070,24914,,13532,,,37104
mean,40.5446,,,,,1274.27755,,,,16.017225,,254.8243,2.882175,
std,9.641776,,,,,2903.769716,,,,8.278127,,259.366498,3.239051,
min,19.0,,,,,-8019.0,,,,1.0,,0.0,1.0,
25%,33.0,,,,,54.0,,,,8.0,,100.0,1.0,
50%,39.0,,,,,407.0,,,,17.0,,175.0,2.0,
75%,48.0,,,,,1319.0,,,,21.0,,313.0,3.0,


In [6]:
data_frame.replace({'no':0, 'yes':1}, inplace=True)
data_frame['job'].replace({'management':1, 'technician':2, 'entrepreneur':3, 'blue-collar':4, 'unknown':0, 'retired':5, 'admin':6, 'services':7,'self-employed':8, 'unemployed':9, 'housemaid':10, 'student':11}, inplace=True)
data_frame['month'].replace({'may':5, 'jun':6, 'jul':7, 'aug':8, 'oct':10, 'nov':11, 'dec':12, 'jan':1, 'feb':2, 'mar':3, 'apr':4}, inplace=True)
data_frame.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,1,married,tertiary,0,2143,1,0,unknown,5,5,261,1,0
1,44,2,single,secondary,0,29,1,0,unknown,5,5,151,1,0
2,33,3,married,secondary,0,2,1,1,unknown,5,5,76,1,0
3,47,4,married,unknown,0,1506,1,0,unknown,5,5,92,1,0
4,33,0,single,unknown,0,1,0,0,unknown,5,5,198,1,0


In [7]:
encoded_data = pd.get_dummies(data_frame, columns = ['marital', 'contact','education'])
encoded_data.head()

Unnamed: 0,age,job,default,balance,housing,loan,day,month,duration,campaign,...,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone,contact_unknown,education_primary,education_secondary,education_tertiary,education_unknown
0,58,1,0,2143,1,0,5,5,261,1,...,False,True,False,False,False,True,False,False,True,False
1,44,2,0,29,1,0,5,5,151,1,...,False,False,True,False,False,True,False,True,False,False
2,33,3,0,2,1,1,5,5,76,1,...,False,True,False,False,False,True,False,True,False,False
3,47,4,0,1506,1,0,5,5,92,1,...,False,True,False,False,False,True,False,False,False,True
4,33,0,0,1,0,0,5,5,198,1,...,False,False,True,False,False,True,False,False,False,True


In [8]:
term_deposit = len(encoded_data[data_frame['y'] == 1])
no_term_deposit = len(encoded_data[data_frame['y']== 0])
class_distribution_ratio = term_deposit/no_term_deposit
print(encoded_data['y'].value_counts())
print(class_distribution_ratio)

y
0    37104
1     2896
Name: count, dtype: int64
0.07805088400172489


In [19]:
y = encoded_data['y']
features=encoded_data.drop(['y'],axis=1)
undersample = NearMiss(version = 2, n_neighbors = 5)
x,y = undersample.fit_resample(features,y)

In [20]:
X_train,X_test,y_train,y_test=train_test_split(x , y, test_size=0.4,random_state=10)

In [21]:
X_train.head()

Unnamed: 0,age,job,default,balance,housing,loan,day,month,duration,campaign,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone,contact_unknown,education_primary,education_secondary,education_tertiary,education_unknown
3524,51,4,0,201,1,0,17,7,582,2,False,True,False,True,False,False,True,False,False,False
134,26,1,0,14004,1,0,23,7,167,1,False,False,True,True,False,False,False,False,True,False
2621,49,7,0,4300,1,1,8,5,134,1,False,True,False,True,False,False,False,True,False,False
997,58,5,0,7560,0,0,12,8,364,2,False,True,False,True,False,False,False,True,False,False
2652,55,7,0,4263,0,0,14,8,93,2,False,True,False,True,False,False,False,True,False,False


In [22]:
classifiers = {
        'LogisticRegression': {
            'model': LogisticRegression(),
            'params': {
                'C': [0.1, 1],
                'penalty': ['l2']
            }
        },
        'KNeighborsClassifier': {
            'model': KNeighborsClassifier(),
            'params': {
                'n_neighbors': [i for i in range(1, 5)],
                'weights': ['uniform', 'distance'],
                'p': [1, 2],
                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                'leaf_size': [i for i in range(1, 3)]
            }
        },
        'DecisionTreeClassifier': {
            'model': DecisionTreeClassifier(),
            'params': {
                'max_depth': [10, 20],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            }
        },
        'GaussianNB': {
            'model': GaussianNB(),
            'params': {}
        },
        'RandomForestClassifier': {
            'model': RandomForestClassifier(),
            'params': {
                'n_estimators': [10, 50],
                'max_depth': [10, 20],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            }
        },
    }

In [23]:
max_score = 0
best_model = None
for classifier_name, classifier in classifiers.items():
    model = classifier['model']
    params = classifier['params']
    kf = KFold(n_splits=5)
    scoring = make_scorer(f1_score, zero_division=1)
    grid_search = GridSearchCV(model, params, cv=kf, n_jobs=-1, scoring=scoring, verbose=0, return_train_score=True)
    grid_search.fit(X_train, y_train)

    if grid_search.best_score_ > max_score:
        max_score = grid_search.best_score_
        best_model = grid_search.best_estimator_

print(f"Estimator: {best_model}\nScore: {max_score:.3f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Estimator: RandomForestClassifier(max_depth=20, min_samples_split=5, n_estimators=50)
Score: 0.952
