In [3]:
import numpy as np
import pandas as pd
from sklearn import linear_model, metrics

In [4]:
df = pd.read_csv("banking.csv")
print(df.columns)
df


Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx',
       'cons_conf_idx', 'euribor3m', 'nr_employed', 'y'],
      dtype='object')


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.200,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,59,retired,married,high.school,unknown,no,yes,telephone,jun,thu,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.866,5228.1,0
41184,31,housemaid,married,basic.4y,unknown,no,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.860,5191.0,0
41185,42,admin.,single,university.degree,unknown,yes,yes,telephone,may,wed,...,3,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
41186,48,technician,married,professional.course,no,no,yes,telephone,oct,tue,...,2,999,0,nonexistent,-3.4,92.431,-26.9,0.742,5017.5,0


## Data preprocessing

In [5]:
month_names = {
    "jan":1,
    "feb":2,
    "mar":3,
    "apr":4,
    "may":5,
    "jun":6,
    "jul":7,
    "aug":8,
    "sep":9,
    "oct":10,
    "nov":11,
    "dec":12
}

date_name = {
    "mon":2,
    "tue":3,
    "wed":4,
    "thu":5,
    "fri":6,
    "sat":7,
    "sun":8
}

In [6]:
marital = pd.get_dummies(df["marital"], prefix="marital",dtype=int)
job = pd.get_dummies(df["job"], prefix="job",dtype=int)
education = pd.get_dummies(df["education"], prefix="education",dtype=int)
contact = pd.get_dummies(df["contact"], prefix="contact",dtype=int)
poutcome = pd.get_dummies(df["poutcome"], prefix="poutcome", dtype=int)
default = pd.get_dummies(df["default"], prefix="default", dtype=int)

housing = (df["housing"] == "yes").astype(int)
loan = (df["loan"] == "yes").astype(int)

mapped_month = df["month"].map(month_names)
mapped_dow = df["day_of_week"].map(date_name)

In [7]:
data = df.drop(columns=["marital", "job", "education", "contact", "poutcome", "housing", "loan", "month", "day_of_week", "default"])
data



Unnamed: 0,age,duration,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,210,1,999,0,1.4,93.444,-36.1,4.963,5228.1,0
1,53,138,1,999,0,-0.1,93.200,-42.0,4.021,5195.8,0
2,28,339,3,6,2,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,185,2,999,0,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,137,1,3,1,-2.9,92.201,-31.4,0.869,5076.2,1
...,...,...,...,...,...,...,...,...,...,...,...
41183,59,222,1,999,0,1.4,94.465,-41.8,4.866,5228.1,0
41184,31,196,2,999,0,1.1,93.994,-36.4,4.860,5191.0,0
41185,42,62,3,999,0,1.1,93.994,-36.4,4.857,5191.0,0
41186,48,200,2,999,0,-3.4,92.431,-26.9,0.742,5017.5,0


In [8]:
data = pd.concat([job, marital, education, contact, poutcome, default, housing, loan, mapped_dow, mapped_month, data], axis=1)
data

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,duration,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,0,1,0,0,0,0,0,0,0,0,...,210,1,999,0,1.4,93.444,-36.1,4.963,5228.1,0
1,0,0,0,0,0,0,0,0,0,1,...,138,1,999,0,-0.1,93.200,-42.0,4.021,5195.8,0
2,0,0,0,0,1,0,0,0,0,0,...,339,3,6,2,-1.7,94.055,-39.8,0.729,4991.6,1
3,0,0,0,0,0,0,0,1,0,0,...,185,2,999,0,-1.8,93.075,-47.1,1.405,5099.1,0
4,0,0,0,0,0,1,0,0,0,0,...,137,1,3,1,-2.9,92.201,-31.4,0.869,5076.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,0,0,0,0,0,1,0,0,0,0,...,222,1,999,0,1.4,94.465,-41.8,4.866,5228.1,0
41184,0,0,0,1,0,0,0,0,0,0,...,196,2,999,0,1.1,93.994,-36.4,4.860,5191.0,0
41185,1,0,0,0,0,0,0,0,0,0,...,62,3,999,0,1.1,93.994,-36.4,4.857,5191.0,0
41186,0,0,0,0,0,0,0,0,0,1,...,200,2,999,0,-3.4,92.431,-26.9,0.742,5017.5,0


---

## Split train and test

In [9]:
np.random.seed(190203)

indexes = np.arange(data.shape[0])
train_indexes = np.random.choice(indexes, size=32949)
test_indexes = np.delete(indexes, train_indexes, axis=0)
print(np.sort(test_indexes)[:10])
print(np.sort(train_indexes)[:10])


[ 0  1  2  3  4  5  6  8  9 11]
[ 7 10 14 16 18 18 19 19 20 21]


In [10]:
train = data.iloc[train_indexes]
test = data.iloc[test_indexes]

X_train = train.iloc[:, :-1]
Y_train = train.iloc[:, -1:].to_numpy().flatten()

X_test = test.iloc[:, :-1]
Y_test = test.iloc[:, -1:].to_numpy().flatten()

## Logistic!!!!

In [28]:
import time

start_time = time.time()

logistic_regressor = linear_model.LogisticRegression(penalty=None)
logistic_regressor.fit(X_train, Y_train)

end_time = time.time()
runtime = end_time - start_time

print("Runtime:", runtime, "seconds")

print(logistic_regressor.coef_)

Runtime: 1.1253690719604492 seconds
[[ 3.96088571e-02 -7.24839062e-02 -7.75900870e-03 -7.68100551e-03
  -2.17672298e-03  3.08103786e-02 -1.99592681e-03 -2.34090795e-02
   1.90012014e-02  2.68585824e-02  2.07107300e-03  4.93905965e-04
   6.18376186e-04 -5.29859915e-02  5.55505672e-02  1.55396868e-04
  -1.94460404e-02 -1.64838018e-02 -3.96449264e-02 -1.57808943e-02
   7.72515711e-04  1.48369175e-02  6.36676595e-02  1.54169190e-02
   7.58413821e-02 -7.25030334e-02 -6.36084038e-02  5.25796062e-02
   1.43671463e-02  6.82360770e-02 -6.48411929e-02 -5.65354401e-05
  -4.57766298e-03 -9.65595206e-03 -5.32757334e-03 -3.60940651e-02
   7.90774450e-03  4.49699457e-03 -3.12201003e-02 -1.38366034e-03
  -5.24941570e-02 -2.12038186e-01  3.49945383e-01  4.97293294e-02
  -2.57909824e-01 -6.32078642e-03]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
Y_pred = logistic_regressor.predict(X_test)
Y_pred

array([0, 0, 1, ..., 0, 0, 0])

In [13]:
acc = metrics.accuracy_score(Y_test, Y_pred)
precision = metrics.precision_score(Y_test, Y_pred)
recall = metrics.recall_score(Y_test, Y_pred)
f1_score = metrics.f1_score(Y_test, Y_pred)

print(acc, precision, recall, f1_score)

0.9106419739754873 0.6680064308681672 0.40086830680173663 0.5010551703346398


## Naive Bayes!!!!

In [27]:
from sklearn.naive_bayes import GaussianNB
import time

start_time = time.time()

naive_bayes = GaussianNB()
naive_bayes.fit(X_train, Y_train)

end_time = time.time()
runtime = end_time - start_time

print("Runtime:", runtime, "seconds")

y_pred_naive_bayes = naive_bayes.predict(X_test)

acc = metrics.accuracy_score(Y_test, y_pred_naive_bayes)
precision = metrics.precision_score(Y_test, y_pred_naive_bayes)
recall = metrics.recall_score(Y_test, y_pred_naive_bayes)
f1_score = metrics.f1_score(Y_test, y_pred_naive_bayes)

print("ACC", acc,"PRE", precision,"RECALL", recall,"F1", f1_score)

Runtime: 0.08183503150939941 seconds
ACC 0.8363479293774634 PRE 0.3544957472660996 RECALL 0.5629522431259045 F1 0.4350419384902144


Đánh giá: thời gian chạy của Logistic Regression cao hơn nhưng độ chính xác cũng cao hơn