In [1]:
import numpy as np 
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from time import time

In [2]:
data = pd.read_csv('banking.csv')

data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1


In [3]:
# convert field of 'month'
dict_month = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
              'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
dict_day = {'sun': 1, 'mon': 2, 'tue': 3,
            'wed': 4, 'thu': 5, 'fri': 6, 'sat': 7}


# convert field of month
data['month'] = data['month'].map(dict_month)
# convert field of dayOfweek
data['day_of_week'] = data['day_of_week'].map(dict_day)

# conver binary fields
# default :
data.default.replace({'no': 0, 'yes': 1}, inplace=True)
# housing :
data.housing.replace({'no': 0, 'yes': 1}, inplace=True)
# loan :
data.loan.replace({'no': 0, 'yes': 1}, inplace=True)


# convert categories field by one host coding
marital_dummies = pd.get_dummies(data['marital'], prefix='marital')
marital_dummies.drop('marital_divorced', axis=1, inplace=True)
data = pd.concat([data, marital_dummies], axis=1)

job_dummies = pd.get_dummies(data['job'], prefix='job')
job_dummies.drop('job_unknown', axis=1, inplace=True)
data = pd.concat([data, job_dummies], axis=1)

education_dummies = pd.get_dummies(data['education'], prefix='education')
education_dummies.drop('education_unknown', axis=1, inplace=True)
data = pd.concat([data, education_dummies], axis=1)

contact_dummies = pd.get_dummies(data['contact'], prefix='contact')
# contact_dummies.drop('contact_unknown', axis=1, inplace=True)
data = pd.concat([data, contact_dummies], axis=1)

poutcome_dummies = pd.get_dummies(data['poutcome'], prefix='poutcome')
# poutcome_dummies.drop('poutcome_unknown', axis=1, inplace=True)
data = pd.concat([data, poutcome_dummies], axis=1)

data['pdays'] = data['pdays'].apply(lambda row: 0 if row == -1 else 1)
data.drop(['job', 'education', 'marital', 'contact',
          'poutcome'], axis=1, inplace=True)

data.head()

Unnamed: 0,age,default,housing,loan,month,day_of_week,duration,campaign,pdays,previous,...,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,contact_cellular,contact_telephone,poutcome_failure,poutcome_nonexistent,poutcome_success
0,44,unknown,1,0,8,5,210,1,1,0,...,False,False,False,False,False,True,False,False,True,False
1,53,0,0,0,11,6,138,1,1,0,...,False,False,False,False,False,True,False,False,True,False
2,28,0,1,0,6,5,339,3,1,2,...,False,False,False,False,True,True,False,False,False,True
3,39,0,0,0,4,6,185,2,1,0,...,False,True,False,False,False,True,False,False,True,False
4,55,0,1,0,8,6,137,1,1,1,...,False,False,False,False,False,True,False,False,False,True


In [4]:
data = data[data.apply(lambda row: not any(cell == 'unknown' for cell in row), axis=1)]
data.replace(True, 1, inplace=True)
data.replace(False, 0, inplace=True)
data.head()

Unnamed: 0,age,default,housing,loan,month,day_of_week,duration,campaign,pdays,previous,...,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,contact_cellular,contact_telephone,poutcome_failure,poutcome_nonexistent,poutcome_success
1,53,0,0,0,11,6,138,1,1,0,...,0,0,0,0,0,1,0,0,1,0
2,28,0,1,0,6,5,339,3,1,2,...,0,0,0,0,1,1,0,0,0,1
3,39,0,0,0,4,6,185,2,1,0,...,0,1,0,0,0,1,0,0,1,0
4,55,0,1,0,8,6,137,1,1,1,...,0,0,0,0,0,1,0,0,0,1
5,30,0,1,0,7,3,68,8,1,0,...,0,0,0,0,0,1,0,0,1,0


In [5]:
# Hàm sigmoid
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Gradient descent
def logistic_regression(X, y, learning_rate, iterations = 10000, tol=1e-4):
    m, n = X.shape
    theta = np.zeros(n)
    for i in range(iterations):
        z = np.dot(X, theta)
        h = sigmoid(z)
        gradient = np.dot(X.T, (h - y)) / m
        if i % n == 0:
            if np.linalg.norm(theta - theta + learning_rate * gradient) < tol:
                return theta - learning_rate * gradient
        theta -= learning_rate * gradient
        
    return theta

In [6]:
X = data.drop("y", axis=1)
y = data['y']

In [7]:
# Thêm cột bias vào ma trận X
X = np.column_stack((np.ones(len(X)), X))

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
length = int(data.shape[0] * 0.8 // 1)
x_train, x_test = X[:length], X[length:]
y_train, y_test = y[:length], y[length:]

In [8]:
eta = .05
iterations = 10 ** 6

In [9]:
# Thực hiện gradient descent để tìm các tham số theta
w = logistic_regression(x_train, y_train, eta, iterations)

  return 1 / (1 + np.exp(-z))


In [None]:
# Dự đoán trên tập kiểm tra
z = np.dot(x_test, w)
h = sigmoid(z)
y_pred = np.where(h >= 0.75, 1, 0)

  return 1 / (1 + np.exp(-z))


In [None]:
# Đánh giá mô hình
accuracy = np.mean(y_pred == y_test)
print("Độ chính xác:", accuracy)

Độ chính xác: 0.5675463399308828


In [None]:
model = LogisticRegression(max_iter=iterations)

logis_time = time()
model.fit(x_train, y_train)
sk_y_pred = model.predict(x_test)
logis_end = time()

# Đánh giá mô hình
accuracy = np.mean(sk_y_pred == y_test)
print("Độ chính xác:", accuracy)
print("Thời gian chạy", logis_end - logis_time)

Độ chính xác: 0.9038642789820923
Độ chính xác: 0.9038642789820923
Thời gian chạy -4.054955720901489


In [None]:
df = pd.DataFrame({'Real label': y_test, 'My solution': y_pred, 'Sklearn': sk_y_pred})

print(df)