In [125]:
from __future__ import print_function
import numpy as np
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split # for splitting data
from sklearn import metrics

import pandas as pd

In [126]:
# Đọc dữ liệu từ csv
customerDF = pd.read_csv('./adult.csv')

In [127]:
customerDF.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [128]:
# Thay các giá trị ? thành nan

customerDF.replace('?',np.nan,inplace=True)
customerDF.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [129]:
# Số lượng các giá trị rỗng

customerDF.isnull().sum()

age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [130]:
# Danh sách các features có nhiều giá trị rỗng
null_cols=['workclass','occupation','native-country']

In [131]:
# #function để xử lý null trong các cột phân loại theo chế độ lớp của nó
def handle_null(df,cols):
    for col in cols:
        df[col] = df.groupby("income")[col].transform(lambda x: x.fillna(x.mode()[0]))
    return df

In [132]:
customerDF = handle_null(customerDF,null_cols)
customerDF.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [133]:
# Xóa các hàng trùng lặp
customerDF = customerDF.drop_duplicates()

In [134]:
from sklearn import preprocessing
numeric_cols = customerDF.select_dtypes(include=[np.number]).columns

std = preprocessing.StandardScaler()
customerDF[numeric_cols] = std.fit_transform(customerDF[numeric_cols])

In [135]:
def replace_with_num(column, legend):
    customerDF[column] = [legend[val] for val in customerDF[column]]
    
def build_map(column):
    return { value: i for i, value in zip(range(len(column),-1,-1), column) }

In [136]:
# replace các giá trị object thành numeric

workclass_map = build_map(customerDF["workclass"])
education_map = build_map(customerDF["education"])
marital_status_map = build_map(customerDF["marital-status"])

occupation_map = build_map(customerDF["occupation"])
relationship_map = build_map(customerDF["relationship"])

race_map = build_map(customerDF["race"])
native_country_map = build_map(customerDF["native-country"])
gender_map = {"Male": 0, "Female": 1}
income_map = {"<=50K": 0, ">50K": 1}
income_feature_map = {0: "<=50K", 1: ">50K"}


replace_with_num("workclass", workclass_map)
replace_with_num("education", education_map)
replace_with_num("marital-status", marital_status_map)

replace_with_num("occupation", occupation_map)
replace_with_num("relationship", relationship_map)

replace_with_num("race", race_map)
replace_with_num("native-country", native_country_map)
replace_with_num("gender", gender_map)
replace_with_num("income", income_map)

In [137]:
customerDF

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,-0.995929,2,0.351583,39,-1.197969,2,4,2,20,0,-0.144884,-0.217251,-0.034367,1,0
1,-0.047609,2,-0.945440,1,-0.419771,1,131,4,1,0,-0.144884,-0.217251,0.772558,1,0
2,-0.777086,16,1.394491,5,0.747525,1,6,4,1,0,-0.144884,-0.217251,-0.034367,1,1
3,0.390077,2,-0.277850,6,-0.030672,1,4,4,20,0,0.886245,-0.217251,-0.034367,1,1
4,-1.506563,2,-0.815887,6,-0.030672,2,2,2,1,1,-0.144884,-0.217251,-0.841291,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,-0.850034,2,0.640362,5,0.747525,1,5,1,1,1,-0.144884,-0.217251,-0.195751,1,0
48838,0.098286,2,-0.334176,1,-0.419771,1,4,4,1,0,-0.144884,-0.217251,-0.034367,1,1
48839,1.411345,2,-0.357505,1,-0.419771,3,2,3,1,1,-0.144884,-0.217251,-0.034367,1,0
48840,-1.214772,2,0.111926,1,-0.419771,2,2,2,1,0,-0.144884,-0.217251,-1.648216,1,0


In [138]:
# Danh sách các nhãn

labels = customerDF["income"].to_numpy()
labels

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [139]:
# Tập X

xDF = customerDF.drop(["income"], axis=1)
xDF

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,-0.995929,2,0.351583,39,-1.197969,2,4,2,20,0,-0.144884,-0.217251,-0.034367,1
1,-0.047609,2,-0.945440,1,-0.419771,1,131,4,1,0,-0.144884,-0.217251,0.772558,1
2,-0.777086,16,1.394491,5,0.747525,1,6,4,1,0,-0.144884,-0.217251,-0.034367,1
3,0.390077,2,-0.277850,6,-0.030672,1,4,4,20,0,0.886245,-0.217251,-0.034367,1
4,-1.506563,2,-0.815887,6,-0.030672,2,2,2,1,1,-0.144884,-0.217251,-0.841291,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,-0.850034,2,0.640362,5,0.747525,1,5,1,1,1,-0.144884,-0.217251,-0.195751,1
48838,0.098286,2,-0.334176,1,-0.419771,1,4,4,1,0,-0.144884,-0.217251,-0.034367,1
48839,1.411345,2,-0.357505,1,-0.419771,3,2,3,1,1,-0.144884,-0.217251,-0.034367,1
48840,-1.214772,2,0.111926,1,-0.419771,2,2,2,1,0,-0.144884,-0.217251,-1.648216,1


In [140]:
# Chia tập dữ liệu thành train - test

X_train, X_test, y_train, y_test = train_test_split(xDF, labels, random_state=10)

In [141]:
# Xây dựng mô hình
model = naive_bayes.GaussianNB()
model.fit(X_train, y_train)

# Phân lớp
y_pred = model.predict(X_test)
# accuracy = metrics.accuracy_score(y_test, y_pred)
model.score(X_test, y_test)

0.8005410723069356

In [142]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [143]:
# Độ chính xác của mô hình trên tập test là 80.05%

print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2:", metrics.r2_score(y_test, y_pred))
print("F1_score:", metrics.f1_score(y_test, y_pred))

accuracy: 0.8005410723069356
R2: -0.07648182072220333
F1_score: 0.4670317634173055
