# Bài kiểm tra số 3
Sử dụng tập dữ liệu trên Kaggle để xây dựng mô hình phân lớp bằng ANN có độ chính xác trên 80%
## Yêu cầu:
- Viết comment cụ thể trong file Code
- Nộp thêm 1 file txt giải thích kết quả
## Sinh viên thực hiện:
Họ tên: Lý Hồng Phát

MSSV: 20110692

# Nhập dữ liệu

In [130]:
# import thư viện
from __future__ import print_function
import numpy as np
from sklearn.model_selection import train_test_split # for splitting data
from sklearn import metrics # để đánh giá mô hình

import pandas as pd


In [131]:
# Đọc dữ liệu từ csv
customerDF = pd.read_csv('./adult.csv')

# Xử lý dữ liệu

In [132]:
# Xem 5 dòng đầu tiên
customerDF.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [133]:
# Thay các giá trị ? thành nan

customerDF.replace('?',np.nan,inplace=True)
customerDF.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [134]:
# tổng số lượng các giá trị rỗng trong từng cột
customerDF.isnull().sum()

age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [135]:
# Danh sách các features có nhiều giá trị rỗng
null_cols=['workclass','occupation','native-country']

In [136]:
# function để xử lý null trong các cột phân loại theo chế độ lớp của nó
def handle_null(df,cols):
    for col in cols:
        df[col] = df.groupby("income")[col].transform(lambda x: x.fillna(x.mode()[0]))
        # Transform dữ liệu từng cột được groupby theo cột income
        # fill các giá trị null bởi giá trị xuất hiện thường xuyên nhất
    return df

In [137]:
# Chạy function xử lý null
customerDF = handle_null(customerDF,null_cols)
customerDF.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [138]:
# Xóa các hàng trùng lặp
customerDF = customerDF.drop_duplicates()

In [139]:
# import thư viện tiền xử lý dữ liệu
from sklearn import preprocessing

# Chọn các cột có kiểu dữ liệu số
numeric_cols = customerDF.select_dtypes(include=[np.number]).columns

# Chuẩn hóa dữ liệu các dòng dữ liệu số
std = preprocessing.StandardScaler()
customerDF[numeric_cols] = std.fit_transform(customerDF[numeric_cols])

In [140]:
# Khởi tạo các hàm mapping và thay thế để chuyển các giá trị chữ thành số

def replace_with_num(column, legend):
    customerDF[column] = [legend[val] for val in customerDF[column]]
    
def build_map(column):
    return { value: i for i, value in zip(range(len(column),-1,-1), column) }

In [141]:
# excute hàm mapping and replace với từng cột và giá trị tùy chỉnh cụ thể

workclass_map = build_map(customerDF["workclass"])
education_map = build_map(customerDF["education"])
marital_status_map = build_map(customerDF["marital-status"])

occupation_map = build_map(customerDF["occupation"])
relationship_map = build_map(customerDF["relationship"])

race_map = build_map(customerDF["race"])
native_country_map = build_map(customerDF["native-country"])
gender_map = {"Male": 0, "Female": 1}
income_map = {"<=50K": 0, ">50K": 1}
income_feature_map = {0: "<=50K", 1: ">50K"}


replace_with_num("workclass", workclass_map)
replace_with_num("education", education_map)
replace_with_num("marital-status", marital_status_map)

replace_with_num("occupation", occupation_map)
replace_with_num("relationship", relationship_map)

replace_with_num("race", race_map)
replace_with_num("native-country", native_country_map)
replace_with_num("gender", gender_map)
replace_with_num("income", income_map)

In [142]:
# Xem thử dữ liệu
customerDF

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,-0.995929,2,0.351583,39,-1.197969,2,4,2,20,0,-0.144884,-0.217251,-0.034367,1,0
1,-0.047609,2,-0.945440,1,-0.419771,1,131,4,1,0,-0.144884,-0.217251,0.772558,1,0
2,-0.777086,16,1.394491,5,0.747525,1,6,4,1,0,-0.144884,-0.217251,-0.034367,1,1
3,0.390077,2,-0.277850,6,-0.030672,1,4,4,20,0,0.886245,-0.217251,-0.034367,1,1
4,-1.506563,2,-0.815887,6,-0.030672,2,2,2,1,1,-0.144884,-0.217251,-0.841291,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,-0.850034,2,0.640362,5,0.747525,1,5,1,1,1,-0.144884,-0.217251,-0.195751,1,0
48838,0.098286,2,-0.334176,1,-0.419771,1,4,4,1,0,-0.144884,-0.217251,-0.034367,1,1
48839,1.411345,2,-0.357505,1,-0.419771,3,2,3,1,1,-0.144884,-0.217251,-0.034367,1,0
48840,-1.214772,2,0.111926,1,-0.419771,2,2,2,1,0,-0.144884,-0.217251,-1.648216,1,0


In [143]:
# numpy array danh sách các nhãn

labels = customerDF["income"].to_numpy()
labels

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [144]:
# Tập X

xDF = customerDF.drop(["income"], axis=1)
xDF

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,-0.995929,2,0.351583,39,-1.197969,2,4,2,20,0,-0.144884,-0.217251,-0.034367,1
1,-0.047609,2,-0.945440,1,-0.419771,1,131,4,1,0,-0.144884,-0.217251,0.772558,1
2,-0.777086,16,1.394491,5,0.747525,1,6,4,1,0,-0.144884,-0.217251,-0.034367,1
3,0.390077,2,-0.277850,6,-0.030672,1,4,4,20,0,0.886245,-0.217251,-0.034367,1
4,-1.506563,2,-0.815887,6,-0.030672,2,2,2,1,1,-0.144884,-0.217251,-0.841291,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,-0.850034,2,0.640362,5,0.747525,1,5,1,1,1,-0.144884,-0.217251,-0.195751,1
48838,0.098286,2,-0.334176,1,-0.419771,1,4,4,1,0,-0.144884,-0.217251,-0.034367,1
48839,1.411345,2,-0.357505,1,-0.419771,3,2,3,1,1,-0.144884,-0.217251,-0.034367,1
48840,-1.214772,2,0.111926,1,-0.419771,2,2,2,1,0,-0.144884,-0.217251,-1.648216,1


In [145]:
# Chia tập dữ liệu thành train - test

X_train, X_test, y_train, y_test = train_test_split(xDF, labels, random_state=10)

# Xây dựng mô hình và phân lớp

In [146]:
# Import thư viện 
from sklearn.neural_network import MLPClassifier

# Tạo model
alpha = 1e-1 # regulation parameter
model = MLPClassifier(alpha=alpha, hidden_layer_sizes=10, solver='lbfgs')

# Fit mô hình với tập dữ liệu
model.fit(X_train, y_train)

# Phân lớp
y_pred = model.predict(X_test)

# Độ chính xác mô hình
model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.8254631906869979

In [147]:
# Xem tập đầu ra
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# Đánh giá

In [148]:
# Độ chính xác của mô hình trên tập test là 82.54%

print("accuracy:", metrics.accuracy_score(y_test, y_pred))
# print("R2:", metrics.r2_score(y_test, y_pred))
# print("F1_score:", metrics.f1_score(y_test, y_pred))

accuracy: 0.8254631906869979
