# Importing libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import time
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# 讀取檔案

In [2]:
training_data = pd.read_csv('adult.data', sep=", ", header=None, engine='python')
test_data = pd.read_csv('adult.test', sep=", ", header=None, engine='python', skiprows=1)
training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
training_data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']
test_data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']
feature_cols = ['age','workclass','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','native-country','income']
training_data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## Data Cleaning

In [4]:
## education跟education-num數量一樣所以取education-num即可，故刪除education
training_data = training_data.drop(columns=['education'])
test_data = test_data.drop(columns=['education'])

## fnlwgt 是ID，故刪除
training_data = training_data.drop(columns=['fnlwgt'])
test_data = test_data.drop(columns=['fnlwgt'])

## get_dummies

In [5]:
convert = {"income" :{"<=50K":0, ">50K":1},"sex":{"Male":1,"Female":0}}
training_data = training_data.replace(convert)
convert = {"income" :{"<=50K.":0, ">50K.":1},"sex":{"Male":1,"Female":0}}
test_data = test_data.replace(convert)
training_data

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,12,Married-civ-spouse,Tech-support,Wife,White,0,0,0,38,United-States,0
32557,40,Private,9,Married-civ-spouse,Machine-op-inspct,Husband,White,1,0,0,40,United-States,1
32558,58,Private,9,Widowed,Adm-clerical,Unmarried,White,0,0,0,40,United-States,0
32559,22,Private,9,Never-married,Adm-clerical,Own-child,White,1,0,0,20,United-States,0


In [6]:
from sklearn.preprocessing import MinMaxScaler
# numerical_columns
num_columns = ['age','education-num','capital-gain','capital-loss']
scaler = MinMaxScaler()

training_data_scaled = pd.DataFrame(scaler.fit_transform(training_data[num_columns]), columns=num_columns)
training_data[num_columns] = training_data_scaled

test_data_scaled = pd.DataFrame(scaler.transform(test_data[num_columns]), columns=num_columns)
test_data[num_columns] = test_data_scaled

training_data.describe()

Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.295639,0.605379,0.669205,0.010777,0.020042,40.437456,0.24081
std,0.186855,0.171515,0.470506,0.073854,0.092507,12.347429,0.427581
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.150685,0.533333,0.0,0.0,0.0,40.0,0.0
50%,0.273973,0.6,1.0,0.0,0.0,40.0,0.0
75%,0.424658,0.733333,1.0,0.0,0.0,45.0,0.0
max,1.0,1.0,1.0,1.0,1.0,99.0,1.0


In [7]:
training_data.replace('?', 'other', inplace=True)
test_data.replace('?', 'other', inplace=True)

In [8]:
training_data = pd.get_dummies(training_data)
test_data = pd.get_dummies(test_data)
feature_cols = training_data.columns
for column in training_data.columns:
    if column not in test_data.columns:
        test_data[column] = 0
        break

In [9]:
X = training_data.drop('hours-per-week',axis=1)
y = training_data['hours-per-week']
Xt = test_data.drop('hours-per-week',axis=1)
yt = test_data['hours-per-week']
X.columns = Xt.columns

## Grid Search

In [11]:
from sklearn.model_selection import GridSearchCV
# # 初始模型 (官方默認)
# model = XGBRegressor(booster='gbtree',
#                     objective= 'reg:linear',
#                     eval_metric='rmse',
#                     gamma = 0.1,
#                     min_child_weight= 1.1, 
#                     max_depth= 5,
#                     subsample= 0.8,
#                     colsample_bytree= 0.8,
#                     tree_method= 'exact',
#                     learning_rate=0.1,
#                     n_estimators=100,
#                     nthread=4,
#                     scale_pos_weight=1,
#                     seed=27)

model = XGBRegressor()
param_grid = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

grid = GridSearchCV(model, param_grid, scoring='r2')
grid.fit(X,y)
pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

Unnamed: 0,mean_test_score,std_test_score,params
0,0.276385,0.018553,"{'colsample_bytree': 0.7, 'learning_rate': 0.0..."
1,0.275977,0.018496,"{'colsample_bytree': 0.7, 'learning_rate': 0.0..."
2,0.273514,0.017587,"{'colsample_bytree': 0.7, 'learning_rate': 0.0..."
3,0.273128,0.01958,"{'colsample_bytree': 0.7, 'learning_rate': 0.0..."
4,0.270011,0.018333,"{'colsample_bytree': 0.7, 'learning_rate': 0.0..."
5,0.265049,0.016901,"{'colsample_bytree': 0.7, 'learning_rate': 0.0..."
6,0.269542,0.019943,"{'colsample_bytree': 0.7, 'learning_rate': 0.0..."
7,0.264771,0.018771,"{'colsample_bytree': 0.7, 'learning_rate': 0.0..."
8,0.254101,0.018112,"{'colsample_bytree': 0.7, 'learning_rate': 0.0..."


In [12]:
print(grid.best_score_)
print(grid.best_params_)
params = grid.best_params_

0.2763848067913053
{'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}


## MAPE、RMSE、R$^2$Score

In [13]:
from sklearn.metrics import mean_absolute_percentage_error
model = XGBRegressor()
start_time = time.time()
model.fit(X, y)
print(f'Spend time: {time.time()-start_time}')
y_pred =  model.predict(Xt)
print(f'MAPE: {mean_absolute_percentage_error(yt,y_pred)}')
rmse = np.sqrt(metrics.mean_squared_error(yt,y_pred))
print(f'RMSE: {rmse}')
print(f'r2_score : {r2_score(yt,y_pred)}')

Spend time: 0.4990239143371582
MAPE: 0.32595092933186154
RMSE: 11.24092665634507
r2_score : 0.1885753268566719


## best MAPE、RMSE、R$^2$Score (use GridSearch)

In [14]:
from sklearn.metrics import mean_absolute_percentage_error
params = grid.best_params_
model = XGBRegressor(**params)
start_time = time.time()
model.fit(X, y)
print(f'Spend time: {time.time()-start_time}')
y_pred =  model.predict(Xt)
print(f'MAPE: {mean_absolute_percentage_error(yt,y_pred)}')
rmse = np.sqrt(metrics.mean_squared_error(yt,y_pred))
print(f'RMSE: {rmse}')
print(f'r2_score : {r2_score(yt,y_pred)}')

Spend time: 2.0160951614379883
MAPE: 0.28970323072890997
RMSE: 10.671748547983833
r2_score : 0.2686670376344005
