# Importing libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import time
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor

# 讀取檔案

In [2]:
training_data = pd.read_csv('adult.data', sep=", ", header=None, engine='python')
test_data = pd.read_csv('adult.test', sep=", ", header=None, engine='python', skiprows=1)
training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
training_data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']
test_data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']
feature_cols = ['age','workclass','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','native-country','income']
training_data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## 正規化

In [4]:
convert = {"income" :{"<=50K":0, ">50K":1}}
training_data = training_data.replace(convert)
convert = {"income" :{"<=50K.":0, ">50K.":1}}
test_data = test_data.replace(convert)

In [5]:
from sklearn.preprocessing import MinMaxScaler
# numerical_columns
num_columns = ['age','education-num','capital-gain','capital-loss']
scaler = MinMaxScaler()

training_data_scaled = pd.DataFrame(scaler.fit_transform(training_data[num_columns]), columns=num_columns)
training_data[num_columns] = training_data_scaled

test_data_scaled = pd.DataFrame(scaler.transform(test_data[num_columns]), columns=num_columns)
test_data[num_columns] = test_data_scaled

training_data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.295639,189778.4,0.605379,0.010777,0.020042,40.437456,0.24081
std,0.186855,105550.0,0.171515,0.073854,0.092507,12.347429,0.427581
min,0.0,12285.0,0.0,0.0,0.0,1.0,0.0
25%,0.150685,117827.0,0.533333,0.0,0.0,40.0,0.0
50%,0.273973,178356.0,0.6,0.0,0.0,40.0,0.0
75%,0.424658,237051.0,0.733333,0.0,0.0,45.0,0.0
max,1.0,1484705.0,1.0,1.0,1.0,99.0,1.0


In [6]:
training_data.replace('?', 'other', inplace=True)
test_data.replace('?', 'other', inplace=True)

## get_dummies

In [7]:
training_data = pd.get_dummies(training_data)
test_data = pd.get_dummies(test_data)
feature_cols = training_data.columns
for column in training_data.columns:
    if column not in test_data.columns:
        test_data[column] = 0
        break

In [8]:
X = training_data.drop('hours-per-week',axis=1)
y = training_data['hours-per-week']
Xt = test_data.drop('hours-per-week',axis=1)
yt = test_data['hours-per-week']
X.columns = Xt.columns

## Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV
knn = KNeighborsRegressor(n_neighbors=5)
k_range = list(range(5,31))
param_grid = dict(n_neighbors=k_range)
print(param_grid)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='r2', return_train_score=False)
grid.fit(X, y)
pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

{'n_neighbors': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]}


Unnamed: 0,mean_test_score,std_test_score,params
0,-0.159895,0.012918,{'n_neighbors': 5}
1,-0.135608,0.009873,{'n_neighbors': 6}
2,-0.115079,0.00862,{'n_neighbors': 7}
3,-0.101368,0.006746,{'n_neighbors': 8}
4,-0.090569,0.008441,{'n_neighbors': 9}
5,-0.083301,0.009289,{'n_neighbors': 10}
6,-0.07613,0.008114,{'n_neighbors': 11}
7,-0.068985,0.008985,{'n_neighbors': 12}
8,-0.062792,0.008163,{'n_neighbors': 13}
9,-0.05955,0.007425,{'n_neighbors': 14}


In [11]:
print(grid.best_score_)
print(grid.best_params_)
params = grid.best_params_

-0.02822160857579723
{'n_neighbors': 30}


## MAPE、RMSE、R$^2$Score (original data)

In [12]:
from sklearn.metrics import mean_absolute_percentage_error
model = KNeighborsRegressor()
start_time = time.time()
model.fit(X, y)
print(f'Spend time: {time.time()-start_time}')
y_pred =  model.predict(Xt)
print(f'MAPE: {mean_absolute_percentage_error(yt,y_pred)}')
rmse = np.sqrt(metrics.mean_squared_error(yt,y_pred))
print(f'RMSE: {rmse}')
print(f'r2_score : {r2_score(yt,y_pred)}')

Spend time: 0.026996374130249023
MAPE: 0.4019430524956173
RMSE: 13.374350949226072
r2_score : -0.14865448085554522


## best MAPE、RMSE、R$^2$Score (use GridSearch)

In [13]:
from sklearn.metrics import mean_absolute_percentage_error
params = grid.best_params_
model = KNeighborsRegressor(**params)
start_time = time.time()
model.fit(X, y)
print(f'Spend time: {time.time()-start_time}')
y_pred =  model.predict(Xt)
print(f'MAPE: {mean_absolute_percentage_error(yt,y_pred)}')
rmse = np.sqrt(metrics.mean_squared_error(yt,y_pred))
print(f'RMSE: {rmse}')
print(f'r2_score : {r2_score(yt,y_pred)}')

Spend time: 0.04500842094421387
MAPE: 0.3747261340457038
RMSE: 12.645095693032761
r2_score : -0.026805583058889804
