In [1]:
import numpy as np
import pandas as pd
import warnings
# Install an "ignore" filter for every warning raised from here on.
warnings.filterwarnings(action='ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# import missingno as msno

# NOTE(review): the next two imports repeat the ones a few lines above; harmless but redundant.
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score

import os

In [2]:
# Load the competition data.
# The directory is defined once so it can be pointed somewhere else without
# editing every read_csv call. The resulting path strings are identical to the
# previously hard-coded ones (pandas expands the leading '~').
DATA_DIR = '~/Downloads/kakr-4th-competition'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# 함수화

In [3]:
# 1) Drop unused columns
def col_reduction(df):
    """Remove the columns that are not used as features.

    Drops 'id', 'fnlwgt', 'education', 'relationship', 'native_country' and
    'workclass' from ``df`` in place and returns that same DataFrame object.
    A KeyError is raised if any of those columns is missing.
    """
    unused_columns = ['id', 'fnlwgt', 'education', 'relationship',
                      'native_country', 'workclass']
    df.drop(columns=unused_columns, inplace=True)
    return df

# 2) Recode marital_status
def mar_st(df):
    """Turn 'marital_status' into a 0/1 flag, in place.

    The flag is 1 where the value is exactly 'Married-civ-spouse' and 0 for
    every other value. Returns the same DataFrame object.
    """
    is_married = df['marital_status'].eq('Married-civ-spouse')
    df['marital_status'] = is_married.astype(int)
    return df

# 3) Recode capital_gain / capital_loss
def capital(df):
    """Add capital indicator flags and log-transform 'capital_gain', in place.

    * 'cap_gain_high' is 1 where 'capital_gain' is non-zero, else 0.
    * 'cap_loss_high' is 1 where 'capital_loss' is at least 1700, else 0.
    * 'capital_gain' is replaced by its natural logarithm; zeros are kept as 0
      so that log(0) is never evaluated.

    Returns the same DataFrame object.
    """
    def _log_or_zero(value):
        # Zero gains pass through unchanged instead of becoming -inf.
        if value != 0:
            return np.log(value)
        return 0

    gain = df['capital_gain']
    loss = df['capital_loss']
    df['cap_gain_high'] = gain.ne(0).astype(int)
    df['cap_loss_high'] = loss.ge(1700).astype(int)
    df['capital_gain'] = gain.map(_log_or_zero)
    return df

# 4) Bin age into ranges
def age(df):
    """Replace the numeric 'age' column with a string age-band label, in place.

    Labels: '~20' for ages below 20, '~65' for ages of 65 and above, and
    '<low>~<low+5>' (for example '20~25') for each 5-year band between 20 and
    65, lower bound inclusive and upper bound exclusive.
    Returns the same DataFrame object.
    """
    years = df['age']

    # Open-ended bands at both extremes.
    df.loc[years < 20, 'age_range'] = '~20'
    df.loc[years >= 65, 'age_range'] = '~65'

    # Nine 5-year bands: [20, 25), [25, 30), ..., [60, 65).
    for low in range(20, 65, 5):
        high = low + 5
        in_band = (years >= low) & (years < high)
        df.loc[in_band, 'age_range'] = '{}~{}'.format(low, high)

    # Overwrite the numeric column with the labels, then discard the helper column.
    df['age'] = df['age_range']
    df.drop(columns='age_range', inplace=True)
    return df
    
# 5) One-hot encoding is not wrapped in a function; pd.get_dummies is called directly in main().

# 6) Add a new feature derived from education_num
def edu(df):
    """Add 'edu_num_high': 1 where 'education_num' is 13 or more, else 0.

    The column is added to ``df`` in place and the same object is returned.
    """
    reached_threshold = df['education_num'].ge(13)
    df['edu_num_high'] = reached_threshold.astype(int)
    return df

# 7) Add a new feature derived from hours_per_week
def hpw(df):
    """Add 'hpw_high': 1 where 'hours_per_week' is 50 or more, else 0.

    The column is added to ``df`` in place and the same object is returned.
    """
    long_week = df['hours_per_week'].ge(50)
    df['hpw_high'] = long_week.astype(int)
    return df

# 8) MinMaxScaler
def mm_feature(df, feature):
    """Min-max scale one column of ``df`` in place.

    A new MinMaxScaler is fitted on the column named ``feature`` and the column
    is overwritten with the scaled values.

    Returns
    -------
    (df, scaler)
        The same DataFrame object and the fitted scaler, so the scaler can be
        reused to transform other data with the same fit.
    """
    column_2d = df[feature].values.reshape(-1, 1)  # the scaler works on a 2-D array
    fitted_scaler = MinMaxScaler().fit(column_2d)
    df[feature] = fitted_scaler.transform(column_2d)
    return df, fitted_scaler

# 9) Split off the target: only train needs this (test does not), so it is a separate function.
def target_handle(df):
    """Separate the features from the income target.

    ``df`` is modified in place: 'income_>50K' is copied into a new 'income'
    column, then both 'income_>50K' and 'income_<=50K' are removed.

    Returns
    -------
    (X_df, y_df)
        X_df is a new DataFrame holding every column except 'income';
        y_df is the 'income' column of ``df``.
    """
    target_name = 'income'
    df[target_name] = df['income_>50K']
    df.drop(columns=['income_>50K', 'income_<=50K'], inplace=True)

    features = df.drop(columns=[target_name])
    labels = df[target_name]
    return features, labels

def main(df):
    """Run preprocessing steps 1) to 7) and return the encoded feature frame.

    Order: col_reduction -> mar_st -> capital -> age -> pd.get_dummies ->
    edu -> hpw. The first four steps modify ``df`` itself; pd.get_dummies
    then produces a new one-hot encoded frame, to which the two indicator
    columns from edu and hpw are added before it is returned.
    """
    prepared = df
    for step in (col_reduction, mar_st, capital, age):
        prepared = step(prepared)

    # One-hot encode whatever categorical columns are left after the steps above.
    encoded = pd.get_dummies(prepared)

    for step in (edu, hpw):
        encoded = step(encoded)
    return encoded

In [4]:
# Apply the preprocessing
## main: steps 1) ~ 7)
train = main(train)
X_test = main(test)

## 8) min-max scaling: fit on train, then reuse the fitted scalers on test
train, mm_scaler1 = mm_feature(train, 'education_num')
train, mm_scaler2 = mm_feature(train, 'hours_per_week')

X_test['education_num'] = mm_scaler1.transform(X_test['education_num'].values.reshape(-1, 1))
X_test['hours_per_week'] = mm_scaler2.transform(X_test['hours_per_week'].values.reshape(-1, 1))

## 9) X, y split
X_train, y_train = target_handle(train)

## main() one-hot encodes train and test independently, so a category that is
## absent from one of them leaves the two frames with different columns.
## Align test to the training feature layout: dummies missing from test are
## added as 0 and columns never seen in training are dropped. This is a no-op
## when both frames already have the same columns in the same order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [5]:
# Data preprocessing is the same as before.
# Split the training data into train / validation parts (8:2).

from sklearn.model_selection import train_test_split

# NOTE: y_train is both an input and an output of this call. After one run it
# only holds the training part of the labels while X_train still holds every
# row, so this cell cannot be re-run on its own.
x_train, x_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=.2,
    random_state=42,
    shuffle=True,
    stratify=y_train,  # stratify the split on the target labels
)

In [6]:
# Report the shapes of the training split, a separator line, then the validation split.
print(x_train.shape, y_train.shape, sep='\n')
print('=' * 50)
print(x_valid.shape, y_valid.shape, sep='\n')

(20839, 42)
(20839,)
(5210, 42)
(5210,)


# ML 모델 적용하기

In [7]:
# Use the XGBoost model

import xgboost as xgb

# Classifier created without passing any hyperparameters.
xgb_model = xgb.XGBClassifier()
# Bare last expression: shows the estimator's repr as the cell output.
xgb_model

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [8]:
# Fit on the training split, then predict for the validation split.
xgb_model.fit(x_train, y_train)
y_pred = xgb_model.predict(x_valid)

In [9]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# NOTE(review): with exactly one label per sample, micro-averaged F1 is the same
# number as accuracy (the two values printed by this cell coincide), so this score
# adds nothing beyond accuracy_score below. average='binary' or 'macro' would give a
# class-sensitive score -- confirm which metric the competition actually uses.
f1 = f1_score(y_valid, y_pred, average='micro')
print('-F1 Score: ', f1)
# print(f"XGBClassifier\n -F1 Score: {f1_score(y_valid, y_pred, average='micro')}")

accuracy = accuracy_score(y_valid, y_pred)
print('-Accuracy score: ', accuracy)

-F1 Score:  0.87063339731286
-Accuracy score:  0.8706333973128599


### XGBoost 알고리즘의 개념 이해
XGBoost는 Gradient Boosting 알고리즘을 분산환경에서도 실행할 수 있도록 구현해놓은 라이브러리이다. 

즉, 앙상블 부스팅(ensemble boosting)의 특징인 가중치 부여를 경사하강법(gradient descent)으로 한다

* xgboost의 특징

    - 기존 GBM(Gradient Boosting Machine) 구현보다 학습 속도가 빠르다.
    - 과적합(overfitting) 방지가 가능한 규제가 포함되어 있다.
    - CART(Classification And Regression Tree)를 기반으로 한다. 즉, 분류와 회귀가 둘 다 가능하다
    - 조기 종료(early stopping)를 제공한다.

#### xgboost의 하이퍼파라미터(xgboost hyperparameter)
https://xgboost.readthedocs.io/en/latest/parameter.html
    
- n_estimators(혹은 num_boost_round) : 결정 트리의 개수
- max_depth : 트리의 깊이
- colsample_bytree : 컬럼의 샘플링 비율(random forest의 max_features와 비슷)
- subsample : weak learner가 학습에 사용하는 데이터 샘플링 비율
- learning_rate : 학습률
- min_split_loss(= gamma) : 리프 노드를 추가로 분할하기 위해 필요한 최소 손실 감소량
- reg_lambda : L2 규제
- reg_alpha : L1 규제

In [18]:
# First hand-tuned XGBoost configuration.
xgb1 = xgb.XGBClassifier(
    # Small learning rate paired with a large number of trees.
    learning_rate =0.05,
    n_estimators=1000,
    # Maximum tree depth.
    max_depth=8,
    min_child_weight=3,
    # gamma is the min_split_loss setting described in the notes above.
    gamma=5,
    # Row / column sampling ratios used when building each tree.
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    # NOTE(review): nthread looks like the older spelling of n_jobs in xgboost -- confirm against the installed version.
    nthread=-1,
    scale_pos_weight=1)

In [19]:
# Fit the first tuned configuration and score it on the validation split.
xgb1.fit(x_train, y_train)

y_pred1 = xgb1.predict(x_valid)
# Micro-averaged F1 on the validation labels (overwrites the earlier f1 value).
f1 = f1_score(y_valid, y_pred1, average='micro')
print(f1)

0.8698656429942418


In [29]:
# Second hand-tuned XGBoost configuration: same settings as xgb1 except for a
# shallower max_depth (5) and an explicit tree booster.
xgb2 = xgb.XGBClassifier(
    booster='gbtree',
    learning_rate=0.05,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=3,
    gamma=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    # The thread count was previously given twice with conflicting values
    # (nthread=-1 and n_jobs=3). Only n_jobs, the parameter the scikit-learn
    # wrapper documents, is kept so the setting is unambiguous.
    n_jobs=3,
    scale_pos_weight=1,
)

In [30]:
# Fit the second tuned configuration and score it on the validation split.
xgb2.fit(x_train, y_train)

# NOTE(review): y_pred1 and f1 are the same names used for xgb1's results, so those values are overwritten here.
y_pred1 = xgb2.predict(x_valid)
f1 = f1_score(y_valid, y_pred1, average='micro')
print(f1)

0.8700575815738963


### LightGBM

LightGBM 적용
https://nurilee.com/lightgbm-definition-parameter-tuning/
    
Light GBM은 Gradient Boosting 프레임워크로 Tree 기반 학습 알고리즘으로 Tree가 수직적으로 확장(leaf-wise)되는 방식이다.
확장하기 위해서 max delta loss를 가진 leaf를 선택하게 되는데, 
동일한 leaf를 확장할 때, leaf-wise 알고리즘은 level-wise 알고리즘(수평적 확장)보다 손실(loss)을 더 많이 줄일 수 있다.
LGBM은 또한 GPU 학습을 지원하기 때문에 속도가 빠르다.

단, Light GBM은 overfitting (과적합)에 민감하고 작은 데이터에 대해서 과적합하기 쉽기 때문에 적은 데이터에는 사용하지 않는 것을 권한다.


#### LightGBM 파라미터

https://lightgbm.readthedocs.io/en/latest/Parameters.html

* 더 빠른 속도
    - bagging_fraction
    - max_bin은 작게
    - save_binary를 쓰면 데이터 로딩속도가 빨라짐
    - parallel learning 사용

* 더 높은 정확도
    - max_bin을 크게
    - num_iterations 는 크게하고 learning_rate는 작게
    - num_leaves를 크게(과적합의 원인이 될 수 있음)
    - boosting 알고리즘 'dart' 사용

* 과적합을 줄이기
    - max_bin을 작게
    - num_leaves를 작게
    - min_data_in_leaf와 min_sum_hessian_in_leaf 사용하기

In [19]:
from lightgbm import LGBMClassifier

# LightGBM classifier with n_estimators=400; no other setting is passed.
lgb_model = LGBMClassifier(n_estimators=400)
lgb_model.fit(x_train, y_train)

LGBMClassifier(n_estimators=400)

In [20]:
# Predict for the validation split and report micro-averaged F1.
pred = lgb_model.predict(x_valid)

f1 = f1_score(y_valid, pred, average='micro')
print('-F1 Score: ', f1)

-F1 Score:  0.8664107485604606


In [22]:
import lightgbm as lgb

# Wrap the splits in LightGBM's native Dataset container.
d_train = lgb.Dataset(x_train, label=y_train)
d_test = lgb.Dataset(x_valid, label=y_valid)  # validation split, monitored for early stopping

# Native-API training parameters (same keys and values as before, in the same order).
params = {
    'learning_rate': 0.003,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'sub_feature': 0.5,
    'num_leaves': 10,
    'min_data': 50,
    'max_depth': 16,
    'is_training_metric': True,
}

# Up to 1000 boosting rounds, logging every 100 rounds and stopping early if the
# validation metric has not improved for 100 rounds.
clf = lgb.train(params, d_train, 1000, d_test, verbose_eval=100, early_stopping_rounds=100)

# With objective='binary' the native Booster.predict returns probabilities of
# the positive class, not class labels, so the raw output cannot be passed to
# f1_score. Keep the probabilities under their own name and threshold them at
# 0.5 to obtain 0/1 labels.
y_pred_proba = clf.predict(x_valid)
y_pred = (y_pred_proba >= 0.5).astype(int)

[LightGBM] [Info] Number of positive: 5044, number of negative: 15795
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 20839, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242046 -> initscore=-1.141494
[LightGBM] [Info] Start training from score -1.141494
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.483689
[200]	valid_0's binary_logloss: 0.438894
[300]	valid_0's binary_logloss: 0.406864
[400]	valid_0's binary_logloss: 0.384137
[500]	valid_0's binary_logloss: 0.367395
[600]	valid_0's binary_logloss: 0.35422
[700]	valid_0's binary_logloss: 0.343499
[800]	valid_0's binary_logloss: 0.335146
[900]	valid_0's binary_logloss: 0.328165
[1000]	valid_0's binary_logloss: 0.322308
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logl

In [24]:
# f1 = f1_score(y_valid, y_pred, average='micro')
# print('-F1 Score: ', f1)