### GBM - Gradient Boosting Machine
부스팅 알고리즘은 여러개의 약한 학습기를 순차적으로 학습-예측 하면서 잘못 예측한 데이터에 가중치를 부여해서 오류를 개선해가는 방식<p>
GBM은 가중치를 업데이트 할때 경사 하강법을 이용하는 것이 큰 차이 


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/features.txt'
feature_name_df = pd.read_csv(
    url, sep='\s+', header=None, names=['column_index', 'column_name'])

feature_name_df.head()

Unnamed: 0,column_index,column_name
0,1,tBodyAcc-mean()-X
1,2,tBodyAcc-mean()-Y
2,3,tBodyAcc-mean()-Z
3,4,tBodyAcc-std()-X
4,5,tBodyAcc-std()-Y


In [3]:
feature_name=feature_name_df.iloc[:,1].values.tolist()
feature_name[:10]

['tBodyAcc-mean()-X',
 'tBodyAcc-mean()-Y',
 'tBodyAcc-mean()-Z',
 'tBodyAcc-std()-X',
 'tBodyAcc-std()-Y',
 'tBodyAcc-std()-Z',
 'tBodyAcc-mad()-X',
 'tBodyAcc-mad()-Y',
 'tBodyAcc-mad()-Z',
 'tBodyAcc-max()-X']

In [4]:
X_train_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/X_train.txt'
X_test_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/X_test.txt'

X_train = pd.read_csv(X_train_url, sep='\s+', header=None)
X_test = pd.read_csv(X_test_url, sep='\s+', header=None)


In [5]:
X_train.columns = feature_name
X_test.columns = feature_name

X_train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.074323,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,0.404573,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,0.087753,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892


In [6]:

y_train_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/y_train.txt'
y_test_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/y_test.txt'


y_train = pd.read_csv(y_train_url, sep='\s+', header=None, names=['action'])
y_test = pd.read_csv(y_test_url, sep='\s+', header=None, names=['action'])


In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time
import warnings

warnings.filterwarnings('ignore')

In [10]:
start_time = time.time()

gb_clf = GradientBoostingClassifier(random_state=13)
gb_clf.fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)

print('ACC : ',accuracy_score(y_test, gb_pred))
print('Fit time : ',time.time()-start_time)

ACC :  0.9385816084153377
Fit time :  1677.279715538025


In [11]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators' : [100, 500],
    'learning_rate' : [0.05, 0.1]
}

start_time = time.time()
grid = GridSearchCV(gb_clf, param_grid=params, cv=2, verbose=1, n_jobs=1) # verbose는 실행 과정을 출력하도록 설정
grid.fit(X_train, y_train)

print('Fit time : ', time.time() - start_time)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


KeyboardInterrupt: 

In [12]:
conda install xgboost

Retrieving notices: ...working... done
Note: you may need to restart the kernel to use updated packages.




  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0





Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: c:\Users\YUN\miniconda3\envs\ds_study

  added / updated specs:
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _py-xgboost-mutex-2.0      |            cpu_0          12 KB
    certifi-2023.5.7           |   py38haa95532_0         153 KB
    joblib-1.2.0               |   py38haa95532_0         388 KB
    libxgboost-1.7.3           |       hd77b12b_0         1.5 MB
    py-xgboost-1.7.3           |   py38haa95532_0         197 KB
    scikit-learn-1.2.2         |   py38hd77b12b_1         6.5 MB
    scipy-1.10.1               |   py38hdcfc7df_1        18.7 MB
    xgboost-1.7.3              |   py38haa95532_0          12 KB
    ------------------------------------------------------------
                                 

In [16]:
y_train

Unnamed: 0,action
0,5
1,5
2,5
3,5
4,5
...,...
7347,2
7348,2
7349,2
7350,2


In [15]:
from xgboost import XGBClassifier

start_time = time.time()
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb.fit(X_train.values, y_train)


print('Fit time ',time.time() - start_time)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5], got [1 2 3 4 5 6]

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_label = le.fit_transform(y_train)

from xgboost import XGBClassifier

start_time = time.time()
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb.fit(X_train.values, y_train_label)


print('Fit time ',time.time() - start_time)

KeyboardInterrupt: 

#### 조기 종료 조건과 감즌데이터 지정

In [24]:
from xgboost import XGBClassifier

evals = [(X_test.values, y_test)]

start_time = time.time()
xgb = XGBClassifier(n_estimators = 400, learning_rate = 0.1, max_depth = 3)
xgb.fit(X_train.values, y_train, early_stopping_rounds=10, eval_set=evals)
print('Fit time ',time.time() - start_time)


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5], got [1 2 3 4 5 6]

### LightGBM
LightGBM은 Microsoft에서 개발된 그래디언트 부스팅 기반의 경량화된 머신 러닝 프레임워크입니다. 

LightGBM은 대용량의 데이터셋에서도 빠른 학습과 예측 속도를 제공하며, 정확도 또한 높은 성능을 보입니다.

In [21]:
conda install lightgbm

Collecting package metadata (current_repodata.json): ...working... done
Note: you may need to restart the kernel to use updated packages.




  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0





Solving environment: ...working... done

## Package Plan ##

  environment location: c:\Users\YUN\miniconda3\envs\ds_study

  added / updated specs:
    - lightgbm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    lightgbm-3.3.5             |   py38hd77b12b_0         850 KB
    ------------------------------------------------------------
                                           Total:         850 KB

The following NEW packages will be INSTALLED:

  lightgbm           pkgs/main/win-64::lightgbm-3.3.5-py38hd77b12b_0 



Downloading and Extracting Packages

lightgbm-3.3.5       | 850 KB    |            |   0% 
lightgbm-3.3.5       | 850 KB    | 1          |   2% 
lightgbm-3.3.5       | 850 KB    | ###7       |  38% 
lightgbm-3.3.5       | 850 KB    | ######9    |  70% 
lightgbm-3.3.5       | 850 KB    | ########## | 100% 
lightgbm-3.3.5       | 850 KB    | ########## | 100% 
            

In [26]:
from lightgbm import LGBMClassifier

start_time = time.time()
lgbm = LGBMClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
lgbm.fit(X_train.values, y_train, early_stopping_rounds=10, eval_set=evals)
print('Fit time ',time.time() - start_time)


[1]	valid_0's multi_logloss: 1.46247
[2]	valid_0's multi_logloss: 1.24399
[3]	valid_0's multi_logloss: 1.07748
[4]	valid_0's multi_logloss: 0.949568
[5]	valid_0's multi_logloss: 0.844182
[6]	valid_0's multi_logloss: 0.758642
[7]	valid_0's multi_logloss: 0.687153
[8]	valid_0's multi_logloss: 0.628245
[9]	valid_0's multi_logloss: 0.575191
[10]	valid_0's multi_logloss: 0.533064
[11]	valid_0's multi_logloss: 0.493877
[12]	valid_0's multi_logloss: 0.463042
[13]	valid_0's multi_logloss: 0.434078
[14]	valid_0's multi_logloss: 0.405504
[15]	valid_0's multi_logloss: 0.38303
[16]	valid_0's multi_logloss: 0.362965
[17]	valid_0's multi_logloss: 0.346311
[18]	valid_0's multi_logloss: 0.330217
[19]	valid_0's multi_logloss: 0.316604
[20]	valid_0's multi_logloss: 0.303022
[21]	valid_0's multi_logloss: 0.291593
[22]	valid_0's multi_logloss: 0.278654
[23]	valid_0's multi_logloss: 0.270054
[24]	valid_0's multi_logloss: 0.260458
[25]	valid_0's multi_logloss: 0.252038
[26]	valid_0's multi_logloss: 0.244146

In [27]:
print(accuracy_score(y_test, lgbm.predict(X_test.values)))

0.9440108585001696
