# 📌 GBM

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### 1. 실습 - 사용자 행동 데이터 세트

#### 1.1 데이터 불러오기

In [2]:
def get_new_feature_name_df(old_feature_name_df):
    feature_dup_df = pd.DataFrame(data=old_feature_name_df.groupby('column_name').cumcount(),
                                  columns=['dup_cnt'])
    feature_dup_df = feature_dup_df.reset_index()
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how='outer')
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(lambda x : x[0]+'_'+str(x[1]) 
                                                                                         if x[1] >0 else x[0] ,  axis=1)
    new_feature_name_df = new_feature_name_df.drop(['index'], axis=1)
    return new_feature_name_df

In [3]:
feature_name_df = pd.read_csv('./human_activity/features.txt',sep='\s+',
                        header=None,names=['column_index','column_name'])

# 중복된 피처명을 수정하는 get_new_feature_name_df()를 이용, 신규 피처명 DataFrame생성. 
new_feature_name_df = get_new_feature_name_df(feature_name_df)

# DataFrame에 피처명을 컬럼으로 부여하기 위해 리스트 객체로 다시 변환
feature_name = new_feature_name_df.iloc[:, 1].values.tolist()

In [4]:
X_train = pd.read_csv('./human_activity/X_train.txt',sep='\s+', names=feature_name )
X_test = pd.read_csv('./human_activity/X_test.txt',sep='\s+', names=feature_name)

y_train = pd.read_csv('./human_activity/y_train.txt',sep='\s+',header=None,names=['action'])
y_test = pd.read_csv('./human_activity/y_test.txt',sep='\s+',header=None,names=['action'])

#### 1.2 GBM 학습, 예측, 평가

In [7]:
# GBM 수행 시간 측정을 위함. 시작 시간 설정.
import time
import warnings

start_time = time.time()

In [8]:
# 배열 수정
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

In [9]:
# GBM 학습
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(random_state=0)
gb_clf.fit(X_train, y_train)


# GBM 예측, 평가
from sklearn.metrics import accuracy_score

y_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred)

print("GBM 정확도: {0:.4f}".format(gb_accuracy))
print("GBM 수행 시간: {0:.1f}".format(time.time() - start_time))
print("-----" * 20)

GBM 정확도: 0.9386
GBM 수행 시간: 537.4
----------------------------------------------------------------------------------------------------


In [10]:
# 튜닝할 파라미터들 정의
params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3]
}

# GridSearchCV를 사용하여 최적의 파라미터 탐색
from sklearn.model_selection import GridSearchCV

grid_cv = GridSearchCV(gb_clf, params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# 최적의 파라미터 출력
best_params = grid_cv.best_params_
print("최적의 파라미터:", best_params)

# 최적의 파라미터를 적용한 GBM 모델 생성 및 학습
best_gb_clf = GradientBoostingClassifier(random_state=0, **best_params)
best_gb_clf.fit(X_train, y_train)

# 예측 및 평가
y_pred_best = best_gb_clf.predict(X_test)
best_gb_accuracy = accuracy_score(y_test, y_pred_best)

# 결과 출력
print("최적 파라미터 적용 GBM 정확도: {0:.4f}".format(best_gb_accuracy))
print("최적 파라미터 적용 GBM 수행 시간: {0:.1f}".format(time.time() - start_time))

KeyboardInterrupt: 