# 분류
## 결정트리

In [160]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 피처 이름 확인 
feature_name_df = pd.read_csv('/Users/linakim/Downloads/human_activity/features.txt',
                             sep='\s+', header=None, names=['column_index', 'column_name'])

feature_name = feature_name_df.iloc[:, 1].values.tolist()
print('전체 피처명에서 10개만 추출:', feature_name[:10])

전체 피처명에서 10개만 추출: ['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X']


In [181]:
### features.txt 파일에 있는 컬럼명을 입력 받아서 중복된 컬럼명은 원본 컬럼명+_1, _2와 같이 중복된 차수를 원본 컬럼명에 더해서 컬럼명을 update 하는 함수임. . 
def get_new_feature_name_df(old_feature_name_df):
    #column_name으로 중복된 컬럼명에 대해서는 중복 차수 부여, col1, col1과 같이 2개의 중복 컬럼이 있을 경우 1, 2 
    feature_dup_df = pd.DataFrame(data=old_feature_name_df.groupby('column_name').cumcount(), columns=['dup_cnt'])
    # feature_dup_df의 index인 column_name을 reset_index()를 이용하여 컬럼으로 변환. 
    feature_dup_df = feature_dup_df.reset_index()
    # 인자로 받은 features_txt의 컬럼명 DataFrame과 feature_dup_df를 조인. 
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how='outer')
    # 새로운 컬럼명은 앞에 중복 차수를 접미어로 결합. 
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(lambda x : x[0]+'_'+str(x[1]) 
                                                                                           if x[1] >0 else x[0] ,  axis=1)
    new_feature_name_df = new_feature_name_df.drop(['index'], axis=1)
    return new_feature_name_df

In [182]:
pd.options.display.max_rows = 999
new_feature_name_df = get_new_feature_name_df(feature_name_df)
new_feature_name_df[new_feature_name_df['dup_cnt'] > 0]

Unnamed: 0,column_index,column_name,dup_cnt
316,317,"fBodyAcc-bandsEnergy()-1,8_1",1
317,318,"fBodyAcc-bandsEnergy()-9,16_1",1
318,319,"fBodyAcc-bandsEnergy()-17,24_1",1
319,320,"fBodyAcc-bandsEnergy()-25,32_1",1
320,321,"fBodyAcc-bandsEnergy()-33,40_1",1
321,322,"fBodyAcc-bandsEnergy()-41,48_1",1
322,323,"fBodyAcc-bandsEnergy()-49,56_1",1
323,324,"fBodyAcc-bandsEnergy()-57,64_1",1
324,325,"fBodyAcc-bandsEnergy()-1,16_1",1
325,326,"fBodyAcc-bandsEnergy()-17,32_1",1


In [183]:
def get_human_dataset():
    feature_name_df = pd.read_csv('/Users/linakim/Downloads/human_activity/features.txt',
                             sep='\s+', header=None, names=['column_index', 'column_name'])
    
    # 중복된 feature명을 새롭게 수정하는 get_new_feature_name_df()를 이용하여 새로운 feature명 DataFrame생성. 
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    
    feature_name = feature_name_df.iloc[:, 1].values.tolist()
    
    Xt = pd.read_csv('/Users/linakim/Downloads/human_activity/train/X_train.txt',
                             sep='\s+', names=feature_name)
    Xtt = pd.read_csv('/Users/linakim/Downloads/human_activity/test/X_test.txt',
                             sep='\s+', names=feature_name)
    
    Yt = pd.read_csv('/Users/linakim/Downloads/human_activity/train/y_train.txt',
                             sep='\s+', header=None, names=['action'])
    Ytt = pd.read_csv('/Users/linakim/Downloads/human_activity/test/y_test.txt',
                             sep='\s+', header=None, names=['action'])
    
    return Xt, Xtt, Yt, Ytt



In [184]:
X_train, X_test, y_train, y_test = get_human_dataset()

ValueError: Duplicate names are not allowed.

## 앙상블 학습

In [186]:
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

cancer = load_breast_cancer()

data_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
data_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [191]:
# 개별모델 : 로지스틱 회귀, KNN
lr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()

# 개별 모델을 소프트 보팅 기반의 앙상블 모델로 구현한 분류기
vo_clf = VotingClassifier(estimators=[('LR', lr_clf), ('KNN', knn_clf)], voting='soft')

X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, 
                                                    test_size=0.2, random_state=156)

# VotingClassifier 학습/예측/평가
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
print('Voting 분류기 정확도 : {0:.4f}'.format(accuracy_score(y_test, pred)))

# 개별 모델 학습/예측/평가
Classifiers = [lr_clf, knn_clf]
for classifier in Classifiers:
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    class_name = classifier.__class__.__name__
    print('{0} 정확도 : {1:.4f}'.format(class_name, accuracy_score(y_test, pred)))
    

Voting 분류기 정확도 : 0.9386
LogisticRegression 정확도 : 0.9386
KNeighborsClassifier 정확도 : 0.9035


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 랜덤 포레스트

In [192]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# 위의 유방암 데이터 활용
rl_clf = RandomForestClassifier(random_state=0)
rl_clf.fit(X_train, y_train)
pred = rl_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print('랜덤포레스트 정확도 : {0:.4f}'.format(accuracy))

랜덤포레스트 정확도 : 0.9561


In [194]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators':[100],
    'max_depth' : [6, 8, 10, 12],
    'min_samples_leaf' : [8, 12, 18],
    'min_samples_split' : [8, 16, 20]
}

rf_clf = RandomForestClassifier(random_state=1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=1)
grid_cv.fit(X_train, y_train)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))


최적 하이퍼 파라미터:
 {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}
최고 예측 정확도:0.9517


## GBM
가중치 업데이터를 경사하강법을 이용하여 함. <br>
부스팅 : 여러 개의 약한 학습기를 순차적으로 학습-예측하면서 잘못 예측한 데이터에 가중치 부여를 통해 오류를 개선해 나가면서 학습

In [195]:
from sklearn.ensemble import GradientBoostingClassifier
import time
import warnings
warnings.filterwarnings('ignore')

# 수행시간 측정을 위한 시작 시간 설정
start_time = time.time()

gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)
pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, pred)

print('GBM 정확도: {0:.4f}'.format(gb_accuracy))
print('GBM 수행 시간: {0:.1f} 초'.format(time.time() - start_time))
# 일반적으로 GBM이 랜덤 포레스트보다는 예측 성능이 조금 뛰어남 but 수행시간 오래걸림 

GBM 정확도: 0.9561
GBM 수행 시간: 0.3 초


In [196]:
# 하이퍼 파라미터 튜닝
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators':[100, 500],
    'learning_rate':[0.05, 0.1]
}

gb_clf = GradientBoostingClassifier()
grid_cv = GridSearchCV(gb_clf, param_grid=params, cv=2, verbose=1)
grid_cv.fit(X_train, y_train)
print('최적의 파라미터 :\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_) )

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    3.7s finished


최적의 파라미터 :
 {'learning_rate': 0.05, 'n_estimators': 500}
최고 예측 정확도: 0.9495


## XGBoost
- 한번에 병렬로 돌리기때문에 GBM보다 빠름
- GBM의 경우 n_estimators에 지정된 횟수만큼 반복적으로 학습 진행하며 중간에 멈출 수 없지만, XGBoost는 예측 오류가 더 이상 개선되지 않으면 반복을 끝까지 수행하지 않고 중지 가능 
- 자체적으로 교차 검증, 성능 평가, 피처 중요도 등의 시각화 기능 탑재
- 별도의 설치 필요

In [199]:
import xgboost as xgb
from xgboost import plot_importance
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [200]:
dataset = load_breast_cancer()
X_features = dataset.data
y_label = dataset.target
cancer_df = pd.DataFrame(X_features, columns=dataset.feature_names)
cancer_df['target'] = y_label
cancer_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


In [201]:
# 타겟 확인
print(dataset.target_names)
print(cancer_df['target'].value_counts())

['malignant' 'benign']
1    357
0    212
Name: target, dtype: int64


In [None]:
# 학습, 테스트 나누기
X_train, X_test, y_train, y_test = train_test_split(X_features, y_label, 
                                                   test_size=0.2, random_state=101)
print(X_train.shape, X_test.shape)

- DMatrix : 넘파이 입력 파라미터를 받아서 만들어지는 XGBoost만의 전용 데이터 세트. <br>
주요 입력 파라미터 : data와 label

In [203]:
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [204]:
# 학습 전 하이퍼파라미터 설정
params = {
    'max_depth':3, # 트리 최대 깊이 
    'eta':0.1, # 학습률 (=GBM의 learning_rate)
    'objective':'binary:logistic', # 목적함수 : 이진분류 문제이므로 이진 로지스틱
    'eval_metric':'logloss', # 오류함수의 평가 성능 지표 : logloss
    'early_stoppings':100 
}
num_rounds = 400 # 부스팅 반복 횟수 

In [205]:
# train은 'train', evaluation(test)는 'eval'로 명기
wlist = [(dtrain, 'train'), (dtest, 'eval')]
# 하이퍼 파라미터와 early stopping 파라미터를 train() 함수의 파라미터로 전달
xgb_model = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_rounds, evals=wlist)

Parameters: { "early_stoppings" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-logloss:0.60968	eval-logloss:0.61696
[1]	train-logloss:0.54117	eval-logloss:0.55580
[2]	train-logloss:0.48378	eval-logloss:0.50508
[3]	train-logloss:0.43389	eval-logloss:0.45831
[4]	train-logloss:0.39121	eval-logloss:0.42010
[5]	train-logloss:0.35293	eval-logloss:0.38246
[6]	train-logloss:0.32086	eval-logloss:0.35329
[7]	train-logloss:0.29170	eval-logloss:0.32484
[8]	train-logloss:0.26809	eval-logloss:0.30152
[9]	train-logloss:0.24524	eval-logloss:0.27925
[10]	train-logloss:0.22526	eval-logloss:0.25896
[11]	train-logloss:0.20790	eval-logloss:0.24305
[12]	train-logloss:0.19133	eval-logloss:0.22470
[13]	train-logloss:0.17685	eval-logloss:0.21048
[14]	train-logloss:0.16379	eval-logloss:0.198

[162]	train-logloss:0.00768	eval-logloss:0.04163
[163]	train-logloss:0.00766	eval-logloss:0.04161
[164]	train-logloss:0.00764	eval-logloss:0.04162
[165]	train-logloss:0.00759	eval-logloss:0.04185
[166]	train-logloss:0.00755	eval-logloss:0.04214
[167]	train-logloss:0.00753	eval-logloss:0.04195
[168]	train-logloss:0.00751	eval-logloss:0.04183
[169]	train-logloss:0.00749	eval-logloss:0.04201
[170]	train-logloss:0.00745	eval-logloss:0.04217
[171]	train-logloss:0.00743	eval-logloss:0.04199
[172]	train-logloss:0.00739	eval-logloss:0.04222
[173]	train-logloss:0.00737	eval-logloss:0.04210
[174]	train-logloss:0.00733	eval-logloss:0.04189
[175]	train-logloss:0.00731	eval-logloss:0.04188
[176]	train-logloss:0.00730	eval-logloss:0.04189
[177]	train-logloss:0.00728	eval-logloss:0.04206
[178]	train-logloss:0.00726	eval-logloss:0.04212
[179]	train-logloss:0.00725	eval-logloss:0.04205
[180]	train-logloss:0.00723	eval-logloss:0.04223
[181]	train-logloss:0.00722	eval-logloss:0.04229
[182]	train-logloss:

[330]	train-logloss:0.00567	eval-logloss:0.03964
[331]	train-logloss:0.00567	eval-logloss:0.03962
[332]	train-logloss:0.00566	eval-logloss:0.03962
[333]	train-logloss:0.00565	eval-logloss:0.03960
[334]	train-logloss:0.00564	eval-logloss:0.03962
[335]	train-logloss:0.00564	eval-logloss:0.03962
[336]	train-logloss:0.00563	eval-logloss:0.03947
[337]	train-logloss:0.00562	eval-logloss:0.03940
[338]	train-logloss:0.00562	eval-logloss:0.03940
[339]	train-logloss:0.00561	eval-logloss:0.03941
[340]	train-logloss:0.00561	eval-logloss:0.03940
[341]	train-logloss:0.00560	eval-logloss:0.03939
[342]	train-logloss:0.00559	eval-logloss:0.03938
[343]	train-logloss:0.00559	eval-logloss:0.03925
[344]	train-logloss:0.00558	eval-logloss:0.03928
[345]	train-logloss:0.00557	eval-logloss:0.03913
[346]	train-logloss:0.00557	eval-logloss:0.03903
[347]	train-logloss:0.00556	eval-logloss:0.03902
[348]	train-logloss:0.00555	eval-logloss:0.03890
[349]	train-logloss:0.00555	eval-logloss:0.03897
[350]	train-logloss:

In [206]:
# 이제 테스트 세트에 예측 수행해보자. xgb.train() 함수는 학습이 완료된 모델 객체를 반환. 
# XGBoost의 predict() : 사이킷런의 predict()처럼 0또는 1의 class가 아니라, 확률을 반환 
pred_probs = xgb_model.predict(dtest)
print('predict() 수행 결괏값을 10개만 표시, 예측 확률값으로 표시됨')
print(np.round(pred_probs[:10], 3))

# 예측 확률값이 0.5보다 크면 1, 그렇지 않으면 0으로 
preds = [ 1 if x > 0.5 else 0 for x in pred_probs]
print('예측값 10개만 표시:', preds[:10])

predict() 수행 결괏값을 10개만 표시, 예측 확률값으로 표시됨
[0.999 0.999 1.    0.026 0.999 1.    0.999 0.    1.    0.999]
예측값 10개만 표시: [1, 1, 1, 0, 1, 1, 1, 0, 1, 1]


In [207]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    # F1 스코어 추가
    f1 = f1_score(y_test, pred)
    print('오차행렬')
    print(confusion)
    # f1 score print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}'.format(accuracy, precision, recall, f1))
    

In [208]:
# 성능 평가
get_clf_eval(y_test, preds)

오차행렬
[[40  2]
 [ 0 72]]
정확도: 0.9825, 정밀도: 0.9730, 재현율: 1.0000, F1:0.9863


### 사이킷런 래퍼 XGBoost의 개요 및 적용
- 다른 estimator과 동일하게 fit과 predict만으로 가능
- XGBClassifier, XGBRegressor
- eta => learning_rate
- sub_sample => subsample
- lambda => reg_lambda
- alpha => reg_alpha

In [209]:
from xgboost import XGBClassifier

xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb_wrapper.fit(X_train, y_train)
w_preds = xgb_wrapper.predict(X_test)



In [210]:
get_clf_eval(y_test, w_preds)

오차행렬
[[40  2]
 [ 0 72]]
정확도: 0.9825, 정밀도: 0.9730, 재현율: 1.0000, F1:0.9863


In [211]:
# 조기중단 적용해보기 (시험엔 안나올듯)
# 원래대로라면 평가 데이터세트로 완전히 알려지지 않은 데이터 세트를 사용해야 함
xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
evals = [(X_test, y_test)]
xgb_wrapper.fit(X_train, y_train, early_stopping_rounds=100, eval_metric='logloss', 
               eval_set=evals, verbose=True)
ws100_preds = xgb_wrapper.predict(X_test)

[0]	validation_0-logloss:0.61696
[1]	validation_0-logloss:0.55580
[2]	validation_0-logloss:0.50508
[3]	validation_0-logloss:0.45831
[4]	validation_0-logloss:0.42010
[5]	validation_0-logloss:0.38246
[6]	validation_0-logloss:0.35329
[7]	validation_0-logloss:0.32484
[8]	validation_0-logloss:0.30152
[9]	validation_0-logloss:0.27925
[10]	validation_0-logloss:0.25896
[11]	validation_0-logloss:0.24305
[12]	validation_0-logloss:0.22470
[13]	validation_0-logloss:0.21048
[14]	validation_0-logloss:0.19892
[15]	validation_0-logloss:0.18942
[16]	validation_0-logloss:0.17883
[17]	validation_0-logloss:0.16914
[18]	validation_0-logloss:0.16154
[19]	validation_0-logloss:0.15410
[20]	validation_0-logloss:0.14665
[21]	validation_0-logloss:0.14120
[22]	validation_0-logloss:0.13588
[23]	validation_0-logloss:0.12936
[24]	validation_0-logloss:0.12345
[25]	validation_0-logloss:0.11866
[26]	validation_0-logloss:0.11580
[27]	validation_0-logloss:0.11142
[28]	validation_0-logloss:0.10797
[29]	validation_0-loglos

[238]	validation_0-logloss:0.04290
[239]	validation_0-logloss:0.04273
[240]	validation_0-logloss:0.04271
[241]	validation_0-logloss:0.04252
[242]	validation_0-logloss:0.04264
[243]	validation_0-logloss:0.04262
[244]	validation_0-logloss:0.04259
[245]	validation_0-logloss:0.04254
[246]	validation_0-logloss:0.04262
[247]	validation_0-logloss:0.04257
[248]	validation_0-logloss:0.04256
[249]	validation_0-logloss:0.04238
[250]	validation_0-logloss:0.04236
[251]	validation_0-logloss:0.04234
[252]	validation_0-logloss:0.04234
[253]	validation_0-logloss:0.04215
[254]	validation_0-logloss:0.04216
[255]	validation_0-logloss:0.04226
[256]	validation_0-logloss:0.04238


## LightGBM
- XGBoost보다 학습 시간이 적음 
- 적은 데이터 세트에 적용할 경우 과적합이 발생하기 쉬움
- 카테고리형 피처의 자동 변환과 최적분할(원-핫 인코딩 등을 사용하지 않고도 카테고리형 피처를 최적으로 변환하고 이에 따른 노드 분할 수행)

In [213]:
from lightgbm import LGBMClassifier

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

dataset = load_breast_cancer()
ftr = dataset.data
target = dataset.target

X_train, X_test, y_train, y_test = train_test_split(ftr, target, test_size=0.2, random_state=101)

lgbm_wrapper = LGBMClassifier(n_estimators=400)

evals = [(X_test, y_test)]
lgbm_wrapper.fit(X_train, y_train, early_stopping_rounds=100, eval_metric='logloss',
                eval_set=evals, verbose=True)

preds=lgbm_wrapper.predict(X_test)

[1]	valid_0's binary_logloss: 0.578529
Training until validation scores don't improve for 100 rounds
[2]	valid_0's binary_logloss: 0.515964
[3]	valid_0's binary_logloss: 0.469202
[4]	valid_0's binary_logloss: 0.424632
[5]	valid_0's binary_logloss: 0.386011
[6]	valid_0's binary_logloss: 0.353233
[7]	valid_0's binary_logloss: 0.326411
[8]	valid_0's binary_logloss: 0.299836
[9]	valid_0's binary_logloss: 0.275654
[10]	valid_0's binary_logloss: 0.256402
[11]	valid_0's binary_logloss: 0.240247
[12]	valid_0's binary_logloss: 0.225731
[13]	valid_0's binary_logloss: 0.212671
[14]	valid_0's binary_logloss: 0.196971
[15]	valid_0's binary_logloss: 0.185334
[16]	valid_0's binary_logloss: 0.175053
[17]	valid_0's binary_logloss: 0.16534
[18]	valid_0's binary_logloss: 0.154417
[19]	valid_0's binary_logloss: 0.145069
[20]	valid_0's binary_logloss: 0.136122
[21]	valid_0's binary_logloss: 0.130632
[22]	valid_0's binary_logloss: 0.12372
[23]	valid_0's binary_logloss: 0.117074
[24]	valid_0's binary_logloss

[252]	valid_0's binary_logloss: 0.0201792
[253]	valid_0's binary_logloss: 0.020044
[254]	valid_0's binary_logloss: 0.0201971
[255]	valid_0's binary_logloss: 0.0200913
[256]	valid_0's binary_logloss: 0.0199394
[257]	valid_0's binary_logloss: 0.0200736
[258]	valid_0's binary_logloss: 0.0199191
[259]	valid_0's binary_logloss: 0.0199385
[260]	valid_0's binary_logloss: 0.0197891
[261]	valid_0's binary_logloss: 0.0196849
[262]	valid_0's binary_logloss: 0.0197073
[263]	valid_0's binary_logloss: 0.0196221
[264]	valid_0's binary_logloss: 0.0197047
[265]	valid_0's binary_logloss: 0.0197282
[266]	valid_0's binary_logloss: 0.0196294
[267]	valid_0's binary_logloss: 0.0194102
[268]	valid_0's binary_logloss: 0.0195376
[269]	valid_0's binary_logloss: 0.0194394
[270]	valid_0's binary_logloss: 0.0194632
[271]	valid_0's binary_logloss: 0.0193844
[272]	valid_0's binary_logloss: 0.0194755
[273]	valid_0's binary_logloss: 0.0193814
[274]	valid_0's binary_logloss: 0.0194075
[275]	valid_0's binary_logloss: 0.0

## 분류 실습 - 캐글 산탄데르 고객 만족 예측
### 데이터 전처리

In [214]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

cust_df = pd.read_csv('/Users/linakim/Downloads/santander-customer-satisfaction/train_santander.csv')   
print('dataset shape:', cust_df.shape)
cust_df.head(3)

dataset shape: (76020, 371)


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0


In [215]:
# null값 알아보기
cust_df.isna().sum()

ID                               0
var3                             0
var15                            0
imp_ent_var16_ult1               0
imp_op_var39_comer_ult1          0
imp_op_var39_comer_ult3          0
imp_op_var40_comer_ult1          0
imp_op_var40_comer_ult3          0
imp_op_var40_efect_ult1          0
imp_op_var40_efect_ult3          0
imp_op_var40_ult1                0
imp_op_var41_comer_ult1          0
imp_op_var41_comer_ult3          0
imp_op_var41_efect_ult1          0
imp_op_var41_efect_ult3          0
imp_op_var41_ult1                0
imp_op_var39_efect_ult1          0
imp_op_var39_efect_ult3          0
imp_op_var39_ult1                0
imp_sal_var16_ult1               0
ind_var1_0                       0
ind_var1                         0
ind_var2_0                       0
ind_var2                         0
ind_var5_0                       0
ind_var5                         0
ind_var6_0                       0
ind_var6                         0
ind_var8_0          

In [216]:
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB


In [222]:
# target 속성의 분포값 알아보기
cust_df['TARGET'].value_counts()
unsatisfied_cnt = cust_df[cust_df['TARGET'] == 1].TARGET.count()
total_cnt = cust_df['TARGET'].count()
print(unsatisfied_cnt)
print('불만족 비율 : {0:.2f}'.format(unsatisfied_cnt/total_cnt))

3008
불만족 비율 : 0.04


In [223]:
cust_df.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


In [224]:
# var3의 최솟값인 -999999를 최빈값인 2로 변환
cust_df['var3'].replace(-999999, 2, inplace=True)

# 필요없는 ID 칼럼 드랍
cust_df.drop('ID', axis=1, inplace=True)

# 피처 세트와 레이블 세트로 분리
X_features = cust_df.iloc[:, :-1]
y_labels = cust_df.iloc[:, -1]
print('피처 데이터 shape:{0}'.format(X_features.shape))

피처 데이터 shape:(76020, 369)


In [226]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2,
                                                   random_state=101)
# 비대칭한 데이터 세트이므로(불만족이 훨씬 적으니까) target 값 분포도가 학습 데이터와 테스트 데이터 세트에 모두 비슷하게 추출됬는지 확인
print('학습 세트 Shape:{0}, 테스트 세트 Shape:{1}'.format(X_train.count().shape, X_test.count().shape))
print('학습 세트 레이블 값 분포 비율')
print(y_train.value_counts()/y_train.count())
print('\n테스트 세트 레이블 값 분포 비율')
print(y_test.value_counts()/y_test.count())
# => 원본 데이터세트와 비슷하게 전체의 4%가 불만족으로 만들어짐.

학습 세트 Shape:(369,), 테스트 세트 Shape:(369,)
학습 세트 레이블 값 분포 비율
0    0.960997
1    0.039003
Name: TARGET, dtype: float64

테스트 세트 레이블 값 분포 비율
0    0.958169
1    0.041831
Name: TARGET, dtype: float64


### XGBoost 모델 학습과 하이퍼 파라미터 튜닝

In [228]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(n_estimators=200, random_state=101)

xgb_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric='auc', 
            eval_set=[(X_train, y_train), (X_test, y_test)])

xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1], average='macro')
print('ROC_AUC : {0:.4f}'.format(xgb_roc_score))

[0]	validation_0-auc:0.81646	validation_1-auc:0.80893
[1]	validation_0-auc:0.83208	validation_1-auc:0.82233
[2]	validation_0-auc:0.83861	validation_1-auc:0.82762
[3]	validation_0-auc:0.84431	validation_1-auc:0.83149
[4]	validation_0-auc:0.84806	validation_1-auc:0.83237
[5]	validation_0-auc:0.85336	validation_1-auc:0.83450
[6]	validation_0-auc:0.85604	validation_1-auc:0.83574
[7]	validation_0-auc:0.85936	validation_1-auc:0.83565
[8]	validation_0-auc:0.86152	validation_1-auc:0.83553
[9]	validation_0-auc:0.86557	validation_1-auc:0.83510
[10]	validation_0-auc:0.86794	validation_1-auc:0.83532
[11]	validation_0-auc:0.87147	validation_1-auc:0.83553
[12]	validation_0-auc:0.87381	validation_1-auc:0.83644
[13]	validation_0-auc:0.87672	validation_1-auc:0.83683
[14]	validation_0-auc:0.87859	validation_1-auc:0.83629
[15]	validation_0-auc:0.88031	validation_1-auc:0.83524
[16]	validation_0-auc:0.88199	validation_1-auc:0.83605
[17]	validation_0-auc:0.88378	validation_1-auc:0.83675
[18]	validation_0-au

In [None]:
# 똑같이 gridsearchCV 적용 가능

### LightGBM 모델 학습과 하이퍼 파리미터 튜닝

In [230]:
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(n_estimator=500, random_state=101)

lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_test, y_test)],
            eval_metric='auc', verbose=True)

lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:, 1], average='macro')
print('ROC_AUC:{0:.4f}'.format(lgbm_roc_score))

[1]	valid_0's auc: 0.812002	valid_0's binary_logloss: 0.165688
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.821776	valid_0's binary_logloss: 0.160575
[3]	valid_0's auc: 0.823584	valid_0's binary_logloss: 0.156802
[4]	valid_0's auc: 0.828142	valid_0's binary_logloss: 0.154087
[5]	valid_0's auc: 0.828692	valid_0's binary_logloss: 0.152019
[6]	valid_0's auc: 0.830304	valid_0's binary_logloss: 0.150254
[7]	valid_0's auc: 0.830738	valid_0's binary_logloss: 0.148804
[8]	valid_0's auc: 0.831471	valid_0's binary_logloss: 0.147732
[9]	valid_0's auc: 0.831453	valid_0's binary_logloss: 0.14677
[10]	valid_0's auc: 0.831625	valid_0's binary_logloss: 0.145935
[11]	valid_0's auc: 0.831826	valid_0's binary_logloss: 0.145296
[12]	valid_0's auc: 0.83211	valid_0's binary_logloss: 0.144735
[13]	valid_0's auc: 0.832645	valid_0's binary_logloss: 0.144193
[14]	valid_0's auc: 0.833135	valid_0's binary_logloss: 0.143733
[15]	valid_0's auc: 0.83398	valid_0's binary_logloss:

## SMOTE 오버 샘플링 적용 후 모델 학습/예측/평가

In [3]:
from imblearn.over_sampling import SMOTE

ModuleNotFoundError: No module named 'imblearn'