# 라이브러리 로딩

In [1]:
import numpy as np # Numpy
import pandas as pd # Pandas
import matplotlib as mpl #Matplotlib 세팅용
import matplotlib.pyplot as plt # 시각화 도구
import seaborn as sns # 시각화 도구
from sklearn.model_selection import train_test_split # 데이터셋 분리
from sklearn.model_selection import KFold # KFold 교차검증
from sklearn.cluster import KMeans # 클러스터링
from sklearn.metrics import silhouette_score # 실루엣 점수
import xgboost as xgb # XGBoost
from sklearn.model_selection import GridSearchCV # 그리드 서치
from sklearn.metrics import accuracy_score, precision_score # 평가 지표
from sklearn.metrics import recall_score, confusion_matrix, roc_auc_score, f1_score # 평가 지표
from imblearn.combine import SMOTEENN, SMOTETomek # 복합샘플링
from hyperopt import hp, fmin, tpe, Trials # HyperOPT

import warnings # 경고문 제거용


%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

# 한글 폰트 설정
mpl.rc('font', family='D2Coding')
# 유니코드에서 음수 부호 설정
mpl.rc('axes', unicode_minus = False)

warnings.filterwarnings('ignore')
sns.set(font="D2Coding", rc={"axes.unicode_minus":False}, style='darkgrid')
plt.rc('figure', figsize=(10,8))

# 데이터 불러오기

In [56]:
# Load the 'Train' sheet of the workbook into a DataFrame.
data = pd.read_excel('train_test_na_filled.xlsx', sheet_name='Train')

# 전처리

In [3]:
# Summarize column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   bool   
 3   Cabin1        8590 non-null   object 
 4   Cabin2        8590 non-null   float64
 5   Combi         8590 non-null   object 
 6   Cabin3        8590 non-null   object 
 7   Cabin         8590 non-null   object 
 8   Destination   8693 non-null   object 
 9   Age           8693 non-null   int64  
 10  VIP           8693 non-null   bool   
 11  RoomService   8693 non-null   int64  
 12  FoodCourt     8693 non-null   int64  
 13  ShoppingMall  8693 non-null   int64  
 14  Spa           8693 non-null   int64  
 15  VRDeck        8693 non-null   int64  
 16  Name          8493 non-null   object 
 17  Transported   8693 non-null   bool   
dtypes: bool(3), float64(1), int6

## 필요없는 features 제거

In [57]:
# Remove features that are not needed; `data` is modified in place.
unused_features = ['PassengerId', 'Cabin', 'Combi', 'Name']
data.drop(columns=unused_features, inplace=True)

In [6]:
# Re-check the column list and non-null counts after dropping columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   object 
 1   CryoSleep     8693 non-null   bool   
 2   Cabin1        8590 non-null   object 
 3   Cabin2        8590 non-null   float64
 4   Cabin3        8590 non-null   object 
 5   Destination   8693 non-null   object 
 6   Age           8693 non-null   int64  
 7   VIP           8693 non-null   bool   
 8   RoomService   8693 non-null   int64  
 9   FoodCourt     8693 non-null   int64  
 10  ShoppingMall  8693 non-null   int64  
 11  Spa           8693 non-null   int64  
 12  VRDeck        8693 non-null   int64  
 13  Transported   8693 non-null   bool   
dtypes: bool(3), float64(1), int64(6), object(4)
memory usage: 772.6+ KB


## 처리하기 힘든 결측값 제거

In [4]:
# Count missing values per column.
data.isna().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin1          103
Cabin2          103
Combi           103
Cabin3          103
Cabin           103
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
dtype: int64

In [58]:
# Drop every row that still contains a missing value (the Cabin-derived columns);
# `data` is modified in place.
data.dropna(axis='index', how='any', inplace=True)

## Boolean 캐스팅

In [59]:
# Convert Cabin3 to boolean: 'P' -> True, 'S' -> False.
# The mapping result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `data['Cabin3']`: under pandas Copy-on-Write
# that chained in-place call does not modify `data`, and the `astype(bool)`
# below would then turn every non-empty string into True.
data['Cabin3'] = data['Cabin3'].replace({'P': True, 'S': False}).astype(bool)

## 원핫인코딩

In [60]:
# One-hot encoding: each categorical column is replaced by its indicator
# columns, which are appended to the right of the frame in the order listed.
for source_col in ('HomePlanet', 'Destination', 'Cabin1'):
    train_encoding = pd.get_dummies(data[source_col])
    data = data.drop(columns=source_col).join(train_encoding)

In [12]:
# Inspect the frame after encoding (columns, non-null counts, dtypes).
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8590 entries, 0 to 8692
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CryoSleep      8590 non-null   bool   
 1   Cabin2         8590 non-null   float64
 2   Cabin3         8590 non-null   bool   
 3   Age            8590 non-null   int64  
 4   VIP            8590 non-null   bool   
 5   RoomService    8590 non-null   int64  
 6   FoodCourt      8590 non-null   int64  
 7   ShoppingMall   8590 non-null   int64  
 8   Spa            8590 non-null   int64  
 9   VRDeck         8590 non-null   int64  
 10  Transported    8590 non-null   bool   
 11  Earth          8590 non-null   uint8  
 12  Europa         8590 non-null   uint8  
 13  Mars           8590 non-null   uint8  
 14  55 Cancri e    8590 non-null   uint8  
 15  PSO J318.5-22  8590 non-null   uint8  
 16  TRAPPIST-1e    8590 non-null   uint8  
 17  A              8590 non-null   uint8  
 18  B       

## 스케일링

In [43]:
# Scaling
col = ['Cabin2', 'Age', 'RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
def data_scaled(df, col):
    """Standardize the given columns of ``df`` in place (z-score).

    Each listed column is replaced by ``(x - mean) / std``, where ``std`` is
    the value returned by ``Series.std()`` (pandas default, ddof=1).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is modified in place.
    col : iterable of str
        Names of the columns to standardize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for name in col:
        centered = df[name] - df[name].mean()
        spread = df[name].std()
        # A constant column has std == 0 and a single-row column has std == NaN;
        # dividing would fill the column with NaN/inf, so only centre it then.
        if spread > 0:
            df[name] = centered / spread
        else:
            df[name] = centered
    return df

In [44]:
# Standardize the columns listed in `col`. data_scaled writes the scaled values
# back into `data` and returns the same frame, which is what gets displayed.
data_scaled(data, col)

Unnamed: 0,CryoSleep,Cabin2,Cabin3,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,PSO J318.5-22,TRAPPIST-1e,A,B,C,D,E,F,G,T
0,False,-1.170228,True,0.712274,False,-0.333743,-0.280785,-0.282832,-0.271469,-0.263361,...,0,1,0,1,0,0,0,0,0,0
1,False,-1.170228,False,-0.332624,False,-0.168530,-0.275148,-0.241196,0.220460,-0.224520,...,0,1,0,0,0,0,0,1,0,0
2,False,-1.170228,False,2.035811,True,-0.268567,1.959032,-0.282832,5.745469,-0.220106,...,0,1,1,0,0,0,0,0,0,0
3,False,-1.170228,False,0.294315,False,-0.333743,0.522818,0.335048,2.711463,-0.092988,...,0,1,1,0,0,0,0,0,0,0
4,False,-1.168274,False,-0.889902,False,0.125518,-0.236941,-0.031350,0.234796,-0.261596,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,-0.978724,True,0.851594,True,-0.333743,3.990274,-0.282832,1.200732,-0.198037,...,0,0,1,0,0,0,0,0,0,0
8689,True,1.758999,False,-0.750583,False,-0.333743,-0.280785,-0.282832,-0.271469,-0.263361,...,1,0,0,0,0,0,0,0,1,0
8690,False,1.760953,False,-0.193304,False,-0.333743,-0.280785,2.834877,-0.270573,-0.263361,...,0,1,0,0,0,0,0,0,1,0
8691,False,0.017878,False,0.224655,False,-0.333743,0.376253,-0.282832,0.044835,2.592370,...,0,0,0,0,0,0,1,0,0,0


# XGBoost 모델 (HyperOpt 하이퍼파라미터 튜닝)

In [63]:
# xgb_search_space = {'max_depth': hp.quniform('max_depth', 5, 15, 1),
#                    'min_child_weight': hp.quniform('min_child_weight', 1, 20, 1),
#                    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.95),
#                    'learning_rate': hp.uniform('learning_rate', 0.01, 0.4),
#                    'gamma': hp.uniform('gamma', 0, 4)}

In [67]:
# # fmin()에서 호출 시 search_space 값으로 XGBClassifier 교차 검증 학습 후 -1 * roc_auc 평균 값을 반환
# def bin_objective_func(search_space):
#     xgb_clf = xgb.XGBClassifier(n_estimators=100, max_depth=int(search_space['max_depth']),
#                             min_child_weight=int(search_space['min_child_weight']),
#                             colsample_bytree=search_space['colsample_bytree'],
#                             learning_rate=search_space['learning_rate'],
#                             gamma=search_space['gamma'])
    
#     # 3개 k-fold 방식으로 평가된 roc_auc 지표를 담는 list
#     roc_auc_list = []
    
#     # 3개 k-fold 방식 적용
#     kf = KFold(n_splits=3)
    
#     # X_train을 다시 학습과 검증용 데이터로 분리
#     for tr_index, val_index in kf.split(X_train):
#         # kf.split(X_train)으로 추출된 학습과 검증 index 값으로 학습과 검증 데이터 세트 분리
#         X_tr, y_tr = X_train.iloc[tr_index], y_train.iloc[tr_index]
#         X_val, y_val = X_train.iloc[val_index], y_train.iloc[val_index]
        
#         # early stopping은 30회로 설정하고 추출된 학습과 검증 데이터로 XGBClassifier 학습 수행
#         xgb_clf.fit(X_tr, y_tr, early_stopping_rounds=30, eval_metric="auc",
#                    eval_set=[(X_tr, y_tr), (X_val, y_val)])
        
#         # 1로 예측한 확률값 추출 후 roc auc 계산하고 평균 roc auc 계산을 위해 list에 결과값 담음.
#         score = roc_auc_score(y_val, xgb_clf.predict_proba(X_val)[:,1])
#         roc_auc_list.append(score)
        
#     # 3개 k-fold로 계산된 roc_auc 값의 평균값을 반환하되,
#     # HyperOPT는 목적함수의 최솟값을 위한 입력값을 찾으므로 -1을 곱한 뒤 반환
#     return -1*np.mean(roc_auc_list)

In [74]:
# # 평가용 함수
# def  get_clf_eval(y_test, pred=None, pred_proba=None):
#     confusion = confusion_matrix(y_test, pred)
#     accuracy = accuracy_score(y_test, pred)
#     precision = precision_score(y_test, pred)
#     recall = recall_score(y_test, pred)
#     f1 = f1_score(y_test, pred)
# #     roc_auc = roc_auc_score(y_test, pred_proba)
    
#     print('오차 행렬')
#     print(confusion)
 
#     print('정확도: {0:.4f}, 정밀도: {1:.4f}, \
#     재현율: {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1))

In [76]:
# train_pred = xgbo.predict(X_train)
# train_proba = xgbo.predict_proba(X_train)

# test_pred = xgbo.predict(X_test)
# test_proba = xgbo.predict_proba(X_test)

# val_pred = xgbo.predict(X_val)
# val_proba = xgbo.predict_proba(X_val)

In [77]:
# get_clf_eval(y_train, train_pred, train_proba)

오차 행렬
[[2189  222]
 [ 261 2159]]
정확도: 0.9000, 정밀도: 0.9068,     재현율: 0.8921, F1: 0.8994


In [78]:
# get_clf_eval(y_test, test_pred, test_proba)

오차 행렬
[[854 213]
 [233 848]]
정확도: 0.7924, 정밀도: 0.7992,     재현율: 0.7845, F1: 0.7918


In [79]:
# get_clf_eval(y_val, val_pred, val_proba)

오차 행렬
[[632 147]
 [155 677]]
정확도: 0.8125, 정밀도: 0.8216,     재현율: 0.8137, F1: 0.8176
