# 사용할 라이브러리 로딩

In [1]:
import numpy as np # Numpy
import pandas as pd # Pandas
import matplotlib as mpl #Matplotlib 세팅용
import matplotlib.pyplot as plt # 시각화 도구
import seaborn as sns # 시각화 도구
from sklearn.model_selection import train_test_split # 데이터셋 분리
from sklearn.cluster import KMeans # 클러스터링
from sklearn.metrics import silhouette_score # 실루엣 점수
from xgboost import XGBClassifier  # XGBoostClassifier
import xgboost as xgb # XGBoost
from sklearn.ensemble import RandomForestClassifier # 랜덤 포레스트
from hyperopt import hp, STATUS_OK, fmin, tpe, Trials # 최적의 파람
from sklearn.model_selection import GridSearchCV # 그리드 서치
from sklearn.model_selection import cross_val_score # 교차 스코어
from sklearn.metrics import accuracy_score, precision_score # 평가 지표
from sklearn.metrics import recall_score, confusion_matrix, roc_auc_score, f1_score # 평가 지표


import warnings # 경고문 제거용


%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

# 한글 폰트 설정
mpl.rc('font', family='D2Coding')
# 유니코드에서 음수 부호 설정
mpl.rc('axes', unicode_minus = False)

warnings.filterwarnings('ignore')
sns.set(font="D2Coding", rc={"axes.unicode_minus":False}, style='darkgrid')
plt.rc('figure', figsize=(10,8))

# 데이터 로딩

In [2]:
data = pd.read_csv('C:/Users/admin/Desktop/sparta/train.csv')

## 데이터 탐색

In [3]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

## Cabin 결측값 처리
- 결측값 확인 후 우선 Cabin부터 채워넣기로 함
- PassengerId의 첫 4자리 숫자는 승객의 그룹을 의미함으로 Cabin 결측값 중 그룹이 있으면 그룹의 Cabin으로 채워넣음
- train뿐만 아니라 test에도 결측값이 있기에 같이 처리하기로 함
- 그렇게 처리하고 나온 파일이 하단의 파일

In [7]:
data = pd.read_excel('C:/Users/admin/Desktop/sparta/train_test_origin.xlsx')

In [8]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin1,Cabin2,Combi,Cabin3,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,0.0,B,0.0,B0,P,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0.0
1,0002_01,Earth,0.0,F,0.0,F0,S,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1.0
2,0003_01,Europa,0.0,A,0.0,A0,S,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0.0
3,0003_02,Europa,0.0,A,0.0,A0,S,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0.0
4,0004_01,Earth,0.0,F,1.0,F1,S,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1.0


In [9]:
# 결측값을 처리하면 Cabin을 나눠서 구분함
# Cabin은 섹터/방번호/측면으로 구분되어 있음
# Cabin1은 섹터
# Cabin2는 방번호
# Combi는 Cabin1+Cabin2
# Cabin3는 측면(P(ort)는 좌현, S(tarboard)는 우현)
data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin1', 'Cabin2', 'Combi',
       'Cabin3', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported'],
      dtype='object')

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12691 non-null  object 
 2   CryoSleep     12660 non-null  float64
 3   Cabin1        12804 non-null  object 
 4   Cabin2        12804 non-null  float64
 5   Combi         12804 non-null  object 
 6   Cabin3        12804 non-null  object 
 7   Cabin         12804 non-null  object 
 8   Destination   12704 non-null  object 
 9   Age           12700 non-null  float64
 10  VIP           12674 non-null  float64
 11  RoomService   12776 non-null  float64
 12  FoodCourt     12752 non-null  float64
 13  ShoppingMall  12760 non-null  float64
 14  Spa           12754 non-null  float64
 15  VRDeck        12766 non-null  float64
 16  Name          12676 non-null  object 
 17  Transported   8693 non-null   float64
dtypes: float64(10), object(8)


- test 데이터는 Kaggle에 제출해야하는 데이터이기에 target인 Transported가 전부 결측임

In [11]:
data.isna().sum()

PassengerId        0
HomePlanet       279
CryoSleep        310
Cabin1           166
Cabin2           166
Combi            166
Cabin3           166
Cabin            166
Destination      266
Age              270
VIP              296
RoomService      194
FoodCourt        218
ShoppingMall     210
Spa              216
VRDeck           204
Name             294
Transported     4277
dtype: int64

- 나머지 결측값을 처리하기 위해 클러스터링을 해보기로 함
- 클러스터링 하기 전에 전처리를 진행해야함

# 전처리

## 필요없는 feature 제거

In [12]:
# 분류하는데 필요없다고 예상되는 'PassengerId', 'Name' feature 제거
data.drop(['PassengerId', 'Name'], inplace=True, axis=1)

In [13]:
# 제거 확인
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    12691 non-null  object 
 1   CryoSleep     12660 non-null  float64
 2   Cabin1        12804 non-null  object 
 3   Cabin2        12804 non-null  float64
 4   Combi         12804 non-null  object 
 5   Cabin3        12804 non-null  object 
 6   Cabin         12804 non-null  object 
 7   Destination   12704 non-null  object 
 8   Age           12700 non-null  float64
 9   VIP           12674 non-null  float64
 10  RoomService   12776 non-null  float64
 11  FoodCourt     12752 non-null  float64
 12  ShoppingMall  12760 non-null  float64
 13  Spa           12754 non-null  float64
 14  VRDeck        12766 non-null  float64
 15  Transported   8693 non-null   float64
dtypes: float64(10), object(6)
memory usage: 1.6+ MB


## CryoSleep, VIP, Cabin3 boolean 타입으로 캐스팅

In [14]:
# Cabin3의 P(좌현)를 False으로 S(우현)를 True로 변경
data['Cabin3'].replace({'P': 'True','S': 'False'}, inplace=True)

In [15]:
# boolean으로 변환
data['CryoSleep'] = data['CryoSleep'].astype(bool)
data['VIP'] = data['VIP'].astype(bool)
data['Cabin3'] = data['Cabin3'].astype(bool)

In [16]:
# 변환 확인
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    12691 non-null  object 
 1   CryoSleep     12970 non-null  bool   
 2   Cabin1        12804 non-null  object 
 3   Cabin2        12804 non-null  float64
 4   Combi         12804 non-null  object 
 5   Cabin3        12970 non-null  bool   
 6   Cabin         12804 non-null  object 
 7   Destination   12704 non-null  object 
 8   Age           12700 non-null  float64
 9   VIP           12970 non-null  bool   
 10  RoomService   12776 non-null  float64
 11  FoodCourt     12752 non-null  float64
 12  ShoppingMall  12760 non-null  float64
 13  Spa           12754 non-null  float64
 14  VRDeck        12766 non-null  float64
 15  Transported   8693 non-null   float64
dtypes: bool(3), float64(8), object(5)
memory usage: 1.3+ MB


In [17]:
# target 값을 제외하고 다른 변수에 저장
df = data.iloc[:,:15]

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    12691 non-null  object 
 1   CryoSleep     12970 non-null  bool   
 2   Cabin1        12804 non-null  object 
 3   Cabin2        12804 non-null  float64
 4   Combi         12804 non-null  object 
 5   Cabin3        12970 non-null  bool   
 6   Cabin         12804 non-null  object 
 7   Destination   12704 non-null  object 
 8   Age           12700 non-null  float64
 9   VIP           12970 non-null  bool   
 10  RoomService   12776 non-null  float64
 11  FoodCourt     12752 non-null  float64
 12  ShoppingMall  12760 non-null  float64
 13  Spa           12754 non-null  float64
 14  VRDeck        12766 non-null  float64
dtypes: bool(3), float64(7), object(5)
memory usage: 1.2+ MB


## 클러스터링을 위해 다른 결측값들을 전부 제거

In [19]:
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11076 entries, 0 to 12969
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    11076 non-null  object 
 1   CryoSleep     11076 non-null  bool   
 2   Cabin1        11076 non-null  object 
 3   Cabin2        11076 non-null  float64
 4   Combi         11076 non-null  object 
 5   Cabin3        11076 non-null  bool   
 6   Cabin         11076 non-null  object 
 7   Destination   11076 non-null  object 
 8   Age           11076 non-null  float64
 9   VIP           11076 non-null  bool   
 10  RoomService   11076 non-null  float64
 11  FoodCourt     11076 non-null  float64
 12  ShoppingMall  11076 non-null  float64
 13  Spa           11076 non-null  float64
 14  VRDeck        11076 non-null  float64
dtypes: bool(3), float64(7), object(5)
memory usage: 1.1+ MB


## 원핫인코딩

In [20]:
# object 타입의 데이터들 더미화
train_encoding = pd.get_dummies(df['HomePlanet'])
df=df.drop('HomePlanet',axis=1)
df = df.join(train_encoding)

train_encoding = pd.get_dummies(df['Destination'])
# 기존의 팀명 컬러 삭제
df=df.drop('Destination',axis=1)
df = df.join(train_encoding)

train_encoding = pd.get_dummies(df['Cabin1'])
# 기존의 팀명 컬러 삭제
df=df.drop('Cabin1',axis=1)
df = df.join(train_encoding)

In [21]:
# 데이터 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11076 entries, 0 to 12969
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CryoSleep      11076 non-null  bool   
 1   Cabin2         11076 non-null  float64
 2   Combi          11076 non-null  object 
 3   Cabin3         11076 non-null  bool   
 4   Cabin          11076 non-null  object 
 5   Age            11076 non-null  float64
 6   VIP            11076 non-null  bool   
 7   RoomService    11076 non-null  float64
 8   FoodCourt      11076 non-null  float64
 9   ShoppingMall   11076 non-null  float64
 10  Spa            11076 non-null  float64
 11  VRDeck         11076 non-null  float64
 12  Earth          11076 non-null  uint8  
 13  Europa         11076 non-null  uint8  
 14  Mars           11076 non-null  uint8  
 15  55 Cancri e    11076 non-null  uint8  
 16  PSO J318.5-22  11076 non-null  uint8  
 17  TRAPPIST-1e    11076 non-null  uint8  
 18  A     

## 스케일링

In [22]:
# 스케일링을 위한 함수 생성
col = ['Cabin2', 'Age', 'RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
def data_scaled(df, col):
    for i in col:
        data_mean = df[i].mean()
        data_std = df[i].std()
        scaled = (df[i]-data_mean)/data_std
        df[i]=scaled
    return df

In [23]:
data_scaled(df, col)

Unnamed: 0,CryoSleep,Cabin2,Combi,Cabin3,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,PSO J318.5-22,TRAPPIST-1e,A,B,C,D,E,F,G,T
0,False,-1.171058,B0,True,B/0/P,0.707877,False,-0.334616,-0.282674,-0.285975,...,0,1,0,1,0,0,0,0,0,0
1,False,-1.171058,F0,True,F/0/S,-0.329018,False,-0.166861,-0.277057,-0.244125,...,0,1,0,0,0,0,0,1,0,0
2,False,-1.171058,A0,True,A/0/S,2.021278,True,-0.268437,1.949128,-0.285975,...,0,1,1,0,0,0,0,0,0,0
3,False,-1.171058,A0,True,A/0/S,0.293119,False,-0.334616,0.518053,0.335083,...,0,1,1,0,0,0,0,0,0,0
4,False,-1.169112,F1,True,F/1/S,-0.882029,False,0.131712,-0.238987,-0.033199,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12962,True,1.737845,G1495,True,G/1495/S,0.984383,False,-0.334616,-0.282674,-0.285975,...,0,1,0,0,0,0,0,0,1,0
12963,False,-0.630138,D278,True,D/278/S,0.984383,False,-0.262281,-0.282674,6.160647,...,0,1,0,0,0,1,0,0,0,0
12964,False,2.323517,F1796,True,F/1796/S,0.777004,False,-0.334616,0.257177,-0.285975,...,0,1,0,0,0,0,0,1,0,0
12965,True,1.739790,G1496,True,G/1496/S,0.362246,False,-0.334616,-0.282674,-0.285975,...,0,1,0,0,0,0,0,0,1,0


In [24]:
df.head()

Unnamed: 0,CryoSleep,Cabin2,Combi,Cabin3,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,PSO J318.5-22,TRAPPIST-1e,A,B,C,D,E,F,G,T
0,False,-1.171058,B0,True,B/0/P,0.707877,False,-0.334616,-0.282674,-0.285975,...,0,1,0,1,0,0,0,0,0,0
1,False,-1.171058,F0,True,F/0/S,-0.329018,False,-0.166861,-0.277057,-0.244125,...,0,1,0,0,0,0,0,1,0,0
2,False,-1.171058,A0,True,A/0/S,2.021278,True,-0.268437,1.949128,-0.285975,...,0,1,1,0,0,0,0,0,0,0
3,False,-1.171058,A0,True,A/0/S,0.293119,False,-0.334616,0.518053,0.335083,...,0,1,1,0,0,0,0,0,0,0
4,False,-1.169112,F1,True,F/1/S,-0.882029,False,0.131712,-0.238987,-0.033199,...,0,1,0,0,0,0,0,1,0,0


## 클러스터링
- 필요없는 feature 추가로 제거한 후 진행

In [25]:
dt = df.drop(['Combi', 'Cabin'], axis=1)

In [26]:
k_range = range(2,30)

best_k = -1
best_silhouette_score = -1

for k in k_range:
    km = KMeans(n_clusters = k, random_state=109)
    km.fit(dt)
    clusters= km.predict(dt)
    
    score = silhouette_score(dt, clusters)
    
    print('k: {}, score: {}'.format(k, score))
    if score > best_silhouette_score:
        best_k = k
        best_silhouette_score = score
        
print('\n best K: {}, best Score: {}'.format(best_k, best_silhouette_score))

k: 2, score: 0.39969018335468287
k: 3, score: 0.14185156575717311
k: 4, score: 0.15151784045362093
k: 5, score: 0.1559061192440089
k: 6, score: 0.13272243195725927
k: 7, score: 0.14264793143217322
k: 8, score: 0.1523939867296424
k: 9, score: 0.15855572438088006
k: 10, score: 0.16848717649649886
k: 11, score: 0.1719417766830233
k: 12, score: 0.1767525504160753
k: 13, score: 0.15301345015328918
k: 14, score: 0.16995625293762828
k: 15, score: 0.17547035906816716
k: 16, score: 0.15851569352600384
k: 17, score: 0.16866166334971966
k: 18, score: 0.17945282042894817
k: 19, score: 0.1782550670939478
k: 20, score: 0.17939509762456127
k: 21, score: 0.17489397195225823
k: 22, score: 0.17714857616304974
k: 23, score: 0.1736955166982367
k: 24, score: 0.1745671601499613
k: 25, score: 0.17714199923930266
k: 26, score: 0.17156985331968433
k: 27, score: 0.17462637551984989
k: 28, score: 0.17879420423015113
k: 29, score: 0.18481786330498226

 best K: 2, best Score: 0.39969018335468287


- 군집화를 시켜보니 실루엣 점수가 너무 낮아서 이를 포기하고 feature 별로 분류하여 결측값을 채우기로 함

## CryoSleep 결측값 채우기
- XGBoost 활용하기

### 훈련셋 테스트셋 검증셋 분리

In [27]:
X_train, X_test, y_train, y_test = train_test_split(dt.drop(['CryoSleep'],axis=1),dt.CryoSleep,
                                                    random_state=109)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)

### 모델 생성(그리드서치)

In [28]:
# xgbo = xgb.XGBClassifier()

# params = {
#     'max_depth':[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, None],
#     'learning_rate':[0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4],
#     'gamma':[0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5],
#     'random_state':[109]
# }

# gs = GridSearchCV(xgbo, param_grid = params, cv = 3, refit = True, n_jobs=-1)
# gs.fit(X_train, y_train)

### 결과확인

In [29]:
# model = gs.best_estimator_
# print(model.score(X_train, y_train))
# print(model.score(X_test, y_test))
# print(model.score(X_val, y_val))

In [30]:
# # 최적의 파라미터값
# print(gs.best_params_)

In [31]:
xgbo = xgb.XGBClassifier(gamma=4, learning_rate=0.3, max_depth=11, random_state=109)
xgbo.fit(X_train, y_train)

In [32]:
train_pred = xgbo.predict(X_train)
train_proba = xgbo.predict_proba(X_train)

test_pred = xgbo.predict(X_test)
test_proba = xgbo.predict_proba(X_test)

val_pred = xgbo.predict(X_val)
val_proba = xgbo.predict_proba(X_val)

In [33]:
# 평가용 함수
def  get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
#     roc_auc = roc_auc_score(y_test, pred_proba)
    
    print('오차 행렬')
    print(confusion)
 
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, \
    재현율: {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1))

#### 훈련셋 평가

In [34]:
get_clf_eval(y_train, train_pred, train_proba)

오차 행렬
[[3614  180]
 [ 155 2281]]
정확도: 0.9462, 정밀도: 0.9269,     재현율: 0.9364, F1: 0.9316


#### 테스트셋 평가

In [35]:
get_clf_eval(y_test, test_pred, test_proba)

오차 행렬
[[1590   96]
 [  69 1014]]
정확도: 0.9404, 정밀도: 0.9135,     재현율: 0.9363, F1: 0.9248


#### 검증셋 평가

In [36]:
get_clf_eval(y_val, val_pred, val_proba)

오차 행렬
[[1175   64]
 [  63  775]]
정확도: 0.9389, 정밀도: 0.9237,     재현율: 0.9248, F1: 0.9243


- CryoSleep XGBoost 모델은 0.93이상의 정확도와 다른 지표도 좋아 결측값 예측에 사용하기로 함

# HomePlanet 결측치 채우기
## 사본 df생성

In [72]:
# target 값을 제외하고 다른 변수에 저장
hp = data.iloc[:,:15]
# 그 외 사용하지 않는 피처 삭제
hp = hp.drop(['Cabin','Combi'], axis=1)

In [73]:
hp

Unnamed: 0,HomePlanet,CryoSleep,Cabin1,Cabin2,Cabin3,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,B,0.0,True,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,F,0.0,True,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,Europa,False,A,0.0,True,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,A,0.0,True,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,F,1.0,True,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,Earth,True,G,1496.0,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0
12966,Earth,False,,,True,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0
12967,Mars,True,D,296.0,True,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0
12968,Europa,False,D,297.0,True,,,False,0.0,2680.0,0.0,0.0,523.0


In [74]:
hp.dropna(inplace=True)
hp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11076 entries, 0 to 12969
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    11076 non-null  object 
 1   CryoSleep     11076 non-null  bool   
 2   Cabin1        11076 non-null  object 
 3   Cabin2        11076 non-null  float64
 4   Cabin3        11076 non-null  bool   
 5   Destination   11076 non-null  object 
 6   Age           11076 non-null  float64
 7   VIP           11076 non-null  bool   
 8   RoomService   11076 non-null  float64
 9   FoodCourt     11076 non-null  float64
 10  ShoppingMall  11076 non-null  float64
 11  Spa           11076 non-null  float64
 12  VRDeck        11076 non-null  float64
dtypes: bool(3), float64(7), object(3)
memory usage: 984.3+ KB


In [75]:
# homeplanet이 타깃으로 할거라서 홈플레닛 값을 0,1,2로 치환하여 사용
hp.HomePlanet = hp['HomePlanet'].map({'Earth':0, 'Europa':1, 'Mars':2})

In [41]:
hp.HomePlanet.unique() # 변경 확인

array([1, 0, 2], dtype=int64)

## 원-핫 인코딩

In [76]:
# 원-핫 인코딩 (cabin1, destination)
## Cabin1
encoding = pd.get_dummies(hp.Cabin1)
hp = hp.drop('Cabin1', axis =1) # 기존 삭제
hp = hp.join(encoding) # 적용
## Destination
encoding = pd.get_dummies(hp.Destination)
hp = hp.drop('Destination', axis =1)
hp = hp.join(encoding)

In [77]:
hp.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin2,Cabin3,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,B,C,D,E,F,G,T,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,1,False,0.0,True,39.0,False,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,1
1,0,False,0.0,True,24.0,False,109.0,9.0,25.0,549.0,...,0,0,0,0,1,0,0,0,0,1
2,1,False,0.0,True,58.0,True,43.0,3576.0,0.0,6715.0,...,0,0,0,0,0,0,0,0,0,1
3,1,False,0.0,True,33.0,False,0.0,1283.0,371.0,3329.0,...,0,0,0,0,0,0,0,0,0,1
4,0,False,1.0,True,16.0,False,303.0,70.0,151.0,565.0,...,0,0,0,0,1,0,0,0,0,1


## 스케일링

In [78]:
# 위에 정의된 스케일링 함수 호출
data_scaled(hp, col)

Unnamed: 0,HomePlanet,CryoSleep,Cabin2,Cabin3,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,B,C,D,E,F,G,T,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,1,False,-1.171058,True,0.707877,False,-0.334616,-0.282674,-0.285975,-0.271173,...,1,0,0,0,0,0,0,0,0,1
1,0,False,-1.171058,True,-0.329018,False,-0.166861,-0.277057,-0.244125,0.217775,...,0,0,0,0,1,0,0,0,0,1
2,1,False,-1.171058,True,2.021278,True,-0.268437,1.949128,-0.285975,5.709312,...,0,0,0,0,0,0,0,0,0,1
3,1,False,-1.171058,True,0.293119,False,-0.334616,0.518053,0.335083,2.693687,...,0,0,0,0,0,0,0,0,0,1
4,0,False,-1.169112,True,-0.882029,False,0.131712,-0.238987,-0.033199,0.232025,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12962,0,True,1.737845,True,0.984383,False,-0.334616,-0.282674,-0.285975,-0.271173,...,0,0,0,0,0,1,0,0,0,1
12963,2,False,-0.630138,True,0.984383,False,-0.262281,-0.282674,6.160647,-0.271173,...,0,0,1,0,0,0,0,0,0,1
12964,0,False,2.323517,True,0.777004,False,-0.334616,0.257177,-0.285975,-0.268502,...,0,0,0,0,1,0,0,0,0,1
12965,0,True,1.739790,True,0.362246,False,-0.334616,-0.282674,-0.285975,-0.271173,...,0,0,0,0,0,1,0,0,0,1


## 모델링_랜덤포레스트

In [45]:
# 데이터 타깃 분리
hp_data = hp.drop('HomePlanet', axis=1)
hp_label = hp['HomePlanet']

In [46]:
# 임시 모델 설정
rfc = RandomForestClassifier()
# 그리드 서치
#grid = {
#    'n_estimators': [50,90,100,150,200, 250],
#    'max_depth': [3,5,7,9,13,15],
#    'min_samples_leaf':[3,5,7,9,13,15],
#    'min_samples_split': [3,5,7,9,13,15]
#}
# 그리드 객체
#rfc_grid = GridSearchCV(rfc, param_grid = grid, scoring = 'accuracy', cv=5, n_jobs=-1, 
                       verbose =1)
# fitting
#rfc_grid.fit(hp_data, hp_label)
#print('최고 평균 정확도 : {}'.format(rfc_grid.best_score_))
#print('최고 파라미터: {}', rfc_grid.best_params_)

IndentationError: unexpected indent (Temp/ipykernel_17356/1118589729.py, line 5)

In [47]:
# 최적의 파라미터를 이용한 모델링
rfc_model = RandomForestClassifier(n_estimators=200, max_depth=15, 
                                   min_samples_leaf=3, min_samples_split=9, random_state=109)

# fitting
rfc_model.fit(hp_data, hp_label)

# pred 
rfc_pred = rfc_model.predict(hp_data)
print('정확도 : ', accuracy_score(hp_label, rfc_pred))

정확도 :  0.9656915854098953


In [48]:
# 혼돈행렬
confusion_matrix(hp_label, rfc_pred)

array([[5830,    5,   98],
       [  13, 2742,   46],
       [ 170,   48, 2124]], dtype=int64)

### 세트 분리 후 모델링

In [50]:
X_train, X_test, y_train, y_test = train_test_split(
hp.drop('HomePlanet', axis =1),hp['HomePlanet'], random_state =109) 

In [51]:
rf_clf = RandomForestClassifier()
grid = {
    'n_estimators': [50,90,100,150,200, 250],
    'max_depth': [3,5,7,9,13,15],
    'min_samples_leaf':[3,5,7,9,13,15],
    'min_samples_split': [3,5,7,9,13,15]}

clf_grid = GridSearchCV(rf_clf, param_grid = grid, scoring='accuracy', verbose=1, cv= 5, n_jobs=-1)

clf_grid.fit(X_train, y_train)


Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


NameError: name 'rfc_grid' is not defined

In [52]:
print('최고 평균 정확도 : {}'.format(clf_grid.best_score_))
print('최고 파라미터: {}', clf_grid.best_params_)

최고 평균 정확도 : 0.9388473155298411
최고 파라미터: {} {'max_depth': 15, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 150}


In [53]:
# 최적의 파라미터를 이용한 모델링
clf_model = RandomForestClassifier(n_estimators=150, max_depth=15, 
                                   min_samples_leaf=3, min_samples_split=5, random_state=109)

# fitting
clf_model.fit(X_train, y_train)

# pred 
clf_pred = clf_model.predict(X_test)
print('정확도 : ', accuracy_score(y_test, clf_pred))

정확도 :  0.9461899602744673


In [54]:
confusion_matrix(y_test, clf_pred)

array([[1446,    1,   30],
       [   5,  694,   11],
       [  87,   15,  480]], dtype=int64)

In [98]:
# Mars의 예측 정확도가 다소 낮은걸 확인
480/(87+15+480)

0.8247422680412371

## 모델링_XGBoost

In [88]:
# hyperopt 적용을 위한 설정 값 담기
xgb_search_space = {'max_depth': hp.quniform('max_depth', 5, 20, 1),
                    'min_child_weight': hp.quniform('min_child_weight', 1, 2, 1),
                    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
                    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)
               }

In [84]:
# 실행을 위한 함수 정의
def objective_func(search_space):
    xgb_clf = XGBClassifier(n_estimators=100, max_depth=int(search_space['max_depth']),
                            min_child_weight=int(search_space['min_child_weight']),
                            learning_rate=search_space['learning_rate'],
                            colsample_bytree=search_space['colsample_bytree'], 
                            eval_metric='logloss')
    accuracy = cross_val_score(xgb_clf, X_train, y_train, scoring='accuracy', cv=3)
    return {'loss':-1 * np.mean(accuracy), 'status': STATUS_OK}


In [92]:
# 최적의 파라미터 찾기
trial_val = Trials()
best = fmin(fn=objective_func,
            space=xgb_search_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trial_val, rstate=np.random.default_rng(seed=9))
print('best:', best)

100%|███████████████████████████████████████████████| 50/50 [02:04<00:00,  2.49s/trial, best loss: -0.9445046346454796]
best: {'colsample_bytree': 0.5200342563927247, 'learning_rate': 0.12222072139496226, 'max_depth': 9.0, 'min_child_weight': 2.0}


In [93]:
# 모델링
xgb_model = XGBClassifier(n_estimators=200, learning_rate=round(best['learning_rate'], 5), 
                            max_depth=int(best['max_depth']), min_child_weight=int(best['min_child_weight']),
                            colsample_bytree=round(best['colsample_bytree'], 5)
                           )
xgb_model.fit(X_train, y_train)
pred= xgb_model.predict(X_test)
print('정확도 : ', accuracy_score(y_test, pred))
print('혼돈행렬 : \n', confusion_matrix(y_test, pred))

정확도 :  0.9472733838931022
혼돈행렬 :  [[1428    2   47]
 [   2  689   19]
 [  65   11  506]]
