In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
df_train = pd.read_csv('data/train.csv')

## 정보 확인

In [4]:
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [6]:
print(df_train.dtypes)

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object


## 결측치 다루기

In [7]:
# 1. 결측치 확인
df_train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [8]:
# 2. 1차 drop
df_train=df_train.drop(columns=['Name'])

In [9]:
df_train.head(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,True


In [10]:
# 3. HomePlanet, Destination, CryoSleep, VIP 결측치 제거(최빈값 채우기)
df_train[['Destination','HomePlanet','CryoSleep','VIP']].mode()
mode_list = ['Destination','HomePlanet','CryoSleep','VIP']
for i in mode_list:
    if i == 'Destination':
        df_train[i].fillna('TRAPPIST-1e', inplace = True)
    elif i == 'HomePlanet':
        df_train[i].fillna('Earth', inplace = True)
    elif i == 'CryoSleep':
        df_train[i].fillna(False, inplace = True)
    elif i == 'VIP':
        df_train[i].fillna(False, inplace = True)

In [11]:
df_train.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

In [12]:
# 4. Age 결측치 제거(중앙값 채우기)
df_train['Age'].fillna(df_train['Age'].median(), inplace = True)

In [13]:
df_train.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

In [14]:
# 5. 소비 항목 결측치 제거(최빈값 채우기) -> 수정
service_list=['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
for i in service_list:
    df_train[i].fillna(0, inplace = True)

In [15]:
df_train.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
dtype: int64

In [16]:
# 6. Cabin 결측치 제거
df_train['Cabin'].fillna('N/5000/N', inplace=True)

In [17]:
df_train.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64

In [18]:
df_train.dtypes

PassengerId      object
HomePlanet       object
CryoSleep          bool
Cabin            object
Destination      object
Age             float64
VIP                bool
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported        bool
dtype: object

## 데이터 전처리

In [19]:
# 1. Cabin 전처리
df_train[['Group_num', 'Id_num']] = df_train['PassengerId'].str.split('_', expand = True)
df_train[['Deck', 'Num', 'Side']] = df_train['Cabin'].str.split('/', expand = True)

In [20]:
df_train.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group_num,Id_num,Deck,Num,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,1,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,1,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,1,A,0,S


In [21]:
# 2. PassengerId 전처리(Group_size 라벨링)
Train_Group_numlist = list(df_train['Group_num'].value_counts().sort_index())
Train_Group_size = []
for i in range(len(Train_Group_numlist)):
    for j in range(Train_Group_numlist[i]):
        Train_Group_size.append(Train_Group_numlist[i])

In [22]:
# 3. Id_num(PassengerId) 2차 drop
df_train = df_train.drop(columns = ['Id_num'])

In [23]:
df_train.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group_num,Deck,Num,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,A,0,S


In [24]:
# 4. df_train에 Group_size 추가
df_train['Group_size'] = Train_Group_size

In [25]:
df_train.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group_num,Deck,Num,Side,Group_size
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,B,0,P,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,F,0,S,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,A,0,S,2


In [26]:
# 5. Cabin 위치 나누기
CabinNum_name = ['Cabin_loc1','Cabin_loc2','Cabin_loc3','Cabin_loc4','Cabin_loc5','Cabin_loc6','Cabin_loc7']
df_train['Num'] = df_train['Num'].astype(int)

df_train[CabinNum_name[0]] = (df_train['Num'] < 300)
df_train[CabinNum_name[1]] = ((df_train['Num'] >= 300) & (df_train['Num'] < 600))
df_train[CabinNum_name[2]] = ((df_train['Num'] >= 600) & (df_train['Num'] < 900))
df_train[CabinNum_name[3]] = ((df_train['Num'] >= 900) & (df_train['Num'] < 1200))
df_train[CabinNum_name[4]] = ((df_train['Num'] >= 1200) & (df_train['Num'] < 1500))
df_train[CabinNum_name[5]] = ((df_train['Num'] >= 1500) & (df_train['Num'] < 1800))
df_train[CabinNum_name[6]] = (df_train['Num'] >= 1800)

In [27]:
df_train.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Num,Side,Group_size,Cabin_loc1,Cabin_loc2,Cabin_loc3,Cabin_loc4,Cabin_loc5,Cabin_loc6,Cabin_loc7
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0,P,1,True,False,False,False,False,False,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,0,S,1,True,False,False,False,False,False,False
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,0,S,2,True,False,False,False,False,False,False


In [28]:
df_train['Luc_exp'] = df_train['RoomService'] + df_train['FoodCourt'] + df_train['ShoppingMall'] + df_train['Spa'] + df_train['VRDeck']

## one-hot encoding 

In [29]:
# 1. category 항목 인코딩
category_list = ['HomePlanet','Destination','Deck','Side']

train_hot_enc = pd.get_dummies(df_train[category_list])

train_final = df_train.copy()
train_final = pd.concat([train_final,train_hot_enc],axis = 1)

In [30]:
train_final.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_N,Deck_T,Side_N,Side_P,Side_S
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,0,0,0,1,0,0,0,0,0,1


In [31]:
# 3. 2차 drop
train_final2 = train_final.drop(columns = ['PassengerId', 'HomePlanet','Cabin', 'Destination','Group_num','Deck', 'Num', 'Side'])

In [32]:
# 5. X, Y 데이터 분리
Y_train_data = train_final2['Transported'].copy()
X_train_data = train_final2.drop(columns = ['Transported','VIP'])

In [33]:
Y_train_data.head(3)

0    False
1     True
2    False
Name: Transported, dtype: bool

In [34]:
X_train_data.head(3)

Unnamed: 0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group_size,Cabin_loc1,Cabin_loc2,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_N,Deck_T,Side_N,Side_P,Side_S
0,False,39.0,0.0,0.0,0.0,0.0,0.0,1,True,False,...,0,0,0,0,0,0,0,0,1,0
1,False,24.0,109.0,9.0,25.0,549.0,44.0,1,True,False,...,0,0,0,1,0,0,0,0,0,1
2,False,58.0,43.0,3576.0,0.0,6715.0,49.0,2,True,False,...,0,0,0,0,0,0,0,0,0,1


In [41]:
from sklearn.model_selection import train_test_split
X_train_final, X_test, Y_train_final, Y_test = train_test_split(X_train_data, Y_train_data, test_size = 0.3, random_state = 42)

In [44]:
X_train_final.isnull().sum()

CryoSleep                    0
Age                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
Group_size                   0
Cabin_loc1                   0
Cabin_loc2                   0
Cabin_loc3                   0
Cabin_loc4                   0
Cabin_loc5                   0
Cabin_loc6                   0
Cabin_loc7                   0
Luc_exp                      0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
Deck_A                       0
Deck_B                       0
Deck_C                       0
Deck_D                       0
Deck_E                       0
Deck_F                       0
Deck_G                       0
Deck_N                       0
Deck_T                       0
Side_N                       0
Side_P  

In [46]:
X_train_final.shape

(6085, 34)

## 학습 - 모델 : Catboost

In [66]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(iterations = 20000,eval_metric = 'Accuracy',verbose = 5000)

model.fit(X_train_final, Y_train_final)

Learning rate set to 0.001428
0:	learn: 0.7500411	total: 5.94ms	remaining: 1m 58s
5000:	learn: 0.8376335	total: 17.2s	remaining: 51.5s
10000:	learn: 0.8645850	total: 34.8s	remaining: 34.8s
15000:	learn: 0.8852917	total: 52.4s	remaining: 17.5s
19999:	learn: 0.9015612	total: 1m 10s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2b93fc661a0>

In [67]:
X_test_final = np.array(model.predict(X_test))

In [68]:
Y_test_final = np.array(Y_test)

In [69]:
idx = 0
count = 0

for j in X_test_final:
    if j == str(Y_test_final[idx]):
        count += 1
        idx += 1
    else:
        pass

print("Accuracy : ", (count/len(X_test_final)) * 100)

Accuracy :  49.424846625766875
