# Spaceship Titanic

In [117]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
space_real = pd.read_csv('test.csv')

In [3]:
space = pd.read_csv('train.csv')

In [4]:
space.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
space.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
space.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [7]:
space.drop_duplicates(inplace=True)

In [8]:
space['HomePlanet'].value_counts()

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64

### 정보 파악하기

In [9]:
space.groupby('Transported')['Age'].mean()

Transported
False    29.922858
True     27.748834
Name: Age, dtype: float64

In [10]:
space.groupby(['VIP','Transported']).size()

VIP    Transported
False  False          4093
       True           4198
True   False           123
       True             76
dtype: int64

In [11]:
space.groupby(['VIP','HomePlanet']).size()

VIP    HomePlanet
False  Earth         4487
       Europa        1958
       Mars          1653
True   Europa         131
       Mars            63
dtype: int64

### space['HomePlanet'] null 값 바꾸기

In [12]:
space['Group'] = space['PassengerId'].str[:4]
#

In [13]:
home_group = space.groupby(['Group', 'HomePlanet']).size().reset_index()

In [14]:
home_group[home_group['Group'].duplicated()] # 중복된 값 없음. 유일한 값. == 그룹과 홈플래닛이 일치하다는 뜻.

Unnamed: 0,Group,HomePlanet,0


In [15]:
home_group.head()

Unnamed: 0,Group,HomePlanet,0
0,1,Europa,1
1,2,Earth,1
2,3,Europa,2
3,4,Earth,1
4,5,Earth,1


In [16]:
home_group[home_group['Group'] == '0064']

Unnamed: 0,Group,HomePlanet,0
41,64,Mars,1


In [17]:
space[space['HomePlanet'].isnull()][:3]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group
59,0064_02,,True,E/3/S,TRAPPIST-1e,33.0,False,0.0,0.0,,0.0,0.0,Colatz Keen,True,64
113,0119_01,,False,A/0/P,TRAPPIST-1e,39.0,False,0.0,2344.0,0.0,65.0,6898.0,Batan Coning,False,119
186,0210_01,,True,D/6/P,55 Cancri e,24.0,False,0.0,0.0,,0.0,0.0,Arraid Inicont,True,210


In [18]:
dic = {}
for k, v in zip(home_group['Group'], home_group['HomePlanet']):
    dic[k] = v
dic

{'0001': 'Europa',
 '0002': 'Earth',
 '0003': 'Europa',
 '0004': 'Earth',
 '0005': 'Earth',
 '0006': 'Earth',
 '0007': 'Earth',
 '0008': 'Europa',
 '0009': 'Mars',
 '0010': 'Earth',
 '0011': 'Earth',
 '0012': 'Earth',
 '0014': 'Mars',
 '0015': 'Earth',
 '0016': 'Mars',
 '0017': 'Earth',
 '0020': 'Earth',
 '0022': 'Mars',
 '0024': 'Europa',
 '0025': 'Earth',
 '0026': 'Europa',
 '0028': 'Mars',
 '0030': 'Earth',
 '0031': 'Mars',
 '0034': 'Europa',
 '0035': 'Mars',
 '0036': 'Earth',
 '0038': 'Earth',
 '0039': 'Earth',
 '0041': 'Earth',
 '0043': 'Europa',
 '0044': 'Earth',
 '0045': 'Mars',
 '0050': 'Earth',
 '0051': 'Earth',
 '0052': 'Earth',
 '0053': 'Earth',
 '0056': 'Europa',
 '0058': 'Earth',
 '0061': 'Earth',
 '0062': 'Earth',
 '0064': 'Mars',
 '0066': 'Earth',
 '0067': 'Earth',
 '0068': 'Mars',
 '0069': 'Earth',
 '0070': 'Earth',
 '0071': 'Earth',
 '0072': 'Earth',
 '0073': 'Mars',
 '0074': 'Europa',
 '0076': 'Mars',
 '0077': 'Mars',
 '0078': 'Europa',
 '0081': 'Earth',
 '0082': 'Mar

In [19]:
space['Group'].map(dic)

0       Europa
1        Earth
2       Europa
3       Europa
4        Earth
         ...  
8688    Europa
8689     Earth
8690     Earth
8691    Europa
8692    Europa
Name: Group, Length: 8693, dtype: object

In [20]:
space['HomePlanet'].fillna(space['Group'].map(dic), inplace=True)

In [21]:
space['HomePlanet'].isnull().sum()

111

In [22]:
space[space['HomePlanet'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group
186,0210_01,,True,D/6/P,55 Cancri e,24.0,False,0.0,0.0,,0.0,0.0,Arraid Inicont,True,0210
225,0242_01,,False,F/46/S,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,Almone Sté,False,0242
234,0251_01,,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,True,0251
274,0303_01,,True,G/41/S,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Oraryn Kirklander,True,0303
286,0315_01,,True,G/42/S,PSO J318.5-22,35.0,False,0.0,0.0,0.0,0.0,0.0,Adriet Valezaley,True,0315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8468,9043_01,,True,F/1848/P,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,0.0,0.0,Cobix Erle,True,9043
8515,9084_01,,False,E/582/P,TRAPPIST-1e,25.0,False,1258.0,0.0,22.0,19.0,0.0,Jurs Mone,False,9084
8666,9248_01,,False,F/1792/S,55 Cancri e,38.0,,28.0,1208.0,973.0,207.0,0.0,Gian Perle,True,9248
8674,9257_01,,False,F/1892/P,TRAPPIST-1e,13.0,False,39.0,0.0,1085.0,24.0,0.0,Ties Apple,False,9257


In [23]:
space['last_name'] = space.Name.str.split(' ').str[-1]
space['last_name']

0         Ofracculy
1             Vines
2            Susent
3            Susent
4       Santantines
           ...     
8688      Noxnuther
8689      Mondalley
8690         Connon
8691      Hontichre
8692      Hontichre
Name: last_name, Length: 8693, dtype: object

In [24]:
# space['last_name'] = space['name_split'].str.get(1)
# space['last_name']

In [25]:
space[space['last_name'].duplicated()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,last_name
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003,Susent
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True,0006,Jacostaffey
10,0008_02,Europa,True,B/1/P,TRAPPIST-1e,34.0,False,0.0,0.0,,0.0,0.0,Altardr Flatic,True,0008,Flatic
11,0008_03,Europa,False,B/1/P,55 Cancri e,45.0,False,39.0,7295.0,589.0,110.0,124.0,Wezena Flatic,True,0008,Flatic
20,0017_02,Earth,False,F/6/P,55 Cancri e,14.0,False,412.0,0.0,1.0,0.0,679.0,Philda Brighttt,False,0017,Brighttt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,Noxnuther
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,Mondalley
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,Connon
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,Hontichre


In [26]:
name_planet = space.groupby(['last_name','HomePlanet']).size().reset_index()

In [27]:
name_dic = {}
for k, v in zip(name_planet['last_name'], name_planet['HomePlanet']):
    name_dic[k] = v
name_dic

{'Acobson': 'Earth',
 'Acobsond': 'Earth',
 'Adavisons': 'Earth',
 'Adkinson': 'Earth',
 'Admingried': 'Europa',
 'Aginge': 'Europa',
 'Ailled': 'Europa',
 'Aillyber': 'Europa',
 'Aiming': 'Europa',
 'Ainatint': 'Europa',
 'Aindlylid': 'Europa',
 'Ainserfle': 'Europa',
 'Airdring': 'Europa',
 'Aivering': 'Europa',
 'Alaring': 'Europa',
 'Alaxed': 'Europa',
 'Alberts': 'Earth',
 'Alcemblery': 'Europa',
 'Alenat': 'Europa',
 'Alenter': 'Europa',
 'Alentonway': 'Earth',
 'Alest': 'Earth',
 'Alfordonard': 'Earth',
 'Alindiveng': 'Europa',
 'Alldson': 'Earth',
 'Aloubtled': 'Europa',
 'Alshipson': 'Earth',
 'Alvasquez': 'Earth',
 'Alvercal': 'Europa',
 'Alvesssidy': 'Europa',
 'Ambleetive': 'Europa',
 'Ambleeve': 'Europa',
 'Amblereld': 'Europa',
 'Ametic': 'Europa',
 'Amincrerus': 'Europa',
 'Amonsmane': 'Europa',
 'Amonysidle': 'Europa',
 'Amoutake': 'Europa',
 'Amspring': 'Europa',
 'Anake': 'Mars',
 'Anche': 'Mars',
 'Ancontaked': 'Europa',
 'Ancy': 'Mars',
 'Andackson': 'Earth',
 'Ande

In [28]:
space['HomePlanet'].fillna(space['last_name'].map(name_dic), inplace=True)

In [29]:
space['HomePlanet'].isnull().sum()

12

In [30]:
nan_planet = space[space['HomePlanet'].isnull()]
nan_planet

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,last_name
225,0242_01,,False,F/46/S,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,Almone Sté,False,242,Sté
234,0251_01,,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,True,251,Amsive
807,0853_01,,True,A/9/S,55 Cancri e,38.0,False,0.0,0.0,0.0,0.0,0.0,Hamelik Ageurante,True,853,Ageurante
1855,1978_01,,True,G/311/S,TRAPPIST-1e,19.0,False,0.0,0.0,0.0,0.0,0.0,,True,1978,
2274,2443_01,,False,D/72/P,TRAPPIST-1e,31.0,False,1458.0,421.0,76.0,0.0,0.0,,False,2443,
2631,2817_01,,False,F/584/P,TRAPPIST-1e,25.0,False,237.0,0.0,910.0,0.0,12.0,Sealfs Sutty,False,2817,Sutty
3091,3331_01,,False,F/631/S,TRAPPIST-1e,40.0,False,666.0,4.0,83.0,0.0,50.0,,True,3331,
4548,4840_01,,True,F/915/S,TRAPPIST-1e,36.0,False,0.0,0.0,,0.0,0.0,,True,4840,
5252,5603_01,,False,E/365/S,TRAPPIST-1e,34.0,False,170.0,1256.0,0.0,3926.0,7121.0,Kocha Cluitty,False,5603,Cluitty
5634,5989_01,,False,F/1141/S,TRAPPIST-1e,20.0,False,0.0,0.0,,703.0,0.0,Darrie Holcompton,False,5989,Holcompton


In [31]:
space.groupby(['Destination', 'HomePlanet']).size()

Destination    HomePlanet
55 Cancri e    Earth          700
               Europa         902
               Mars           196
PSO J318.5-22  Earth          725
               Europa          19
               Mars            51
TRAPPIST-1e    Earth         3177
               Europa        1214
               Mars          1515
dtype: int64

In [32]:
name_dic = {}
for k, v in zip(name_planet['last_name'], name_planet['HomePlanet']):
    name_dic[k] = v
name_dic

{'Acobson': 'Earth',
 'Acobsond': 'Earth',
 'Adavisons': 'Earth',
 'Adkinson': 'Earth',
 'Admingried': 'Europa',
 'Aginge': 'Europa',
 'Ailled': 'Europa',
 'Aillyber': 'Europa',
 'Aiming': 'Europa',
 'Ainatint': 'Europa',
 'Aindlylid': 'Europa',
 'Ainserfle': 'Europa',
 'Airdring': 'Europa',
 'Aivering': 'Europa',
 'Alaring': 'Europa',
 'Alaxed': 'Europa',
 'Alberts': 'Earth',
 'Alcemblery': 'Europa',
 'Alenat': 'Europa',
 'Alenter': 'Europa',
 'Alentonway': 'Earth',
 'Alest': 'Earth',
 'Alfordonard': 'Earth',
 'Alindiveng': 'Europa',
 'Alldson': 'Earth',
 'Aloubtled': 'Europa',
 'Alshipson': 'Earth',
 'Alvasquez': 'Earth',
 'Alvercal': 'Europa',
 'Alvesssidy': 'Europa',
 'Ambleetive': 'Europa',
 'Ambleeve': 'Europa',
 'Amblereld': 'Europa',
 'Ametic': 'Europa',
 'Amincrerus': 'Europa',
 'Amonsmane': 'Europa',
 'Amonysidle': 'Europa',
 'Amoutake': 'Europa',
 'Amspring': 'Europa',
 'Anake': 'Mars',
 'Anche': 'Mars',
 'Ancontaked': 'Europa',
 'Ancy': 'Mars',
 'Andackson': 'Earth',
 'Ande

In [33]:
p_dict = {'55 Cancri e' : 'Europa', 'PSO J318.5-22' : 'Earth', 'TRAPPIST-1e' : 'Earth'}

In [34]:
space['HomePlanet'].fillna(space['Destination'].map(p_dict), inplace=True)

In [35]:
space['HomePlanet'].isnull().sum()

0

## VIP Null처리

In [36]:
space['VIP'].value_counts(dropna=False)

False    8291
NaN       203
True      199
Name: VIP, dtype: int64

In [37]:
space['VIP'].fillna(False, inplace=True)

In [38]:
space['VIP'].isnull().sum()

0

In [40]:
fee_list = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [78]:
for col in fee_list :
    space[col] = space[col].replace(np.nan, 0.0)

In [99]:
space['fee'] = space['RoomService'] + space['FoodCourt'] + space['ShoppingMall'] + space['Spa'] + space['VRDeck']

In [100]:
space.isnull().sum()

HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Group           0
fee             0
col             0
deck            0
side            0
dtype: int64

-------

In [43]:
space['Cabin']

0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8693, dtype: object

In [44]:
space['deck'] = space['Cabin'].str.split('/').str[0]

In [45]:
space[space['deck'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,last_name,fee,col,deck
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False,0012,Pooles,908.0,0.0,
93,0101_01,Mars,True,,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Book Trad,True,0101,Trad,0.0,0.0,
103,0110_01,Europa,False,,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,Graviph Aloubtled,False,0110,Aloubtled,5109.0,764.0,
222,0239_01,Mars,False,,TRAPPIST-1e,37.0,False,637.0,0.0,0.0,92.0,319.0,Diedow Resty,False,0239,Resty,1048.0,319.0,
227,0244_01,Mars,True,,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Froos Sad,True,0244,Sad,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8209,8772_02,Europa,False,,55 Cancri e,53.0,False,0.0,1127.0,0.0,3939.0,,Naosura Motled,False,8772,Motled,,0.0,
8475,9057_01,Europa,False,,55 Cancri e,36.0,True,132.0,3479.0,0.0,3786.0,0.0,Coxan Statch,False,9057,Statch,7397.0,0.0,
8485,9069_03,Europa,True,,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,Bath Brakeng,True,9069,Brakeng,0.0,0.0,
8509,9081_03,Earth,True,,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Beula Clemondsey,False,9081,Clemondsey,0.0,0.0,


In [46]:
space.groupby(['HomePlanet', 'deck']).size()

HomePlanet  deck
Earth       D          1
            E        404
            F       1652
            G       2559
Europa      A        256
            B        779
            C        747
            D        192
            E        133
            T          5
Mars        D        285
            E        339
            F       1142
dtype: int64

In [47]:
space['deck'].fillna(space['HomePlanet'].map({'Earth': 'G', 'Europa': 'B', 'Mars' : 'F'}), inplace=True)

In [48]:
space['deck'].isnull().sum()

0

In [49]:
space['side'] = space['Cabin'].str.split('/').str[-1]

In [50]:
side_group = space.groupby(['Group', 'side']).size().reset_index()

In [51]:
side_group[side_group['Group'].duplicated()]

Unnamed: 0,Group,side,0


In [52]:
side_dic = {}

for key, value in zip(side_group['Group'], side_group['side']):
    side_dic[key] = value

In [53]:
space['side'].fillna(space['Group'].map(side_dic), inplace = True)

In [54]:
space['side'].isnull().sum()

99

In [55]:
space['side'].value_counts()

S    4343
P    4251
Name: side, dtype: int64

In [56]:
space['side'].fillna('S', inplace = True)

In [57]:
space['side'].isnull().sum()

0

In [58]:
space.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
Group             0
last_name       200
fee             908
col               0
deck              0
side              0
dtype: int64

In [59]:
dest_group = space.groupby(['Group', 'Destination']).size().reset_index()

In [62]:
dest_group.sort_values(0, ascending = False)

Unnamed: 0,Group,Destination,0
6225,8383,TRAPPIST-1e,7
1259,1709,TRAPPIST-1e,7
3175,4256,TRAPPIST-1e,7
4502,6030,TRAPPIST-1e,7
6513,8796,TRAPPIST-1e,7
...,...,...,...
2490,3365,PSO J318.5-22,1
2489,3364,55 Cancri e,1
2487,3363,PSO J318.5-22,1
2485,3361,PSO J318.5-22,1


In [66]:
dest_group=dest_group.drop_duplicates()

In [68]:
dest_dict = {}
for g, d in zip(dest_group['Group'], dest_group['Destination']) :
    dest_dict[g] = d

In [69]:
space['Destination'].fillna(space['Group'].map(dest_dict), inplace = True)

In [70]:
space['Destination'].isnull().sum()

103

In [72]:
space.groupby(['HomePlanet','Destination']).size().reset_index()

Unnamed: 0,HomePlanet,Destination,0
0,Earth,55 Cancri e,701
1,Earth,PSO J318.5-22,732
2,Earth,TRAPPIST-1e,3210
3,Europa,55 Cancri e,908
4,Europa,PSO J318.5-22,19
5,Europa,TRAPPIST-1e,1231
6,Mars,55 Cancri e,198
7,Mars,PSO J318.5-22,51
8,Mars,TRAPPIST-1e,1540


In [71]:
space['Destination'].value_counts()

TRAPPIST-1e      5981
55 Cancri e      1807
PSO J318.5-22     802
Name: Destination, dtype: int64

In [73]:
space['Destination'].fillna('TRAPPIST-1e' ,inplace=True)

In [75]:
space['Destination'].isnull().sum()

0

In [81]:
space[space['CryoSleep']==True]['fee'].mean()

0.0

In [84]:
space[space['CryoSleep'].isnull()]['fee']

92         0.0
98       703.0
104     2018.0
111        NaN
152      990.0
         ...  
8620       0.0
8651       0.0
8664       0.0
8675       NaN
8687    3540.0
Name: fee, Length: 217, dtype: float64

In [85]:
space['CryoSleep'] = np.where(space['fee'] > 0, False, True)

In [86]:
space['CryoSleep'].isnull().sum()

0

In [88]:
space.groupby('Transported')['Age'].median()

Transported
False    27.0
True     26.0
Name: Age, dtype: float64

In [89]:
space['Age'].fillna(space['Age'].median(), inplace=True)

In [91]:
space['Age'].isnull().sum()

0

In [92]:
space.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
Group             0
last_name       200
fee             908
col               0
deck              0
side              0
dtype: int64

In [94]:
del_col = ['PassengerId', 'Cabin', 'Name', 'last_name']

In [95]:
space = space.drop(del_col, axis=1)

In [98]:
space.isnull().sum()

HomePlanet        0
CryoSleep         0
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
Group             0
fee             908
col               0
deck              0
side              0
dtype: int64

In [104]:
space.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   object 
 1   CryoSleep     8693 non-null   bool   
 2   Destination   8693 non-null   object 
 3   Age           8693 non-null   float64
 4   VIP           8693 non-null   bool   
 5   RoomService   8693 non-null   float64
 6   FoodCourt     8693 non-null   float64
 7   ShoppingMall  8693 non-null   float64
 8   Spa           8693 non-null   float64
 9   VRDeck        8693 non-null   float64
 10  Transported   8693 non-null   bool   
 11  Group         8693 non-null   object 
 12  fee           8693 non-null   float64
 13  col           8693 non-null   float64
 14  deck          8693 non-null   object 
 15  side          8693 non-null   object 
dtypes: bool(3), float64(8), object(5)
memory usage: 1.2+ MB


In [106]:
space['HomePlanet'].unique()

array(['Europa', 'Earth', 'Mars'], dtype=object)

In [None]:
col_list = space[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']]

In [None]:
for col in col_list :
    #space[col] = np.where(space[col] == np.nan, 0, space[col])
    space[col] = space[col].replace(np.nan, 0.0)

In [None]:
space.isnull().sum()

In [None]:
space = space.astype({'PassengerId' : 'int', 'HomePlanet' : 'int', 'CryoSleep' : 'int'})

print(space.dtypes)

In [None]:
space.describe()

In [None]:
space['HomePlanet'].value_counts()

In [None]:
space['CryoSleep'].value_counts()

In [None]:
space['Destination'].value_counts()

In [None]:
space['Age'].value_counts()

In [None]:
space['VIP'].value_counts()

In [None]:
space['RoomService'].value_counts()

In [None]:
space['FoodCourt'].value_counts()

In [None]:
space['ShoppingMall'].value_counts()

In [None]:
space['FoodCourt'].value_counts()

In [None]:
space['VRDeck'].value_counts()

In [None]:
space['Transported'].value_counts()

In [None]:
space['Transported'] = space['Transported'].map(lambda x : 1 if x else 0)
space['Transported'].value_counts()

In [None]:
space.info()

## 학습

In [129]:
x = space.drop('Transported', axis=1)
y = space['Transported']
print(x.shape, y.shape)

(8693, 15) (8693,)


In [116]:
from sklearn.preprocessing import LabelEncoder

In [126]:
space.dtypes.reset_index()

Unnamed: 0,index,0
0,HomePlanet,int32
1,CryoSleep,bool
2,Destination,int32
3,Age,float64
4,VIP,bool
5,RoomService,float64
6,FoodCourt,float64
7,ShoppingMall,float64
8,Spa,float64
9,VRDeck,float64


In [125]:
space.dtypes.index

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported', 'Group',
       'fee', 'col', 'deck', 'side'],
      dtype='object')

In [119]:
cola = []
for col, types in zip(space.dtypes.index, space.dtypes.values):
    if (types == 'object') :
        cola.append(col)

In [121]:
for col in cola :
    label = LabelEncoder()
    space[col] = label.fit_transform(space[col])

In [None]:
with open(labelen, 'wb') as f:
    pickle.dump(labelen, f)

### DecisionTreeClassifier()

In [130]:
x = space.drop('Transported', axis=1)
y = space['Transported']
print(x.shape, y.shape)

(8693, 15) (8693,)


In [131]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [132]:
model.fit(x_train, y_train)
print('train : ', model.score(x_train, y_train))
print('test : ', model.score(x_test, y_test)) 

train :  0.999137187230371
test :  0.7561817136285222


In [133]:
model.predict(x_test)

array([False,  True,  True, ..., False,  True, False])

#### DecisionTreeClassifier() 트리 높이 제한

In [134]:
model2 = DecisionTreeClassifier(max_depth=4)
model2.fit(x_train, y_train)
print('train : ', model2.score(x_train, y_train))
print('test : ', model2.score(x_test, y_test)) 

train :  0.7845844118492954
test :  0.7584818861414606
