In [93]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')
#모델
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from xgboost import XGBClassifier
from joblib import dump,load


In [2]:
data=pd.read_csv('train.csv')
real=pd.read_csv('test.csv')
submit=pd.read_csv('sample_submission.csv')

In [3]:
data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [4]:
data.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
data.groupby(data.Transported)['Age'].mean()

Transported
False    29.922858
True     27.748834
Name: Age, dtype: float64

In [6]:
data.groupby(['VIP','Transported']).size()

VIP    Transported
False  False          4093
       True           4198
True   False           123
       True             76
dtype: int64

In [7]:
data.groupby(['Transported','HomePlanet']).size()

Transported  HomePlanet
False        Earth         2651
             Europa         727
             Mars           839
True         Earth         1951
             Europa        1404
             Mars           920
dtype: int64

In [8]:
data['Group']=data['PassengerId'].str[:4]

In [9]:
home_group=data.groupby(['Group','HomePlanet']).size().reset_index()
home_group

Unnamed: 0,Group,HomePlanet,0
0,0001,Europa,1
1,0002,Earth,1
2,0003,Europa,2
3,0004,Earth,1
4,0005,Earth,1
...,...,...,...
6102,9275,Europa,3
6103,9276,Europa,1
6104,9278,Earth,1
6105,9279,Earth,1


In [10]:
home_group['Group']

0       0001
1       0002
2       0003
3       0004
4       0005
        ... 
6102    9275
6103    9276
6104    9278
6105    9279
6106    9280
Name: Group, Length: 6107, dtype: object

In [11]:
home_group['HomePlanet']

0       Europa
1        Earth
2       Europa
3        Earth
4        Earth
         ...  
6102    Europa
6103    Europa
6104     Earth
6105     Earth
6106    Europa
Name: HomePlanet, Length: 6107, dtype: object

In [12]:
data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
Group             0
dtype: int64

In [13]:
home_group[0].sum()

8492

In [14]:
home_dict={}
for i,j  in zip(home_group['Group'],home_group['HomePlanet']):
    home_dict[i]=j

In [15]:
home_dict

{'0001': 'Europa',
 '0002': 'Earth',
 '0003': 'Europa',
 '0004': 'Earth',
 '0005': 'Earth',
 '0006': 'Earth',
 '0007': 'Earth',
 '0008': 'Europa',
 '0009': 'Mars',
 '0010': 'Earth',
 '0011': 'Earth',
 '0012': 'Earth',
 '0014': 'Mars',
 '0015': 'Earth',
 '0016': 'Mars',
 '0017': 'Earth',
 '0020': 'Earth',
 '0022': 'Mars',
 '0024': 'Europa',
 '0025': 'Earth',
 '0026': 'Europa',
 '0028': 'Mars',
 '0030': 'Earth',
 '0031': 'Mars',
 '0034': 'Europa',
 '0035': 'Mars',
 '0036': 'Earth',
 '0038': 'Earth',
 '0039': 'Earth',
 '0041': 'Earth',
 '0043': 'Europa',
 '0044': 'Earth',
 '0045': 'Mars',
 '0050': 'Earth',
 '0051': 'Earth',
 '0052': 'Earth',
 '0053': 'Earth',
 '0056': 'Europa',
 '0058': 'Earth',
 '0061': 'Earth',
 '0062': 'Earth',
 '0064': 'Mars',
 '0066': 'Earth',
 '0067': 'Earth',
 '0068': 'Mars',
 '0069': 'Earth',
 '0070': 'Earth',
 '0071': 'Earth',
 '0072': 'Earth',
 '0073': 'Mars',
 '0074': 'Europa',
 '0076': 'Mars',
 '0077': 'Mars',
 '0078': 'Europa',
 '0081': 'Earth',
 '0082': 'Mar

In [16]:
data['Group'].map(home_dict)

0       Europa
1        Earth
2       Europa
3       Europa
4        Earth
         ...  
8688    Europa
8689     Earth
8690     Earth
8691    Europa
8692    Europa
Name: Group, Length: 8693, dtype: object

In [17]:
data['HomePlanet'].fillna(data['Group'].map(home_dict),inplace=True)

In [18]:
data['HomePlanet'].isnull().sum()

111

In [19]:
data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280


In [20]:
data['Name2']=data.Name.str.split(' ').str[-1]
data['Name2']

0         Ofracculy
1             Vines
2            Susent
3            Susent
4       Santantines
           ...     
8688      Noxnuther
8689      Mondalley
8690         Connon
8691      Hontichre
8692      Hontichre
Name: Name2, Length: 8693, dtype: object

In [21]:
name_dict={}
for i,j  in zip(data['Name2'],home_group['HomePlanet']):
    name_dict[i]=j

In [22]:
data['HomePlanet'].fillna(data['Name2'].map(name_dict),inplace=True)

In [23]:
data['HomePlanet'].fillna('Earth',inplace=True)

In [24]:
data['HomePlanet'].isnull().sum()

0

In [25]:
data['VIP'].fillna(False,inplace=True)

In [26]:
data['VIP'].isnull().sum()

0

In [27]:
data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported', 'Group', 'Name2'],
      dtype='object')

In [28]:
data['Fee']=(data['RoomService']+ data['FoodCourt']+ data['ShoppingMall']+ data['Spa']+ data['VRDeck'])

In [29]:
fee_list=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [30]:
for col in fee_list:
    print(col)
    data[col].fillna(data[col].median(),inplace=True)

RoomService
FoodCourt
ShoppingMall
Spa
VRDeck


In [31]:
data['Fee'].fillna(data.RoomService.median(),inplace=True)

In [32]:
data['Fee'].isnull().sum()

0

In [33]:
data['Cabin']

0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8693, dtype: object

In [34]:
data['deck']=data['Cabin'].str.split('/').str[0]

In [35]:
data['side']=data['Cabin'].str.split('/').str[-1]

In [36]:
data.groupby(['HomePlanet','deck']).size()

HomePlanet  deck
Earth       B          1
            C          2
            D          3
            E        407
            F       1647
            G       2543
            T          1
Europa      A        255
            B        778
            C        744
            D        190
            E        131
            F          7
            G          7
            T          4
Mars        A          1
            C          1
            D        285
            E        338
            F       1140
            G          9
dtype: int64

In [37]:
data['deck'].fillna(data['HomePlanet'].map({'Earth':'G','Europa':'C','Mars':'F'}),inplace=True)

In [38]:
data['deck'].isnull().sum()

0

In [39]:
data['HomePlanet'].map({'Earth':'G','Europa':'C','Mars':'F'})

0       C
1       G
2       C
3       C
4       G
       ..
8688    C
8689    G
8690    G
8691    C
8692    C
Name: HomePlanet, Length: 8693, dtype: object

In [40]:
data.Group

0       0001
1       0002
2       0003
3       0003
4       0004
        ... 
8688    9276
8689    9278
8690    9279
8691    9280
8692    9280
Name: Group, Length: 8693, dtype: object

In [41]:
side_group=data.groupby(['Group','side']).size().reset_index()

In [42]:
side_group

Unnamed: 0,Group,side,0
0,0001,P,1
1,0002,S,1
2,0003,S,2
3,0004,S,1
4,0005,P,1
...,...,...,...
6113,9275,P,3
6114,9276,P,1
6115,9278,S,1
6116,9279,S,1


In [43]:
side_dict={}
for i,j in zip(side_group['Group'],side_group['side']):
    side_dict[i]=j

In [44]:
data['side'].fillna(data['Group'].map(side_dict),inplace=True)

In [45]:
data['side'].isnull().sum()

99

In [46]:
data['side'].fillna('S',inplace=True)

In [47]:
dest_group=data.groupby(['Group','Destination']).size().reset_index()
dest_group

Unnamed: 0,Group,Destination,0
0,0001,TRAPPIST-1e,1
1,0002,TRAPPIST-1e,1
2,0003,TRAPPIST-1e,2
3,0004,TRAPPIST-1e,1
4,0005,PSO J318.5-22,1
...,...,...,...
6875,9276,55 Cancri e,1
6876,9278,PSO J318.5-22,1
6877,9279,TRAPPIST-1e,1
6878,9280,55 Cancri e,1


In [48]:
dest_group[dest_group['Group'].duplicated()]

Unnamed: 0,Group,Destination,0
8,0008,TRAPPIST-1e,1
17,0017,TRAPPIST-1e,1
19,0020,PSO J318.5-22,1
20,0020,TRAPPIST-1e,3
36,0044,PSO J318.5-22,1
...,...,...,...
6836,9219,TRAPPIST-1e,1
6838,9220,TRAPPIST-1e,2
6844,9227,TRAPPIST-1e,3
6847,9231,TRAPPIST-1e,2


In [49]:
dest_group=dest_group.sort_values(0,ascending=False)

In [50]:
dest_group=dest_group.drop_duplicates()

In [51]:
dest_dict={}
for i,j in zip(dest_group['Group'],dest_group['Destination']):
    dest_dict[i]=j

In [52]:
data['Destination'].fillna(data['Group'].map(dest_dict),inplace=True)

In [53]:
data['Destination'].isnull().sum()

103

In [54]:
data['Destination'].fillna('TRAPPIST-1e',inplace=True)

In [55]:
data['Destination'].isnull().sum()

0

In [56]:
data.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep       217
Cabin           199
Destination       0
Age             179
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
Group             0
Name2           200
Fee               0
deck              0
side              0
dtype: int64

In [57]:
data[data['CryoSleep'].isnull()]['Fee']

92         0.0
98       703.0
104     2018.0
111        0.0
152      990.0
         ...  
8620       0.0
8651       0.0
8664       0.0
8675       0.0
8687    3540.0
Name: Fee, Length: 217, dtype: float64

In [58]:
data['CryoSleep']=np.where(data['Fee']>0,False,True)

In [59]:
data['CryoSleep'].isnull().sum()

0

In [60]:
data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported', 'Group', 'Name2', 'Fee', 'deck', 'side'],
      dtype='object')

In [61]:
data['Age'].fillna(data['Age'].median(),inplace=True)

In [62]:
data.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
Group             0
Name2           200
Fee               0
deck              0
side              0
dtype: int64

In [63]:
del_col=['PassengerId','Name','Name2','Cabin']

In [64]:
data=data.drop(del_col,axis=1)

In [65]:
data

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,Fee,deck,side
0,Europa,True,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,0001,0.0,B,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,0002,736.0,F,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,0003,10383.0,A,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,0003,5176.0,A,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,0004,1091.0,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,9276,8536.0,A,P
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,9278,0.0,G,S
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,9279,1873.0,G,S
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,9280,4637.0,E,S


In [66]:
data.isnull().sum()

HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Group           0
Fee             0
deck            0
side            0
dtype: int64

In [67]:
value=data.dtypes.values
index=data.dtypes.index

In [68]:
value

array([dtype('O'), dtype('bool'), dtype('O'), dtype('float64'),
       dtype('bool'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('bool'), dtype('O'), dtype('float64'), dtype('O'),
       dtype('O')], dtype=object)

In [69]:
obj_list=[]
for i,j in list(zip(index,value)):
    if j=='object':
        obj_list.append(i)

In [70]:
obj_list

['HomePlanet', 'Destination', 'Group', 'deck', 'side']

In [71]:
bool_list=[]
for i,j in list(zip(index,value)):
    if j=='bool':
        bool_list.append(i)

In [72]:
bool_list

['CryoSleep', 'VIP', 'Transported']

In [73]:
label=LabelEncoder()

In [74]:
for col in obj_list:
    label=LabelEncoder()
    data[col]=label.fit_transform(data[col])

In [75]:
for col in bool_list:
    label=LabelEncoder()
    data[col]=label.fit_transform(data[col])

In [76]:
data.isnull().sum()

HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Group           0
Fee             0
deck            0
side            0
dtype: int64

In [87]:
data

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,Fee,deck,side
0,1,1,2,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1,0
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,1,1,736.0,5,1
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,2,10383.0,0,1
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,2,5176.0,0,1
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,1,3,1091.0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,0,6213,8536.0,0,0
8689,0,1,1,18.0,0,0.0,0.0,0.0,0.0,0.0,0,6214,0.0,6,1
8690,0,0,2,26.0,0,0.0,0.0,1872.0,1.0,0.0,1,6215,1873.0,6,1
8691,1,0,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,0,6216,4637.0,4,1


In [77]:
X=data.drop('Transported',axis=1)
Y=data['Transported']

In [78]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [79]:
model1=RandomForestClassifier()
model2=LogisticRegression()
model3=SVC(probability=True)
model4=DecisionTreeClassifier()

In [80]:
model1.fit(x_train,y_train)

RandomForestClassifier()

In [81]:
model1.score(x_train,y_train)

0.999137187230371

In [82]:
model1.score(x_test,y_test)

0.7872340425531915

In [97]:
ss=ShuffleSplit(test_size=0.2,n_splits=10)

In [94]:
model=XGBClassifier()

In [96]:
param_grid={'n_estimators':[100,200,300,400,500],
           'learning_rate':[0.01,0.05,0.1,0.15,0.2],
           'max_depth':range(11),
           'gamma':[0,1,2,3,4,5],
           'sub_sample':[0.5,0.6,0.7,0.8,1]}

In [98]:
grid_xgb=GridSearchCV(model,param_grid=param_grid,n_jobs=-1,cv=ss)

In [100]:
model.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [101]:
model.score(x_train,y_train)

0.9339948231233822

In [102]:
model.score(x_test,y_test)

0.79700977573318