## Import Data

In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv("data/train.csv")
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [2]:
data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

## Categorical Columns

In [3]:
data[['Deck','Room','Side']] = data.Cabin.str.split("/",expand=True)

In [4]:

data.drop(columns=['Cabin'], inplace=True)

In [5]:
X = data.drop(columns=['PassengerId', 'Name', 'Transported', 'Room'])
y = data['Transported'].astype(int)

In [6]:
from sklearn.impute import SimpleImputer
categorical_imputer = SimpleImputer(strategy='most_frequent')
age_imputer = SimpleImputer()
spend_imputer = SimpleImputer(strategy='constant', fill_value=0.0)
X[['HomePlanet','CryoSleep','Destination','VIP','Deck','Side']] = categorical_imputer.fit_transform(X[['HomePlanet','CryoSleep','Destination','VIP','Deck','Side']])
X[['Age']] = age_imputer.fit_transform(X[['Age']])
X[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = spend_imputer.fit_transform(X[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']])

In [7]:
X.isna().sum()

HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Deck            0
Side            0
dtype: int64

In [8]:
X['CryoSleep'] = X['CryoSleep'].astype(int)
X['VIP'] = X['VIP'].astype(int)

In [9]:
X['isPort'] = np.where(X['Side'] == 'P', 1, 0)
X.drop(columns=['Side'], inplace=True)

In [10]:
X.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,isPort
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,B,1
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,F,0
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,A,0
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,A,0
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,F,0


In [11]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
array_hot_encoded = ohe.fit_transform(X[['HomePlanet','Destination','Deck']])
cols = ohe.get_feature_names_out(['HomePlanet','Destination','Deck'])
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=X.index, columns=cols)
data_other_cols = X.drop(columns=['HomePlanet','Destination','Deck'])
X_ohe = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [12]:
X_ohe.head()

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,...,Deck_T,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,isPort
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,24.0,0,109.0,9.0,25.0,549.0,44.0,0
2,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,16.0,0,303.0,70.0,151.0,565.0,2.0,0


## Numerical Columns

In [13]:
import matplotlib.pyplot as plt

In [14]:
from sklearn.preprocessing import StandardScaler, RobustScaler
std = StandardScaler()
robust = RobustScaler()
X_ohe[['Age']] = std.fit_transform(X[['Age']])
X_ohe[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = robust.fit_transform(X_ohe[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']])

In [15]:
X_ohe.head()

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,...,Deck_T,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,isPort
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0,0.709437,0,0.0,0.0,0.0,0.0,0.0,1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-0.336717,0,2.658537,0.147541,1.136364,10.358491,1.1,0
2,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0,2.034566,1,1.04878,58.622951,0.0,126.698113,1.225,0
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0,0.290975,0,0.0,21.032787,16.863636,62.811321,4.825,0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-0.894666,0,7.390244,1.147541,6.863636,10.660377,0.05,0


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_ohe, y, test_size=0.3, random_state=0)

## Model

In [17]:
from xgboost import XGBClassifier
xgb = XGBClassifier(use_label_encoder=False)

In [18]:
xgb.fit(X_train, y_train,early_stopping_rounds=50, eval_set=[(X_val,y_val)], eval_metric='error')

[0]	validation_0-error:0.22431
[1]	validation_0-error:0.21396
[2]	validation_0-error:0.20821
[3]	validation_0-error:0.20399
[4]	validation_0-error:0.20207
[5]	validation_0-error:0.20169
[6]	validation_0-error:0.20207
[7]	validation_0-error:0.19977
[8]	validation_0-error:0.19862
[9]	validation_0-error:0.19747
[10]	validation_0-error:0.19517
[11]	validation_0-error:0.19479
[12]	validation_0-error:0.19862
[13]	validation_0-error:0.19709
[14]	validation_0-error:0.19632
[15]	validation_0-error:0.19632
[16]	validation_0-error:0.19632
[17]	validation_0-error:0.20054
[18]	validation_0-error:0.19900
[19]	validation_0-error:0.19824
[20]	validation_0-error:0.19709
[21]	validation_0-error:0.19479
[22]	validation_0-error:0.19440
[23]	validation_0-error:0.19517
[24]	validation_0-error:0.19479
[25]	validation_0-error:0.19555
[26]	validation_0-error:0.19594
[27]	validation_0-error:0.19402
[28]	validation_0-error:0.19402
[29]	validation_0-error:0.19440
[30]	validation_0-error:0.19479
[31]	validation_0-

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [21]:
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(xgb, 
             param_grid={"max_depth":[100,500,1000], "learning_rate":[0.001, 0.01, 0.1], "n_estimators":[50,100,200]}, scoring='roc_auc')

In [22]:
search = gs.fit(X_train, y_train, verbose=1)











In [23]:
search.best_params_

{'learning_rate': 0.1, 'max_depth': 100, 'n_estimators': 50}

In [24]:
xgb = XGBClassifier(learning_rate=0.1, max_depth=100, n_estimators=50)

In [26]:
xgb.fit(X_train, y_train,early_stopping_rounds=100, eval_set=[(X_val,y_val)], eval_metric='error')

[0]	validation_0-error:0.23198
[1]	validation_0-error:0.23006
[2]	validation_0-error:0.23006
[3]	validation_0-error:0.22393
[4]	validation_0-error:0.22393
[5]	validation_0-error:0.22048
[6]	validation_0-error:0.21932
[7]	validation_0-error:0.21818
[8]	validation_0-error:0.21818
[9]	validation_0-error:0.21741
[10]	validation_0-error:0.21549
[11]	validation_0-error:0.21242
[12]	validation_0-error:0.20897
[13]	validation_0-error:0.21012
[14]	validation_0-error:0.20936
[15]	validation_0-error:0.21089
[16]	validation_0-error:0.21012
[17]	validation_0-error:0.21051
[18]	validation_0-error:0.21089
[19]	validation_0-error:0.21012
[20]	validation_0-error:0.21204
[21]	validation_0-error:0.20974
[22]	validation_0-error:0.20667
[23]	validation_0-error:0.20667
[24]	validation_0-error:0.20284
[25]	validation_0-error:0.20245
[26]	validation_0-error:0.20169
[27]	validation_0-error:0.20015
[28]	validation_0-error:0.19939
[29]	validation_0-error:0.20130
[30]	validation_0-error:0.19977
[31]	validation_0-

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=100, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=50, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [80]:
X_test_data = pd.read_csv("data/test.csv")
X_test = X_test_data
X_test_data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')

In [69]:
X_test[['Deck','Room','Side']] = X_test.Cabin.str.split("/",expand=True)
X_test.drop(columns=['PassengerId', 'Name', 'Room', 'Cabin'], inplace=True)

In [70]:
X_test.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,S
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,S
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,S
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,S
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,S


In [71]:
X_test[['HomePlanet','CryoSleep','Destination','VIP','Deck','Side']] = categorical_imputer.fit_transform(X_test[['HomePlanet','CryoSleep','Destination','VIP','Deck','Side']])
X_test[['Age']] = age_imputer.fit_transform(X_test[['Age']])
X_test[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = spend_imputer.fit_transform(X_test[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']])

In [72]:
X_test[['Age']] = std.fit_transform(X_test[['Age']])
X_test[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = robust.fit_transform(X_test[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']])

In [73]:
X_test['CryoSleep'] = X_test['CryoSleep'].astype(int)
X_test['VIP'] = X_test['VIP'].astype(int)
X_test['isPort'] = np.where(X_test['Side'] == 'P', 1, 0)
X_test.drop(columns=['Side'], inplace=True)

In [74]:
X_test.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,isPort
0,Earth,1,TRAPPIST-1e,-0.118222,0,0.0,0.0,0.0,0.0,0.0,G,0
1,Earth,0,TRAPPIST-1e,-0.688601,0,0.0,0.136364,0.0,65.651163,0.0,F,0
2,Europa,1,55 Cancri e,0.166968,0,0.0,0.0,0.0,0.0,0.0,C,0
3,Europa,0,TRAPPIST-1e,0.666051,0,0.0,100.787879,0.0,4.209302,18.870968,C,0
4,Earth,0,TRAPPIST-1e,-0.617304,0,0.208333,0.0,23.518519,0.0,0.0,F,0


In [75]:
array_hot_encoded = ohe.fit_transform(X_test[['HomePlanet','Destination','Deck']])
cols = ohe.get_feature_names_out(['HomePlanet','Destination','Deck'])
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=X_test.index, columns=cols)
data_other_cols = X_test.drop(columns=['HomePlanet','Destination','Deck'])
X_test_ohe = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [76]:
xgb.predict(X_test_ohe)

array([1, 0, 1, ..., 1, 1, 1])

In [83]:
pred = pd.DataFrame(np.where(xgb.predict(X_test_ohe) == 1, True, False))

In [84]:
pred

Unnamed: 0,0
0,True
1,False
2,True
3,True
4,False
...,...
4272,True
4273,False
4274,True
4275,True


In [85]:
X_test_data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')

In [87]:
out = pd.concat([X_test_data['PassengerId'], pred], axis=1).rename(columns={0:'Transported'})

In [89]:
out.to_csv("data/submission.csv", header=True, index=False)