# Introduction

# 1. Data Exploratory 

In [38]:
import matplotlib.pyplot as plt
import seaborn as sns
import random
import numpy as np
import pandas as pd
import time
import datetime

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [39]:
train = pd.read_csv('../spaceship_titanic/train.csv')
test = pd.read_csv('../spaceship_titanic/test.csv')

test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [40]:
train.shape

(8693, 14)

In [41]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [42]:
train['VIP'] = train['VIP'].fillna(False)
train['CryoSleep'] = train['CryoSleep'].fillna(False)

test['VIP'] = test['VIP'].fillna(False)
test['CryoSleep'] = test['CryoSleep'].fillna(False)

In [43]:
train.shape

(8693, 14)

In [44]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [45]:
train['CryoSleep'].replace(False, 0, inplace=True)
train['CryoSleep'].replace(True, 1, inplace=True)
test['CryoSleep'].replace(False, 0, inplace=True)
test['CryoSleep'].replace(True, 1, inplace=True)

train['VIP'].replace(False, 0, inplace=True)
train['VIP'].replace(True, 1, inplace=True)
test['VIP'].replace(False, 0, inplace=True)
test['VIP'].replace(True, 1, inplace=True)

train['Transported'].replace(False, 0, inplace=True)
train['Transported'].replace(True, 1, inplace=True)

train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0
1,0002_01,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1
2,0003_01,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0
3,0003_02,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0
4,0004_01,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1


In [46]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8693 non-null   int64  
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8693 non-null   int64  
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   int64  
dtypes: float64(6), int64(3), object(5)
memory usage: 950.9+ KB


In [47]:
train['CryoSleep'] = train['CryoSleep'].astype('int64')
train['VIP'] = train['VIP'].astype('int64')
test['CryoSleep'] = test['CryoSleep'].astype('int64')
test['VIP'] = test['VIP'].astype('int64')

train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

In [48]:
train[['deck','num', 'side']] = train['Cabin'].str.split('/', expand=True)
test[['deck','num', 'side']] = test['Cabin'].str.split('/', expand=True)

train.drop('Cabin', axis=1, inplace=True)
test.drop('Cabin', axis=1, inplace=True)

In [49]:
freq_deck = train.deck.dropna().mode()[0]
freq_deck

freq_num = train.num.dropna().mode()[0]
freq_num

freq_side = train.side.dropna().mode()[0]
freq_side

print(freq_deck, freq_num, freq_side)

F 82 S


In [50]:
train['Destination'] = train['Destination'].fillna('TRAPPIST-1e')
test['Destination'] = test['Destination'].fillna('TRAPPIST-1e')

train['HomePlanet'] = train['HomePlanet'].fillna('Earth')
test['HomePlanet'] = test['HomePlanet'].fillna('Earth')

train['deck'] = train['deck'].fillna('F')
test['deck'] = test['deck'].fillna('F')

train['num'] = train['num'].fillna(82)
test['num'] = test['num'].fillna(82)

train['side'] = train['side'].fillna('S')
test['side'] = test['side'].fillna('S')

train['Age'] = train['Age'].fillna(train['Age'].median())
test['Age'] = test['Age'].fillna(test['Age'].median())

In [51]:
dest_mapping = {"55 Cancri e": 1, "PSO J318.5-22": 2, "TRAPPIST-1e": 3}
train['Destination'] = train['Destination'].map(dest_mapping)
test['Destination'] = test['Destination'].map(dest_mapping)
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side
0,0001_01,Europa,0,3,39.0,0,0.0,0.0,0.0,0.0,0.0,0,B,0,P
1,0002_01,Earth,0,3,24.0,0,109.0,9.0,25.0,549.0,44.0,1,F,0,S
2,0003_01,Europa,0,3,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S
3,0003_02,Europa,0,3,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,A,0,S
4,0004_01,Earth,0,3,16.0,0,303.0,70.0,151.0,565.0,2.0,1,F,1,S


In [52]:
train[["HomePlanet", "Transported"]].groupby(['HomePlanet'], as_index=False).mean().sort_values(by='Transported', ascending=False)

Unnamed: 0,HomePlanet,Transported
1,Europa,0.658846
2,Mars,0.523024
0,Earth,0.427649


In [53]:
dest_mapping = {"Europa": 1, "Mars": 2, "Earth": 3}
train['HomePlanet'] = train['HomePlanet'].map(dest_mapping)
test['HomePlanet'] = test['HomePlanet'].map(dest_mapping)
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side
0,0001_01,1,0,3,39.0,0,0.0,0.0,0.0,0.0,0.0,0,B,0,P
1,0002_01,3,0,3,24.0,0,109.0,9.0,25.0,549.0,44.0,1,F,0,S
2,0003_01,1,0,3,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S
3,0003_02,1,0,3,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,A,0,S
4,0004_01,3,0,3,16.0,0,303.0,70.0,151.0,565.0,2.0,1,F,1,S


In [54]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   int64  
 2   CryoSleep     8693 non-null   int64  
 3   Destination   8693 non-null   int64  
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   int64  
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Transported   8693 non-null   int64  
 12  deck          8693 non-null   object 
 13  num           8693 non-null   object 
 14  side          8693 non-null   object 
dtypes: float64(6), int64(5), object(4)
memory usage: 1018.8+ KB


In [55]:
train['AgeBand'] = pd.cut(train['Age'], 5)
train[['AgeBand', 'Transported']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Transported
0,"(-0.079, 15.8]",0.648387
1,"(15.8, 31.6]",0.475495
2,"(31.6, 47.4]",0.48551
3,"(47.4, 63.2]",0.486747
4,"(63.2, 79.0]",0.463235


In [56]:
train.loc[ train['Age'] <= 15.8, 'Age'] = 0
train.loc[(train['Age'] > 15.8) & (train['Age'] <= 31.6), 'Age'] = 1
train.loc[(train['Age'] > 31.6) & (train['Age'] <= 47.4), 'Age'] = 2
train.loc[(train['Age'] > 47.4) & (train['Age'] <= 63.2), 'Age'] = 3
train.loc[ train['Age'] > 63.2, 'Age'] = 4
train['Age'] = train['Age'].astype('int64')

test.loc[ test['Age'] <= 15.8, 'Age'] = 0
test.loc[(test['Age'] > 15.8) & (test['Age'] <= 31.6), 'Age'] = 1
test.loc[(test['Age'] > 31.6) & (test['Age'] <= 47.4), 'Age'] = 2
test.loc[(test['Age'] > 47.4) & (test['Age'] <= 63.2), 'Age'] = 3
test.loc[ test['Age'] > 63.2, 'Age'] = 4
test['Age'] = test['Age'].astype('int64')

train = train.drop(['AgeBand'], axis=1)

In [57]:
train[['Age', 'Transported']].groupby(['Age'], as_index=False).mean().sort_values(by='Transported', ascending=False)

Unnamed: 0,Age,Transported
0,0,0.648387
3,3,0.486747
2,2,0.48551
1,1,0.475495
4,4,0.463235


In [58]:
train[['deck', 'Transported']].groupby(['deck'], as_index=False).mean().sort_values(by='Transported', ascending=False)

Unnamed: 0,deck,Transported
1,B,0.734275
2,C,0.680054
6,G,0.516217
0,A,0.496094
5,F,0.444036
3,D,0.433054
4,E,0.357306
7,T,0.2


In [59]:
train[['side', 'Transported']].groupby(['side'], as_index=False).mean().sort_values(by='Transported', ascending=False)

Unnamed: 0,side,Transported
1,S,0.552708
0,P,0.45126


In [60]:
deck_mapping = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 5, "F": 6, "G": 7, "T": 8}
train['deck'] = train['deck'].map(deck_mapping)
test['deck'] = test['deck'].map(deck_mapping)

side_mapping = {"P": 0, "S": 1}
train['side'] = train['side'].map(side_mapping)
test['side'] = test['side'].map(side_mapping)

In [61]:
train.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side
0,0001_01,1,0,3,2,0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
1,0002_01,3,0,3,1,0,109.0,9.0,25.0,549.0,44.0,1,6,0,1
2,0003_01,1,0,3,3,1,43.0,3576.0,0.0,6715.0,49.0,0,0,0,1
3,0003_02,1,0,3,2,0,0.0,1283.0,371.0,3329.0,193.0,0,0,0,1
4,0004_01,3,0,3,1,0,303.0,70.0,151.0,565.0,2.0,1,6,1,1


In [62]:
test.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side
0,0013_01,3,1,3,1,0,0.0,0.0,0.0,0.0,0.0,7,3,1
1,0018_01,3,0,3,1,0,0.0,9.0,0.0,2823.0,0.0,6,4,1
2,0019_01,1,1,1,1,0,0.0,0.0,0.0,0.0,0.0,2,0,1
3,0021_01,1,0,3,2,0,0.0,6652.0,0.0,181.0,585.0,2,1,1
4,0023_01,3,0,3,1,0,10.0,0.0,635.0,0.0,0.0,6,5,1


In [63]:
train['num'] = train['num'].astype('int64')
test['num'] = test['num'].astype('int64')

In [64]:
train['Group'] = train['PassengerId'].apply(
    lambda x: x.split('_')[0]).astype(int)
test['Group'] = test['PassengerId'].apply(
    lambda x: x.split('_')[0]).astype(int)

train['GroupSize'] = train['Group'].map(lambda x: pd.concat(
    [train['Group'], test['Group']]).value_counts()[x])
test['GroupSize'] = test['Group'].map(lambda x: pd.concat(
    [train['Group'], test['Group']]).value_counts()[x])

train['Solo'] = (train['GroupSize'] == 1).astype(int)
test['Solo'] = (test['GroupSize'] == 1).astype(int)

train = train.drop(['PassengerId'], axis=1)
test = test.drop(['PassengerId'], axis=1)

In [65]:
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side,Group,GroupSize,Solo
0,1,0,3,2,0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,1,1,1
1,3,0,3,1,0,109.0,9.0,25.0,549.0,44.0,1,6,0,1,2,1,1
2,1,0,3,3,1,43.0,3576.0,0.0,6715.0,49.0,0,0,0,1,3,2,0
3,1,0,3,2,0,0.0,1283.0,371.0,3329.0,193.0,0,0,0,1,3,2,0
4,3,0,3,1,0,303.0,70.0,151.0,565.0,2.0,1,6,1,1,4,1,1


In [66]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   int64  
 1   CryoSleep     8693 non-null   int64  
 2   Destination   8693 non-null   int64  
 3   Age           8693 non-null   int64  
 4   VIP           8693 non-null   int64  
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
 10  Transported   8693 non-null   int64  
 11  deck          8693 non-null   int64  
 12  num           8693 non-null   int64  
 13  side          8693 non-null   int64  
 14  Group         8693 non-null   int32  
 15  GroupSize     8693 non-null   int64  
 16  Solo          8693 non-null   int32  
dtypes: float64(5), int32(2), int64(10)
memory usage: 1.1 MB


In [67]:
test.isnull().sum()

HomePlanet        0
CryoSleep         0
Destination       0
Age               0
VIP               0
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
deck              0
num               0
side              0
Group             0
GroupSize         0
Solo              0
dtype: int64

In [68]:
col_to_sum = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

train['SumSpends'] = train[col_to_sum].sum(axis=1)
test['SumSpends'] = test[col_to_sum].sum(axis=1)

In [69]:
test.isnull().sum()

HomePlanet        0
CryoSleep         0
Destination       0
Age               0
VIP               0
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
deck              0
num               0
side              0
Group             0
GroupSize         0
Solo              0
SumSpends         0
dtype: int64

In [70]:
FeaturesSpending = ['RoomService', 'FoodCourt',
                    'ShoppingMall', 'Spa', 'VRDeck']

train['TotalSpending'] = train[FeaturesSpending].sum(axis=1)
train['ZeroSpending'] = (train['TotalSpending'] == 0).astype(int)

test['TotalSpending'] = test[FeaturesSpending].sum(axis=1)
test['ZeroSpending'] = (test['TotalSpending'] == 0).astype(int)

train = train.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'SumSpends'], axis=1)
test = test.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'SumSpends'], axis=1)

In [71]:
test.isnull().sum()

HomePlanet       0
CryoSleep        0
Destination      0
Age              0
VIP              0
deck             0
num              0
side             0
Group            0
GroupSize        0
Solo             0
TotalSpending    0
ZeroSpending     0
dtype: int64

In [72]:
test.shape

(4277, 13)

In [73]:
train

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Transported,deck,num,side,Group,GroupSize,Solo,TotalSpending,ZeroSpending
0,1,0,3,2,0,0,1,0,0,1,1,1,0.0,1
1,3,0,3,1,0,1,6,0,1,2,1,1,736.0,0
2,1,0,3,3,1,0,0,0,1,3,2,0,10383.0,0
3,1,0,3,2,0,0,0,0,1,3,2,0,5176.0,0
4,3,0,3,1,0,1,6,1,1,4,1,1,1091.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,1,2,1,0,0,98,0,9276,1,1,8536.0,0
8689,3,1,2,1,0,0,7,1499,1,9278,1,1,0.0,1
8690,3,0,3,1,0,1,7,1500,1,9279,1,1,1873.0,0
8691,1,0,1,2,0,0,5,608,1,9280,2,0,4637.0,0


In [74]:
train = pd.get_dummies(train, columns=['Destination'])
test = pd.get_dummies(test, columns=['Destination'])

train = pd.get_dummies(train, columns=['VIP'])
test = pd.get_dummies(test, columns=['VIP'])

In [75]:
train

Unnamed: 0,HomePlanet,CryoSleep,Age,Transported,deck,num,side,Group,GroupSize,Solo,TotalSpending,ZeroSpending,Destination_1,Destination_2,Destination_3,VIP_0,VIP_1
0,1,0,2,0,1,0,0,1,1,1,0.0,1,0,0,1,1,0
1,3,0,1,1,6,0,1,2,1,1,736.0,0,0,0,1,1,0
2,1,0,3,0,0,0,1,3,2,0,10383.0,0,0,0,1,0,1
3,1,0,2,0,0,0,1,3,2,0,5176.0,0,0,0,1,1,0
4,3,0,1,1,6,1,1,4,1,1,1091.0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,2,0,0,98,0,9276,1,1,8536.0,0,1,0,0,0,1
8689,3,1,1,0,7,1499,1,9278,1,1,0.0,1,0,1,0,1,0
8690,3,0,1,1,7,1500,1,9279,1,1,1873.0,0,0,0,1,1,0
8691,1,0,2,0,5,608,1,9280,2,0,4637.0,0,1,0,0,1,0


In [76]:
test

Unnamed: 0,HomePlanet,CryoSleep,Age,deck,num,side,Group,GroupSize,Solo,TotalSpending,ZeroSpending,Destination_1,Destination_2,Destination_3,VIP_0,VIP_1
0,3,1,1,7,3,1,13,1,1,0.0,1,0,0,1,1,0
1,3,0,1,6,4,1,18,1,1,2832.0,0,0,0,1,1,0
2,1,1,1,2,0,1,19,1,1,0.0,1,1,0,0,1,0
3,1,0,2,2,1,1,21,1,1,7418.0,0,0,0,1,1,0
4,3,0,1,6,5,1,23,1,1,645.0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,3,1,2,7,1496,1,9266,2,0,0.0,1,0,0,1,1,0
4273,3,0,2,6,82,1,9269,1,1,1018.0,0,0,0,1,1,0
4274,2,1,1,3,296,0,9271,1,1,0.0,1,1,0,0,1,0
4275,1,0,1,3,297,0,9273,1,1,3203.0,0,0,0,1,1,0


# Training machine learning models

In [77]:
X = train.copy()
y = X.pop('Transported')
X=X.values
y=y.values

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.fit_transform(test)

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, random_state=23)

In [78]:
classifiers = {
    "CatBoost": CatBoostClassifier(task_type='GPU', verbose=False),
    "XGBoost": XGBClassifier(use_label_encoder=None),
    "LGBoost": LGBMClassifier(verbosity=-1)
}

LGBoostGrid = {
    'n_estimators': [74, 76, 78],
    'max_depth': [5, 6],
    'learning_rate': [0.098, 0.1, 0.104, 0.106, 0.108],
}

XGBoostGrid = {
    'n_estimators': [136, 137, 138, 140],
    'max_depth': [4, 6],
    'learning_rate': [0.072, 0.074, 0.075, 0.076],
}

CatBoostGrid = {
    'n_estimators': [143, 144, 145, 146, 147],
    'max_depth': [6, 9],
    'learning_rate': [0.078, 0.08, 0.082, 0.084],
}

grid = {
    "CatBoost": CatBoostGrid,
    "LGBoost": LGBoostGrid,
    "XGBoost": XGBoostGrid,
}

In [79]:
from tqdm import tqdm
BestParameters = {}
ValidationScores = pd.DataFrame({'Classifer': classifiers.keys(), 'Validation accuracy': np.zeros(
    len(classifiers)), 'Training time': np.zeros(len(classifiers))})
i = 0
for key, classifier in classifiers.items():
    print("Model Start:", key)
    start = time.time()
    classifierObject = GridSearchCV(
        estimator=classifier, param_grid=grid[key])

    classifierObject.fit(X_train, Y_train)
    ValidationScores.iloc[i, 1] = classifierObject.score(
        X_validation, Y_validation)
    BestParameters[key] = classifierObject.best_params_
    totalTime = time.time() - start
    ValidationScores.iloc[i, 2] = np.round((totalTime), 2)

    print("Time:", ValidationScores.iloc[i, 2])
    print("Parameters:", classifierObject.best_params_)
    print("Model End:", key, '\n')
    i += 1

Model Start: CatBoost
Time: 242.99
Parameters: {'learning_rate': 0.078, 'max_depth': 6, 'n_estimators': 143}
Model End: CatBoost 

Model Start: XGBoost
Time: 36.16
Parameters: {'learning_rate': 0.075, 'max_depth': 4, 'n_estimators': 140}
Model End: XGBoost 

Model Start: LGBoost
Time: 7.49
Parameters: {'learning_rate': 0.104, 'max_depth': 5, 'n_estimators': 74}
Model End: LGBoost 



In [80]:
BestParameters

{'CatBoost': {'learning_rate': 0.078, 'max_depth': 6, 'n_estimators': 143},
 'XGBoost': {'learning_rate': 0.075, 'max_depth': 4, 'n_estimators': 140},
 'LGBoost': {'learning_rate': 0.104, 'max_depth': 5, 'n_estimators': 74}}

In [81]:
pd.DataFrame(BestParameters).to_parquet('Best_Params.parquet')

In [82]:
classifiers = {
    "CatBoost": CatBoostClassifier(**BestParameters["CatBoost"], task_type='GPU', verbose=False),
    "XGBoost": XGBClassifier(**BestParameters["XGBoost"], use_label_encoder=None,),
    "LGBoost": LGBMClassifier(**BestParameters["LGBoost"], verbosity=-1),
}

In [83]:
NUMBER_OF_FOLD = 10

Predictions = {}
for key, classifier in classifiers.items():
    print("Model Start:", key)
    startTime = time.time()
    cv = StratifiedKFold(n_splits=NUMBER_OF_FOLD, shuffle=True)
    score = 0
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, X_validation = X[train_idx], X[val_idx]
        Y_train, Y_validation = y[train_idx], y[val_idx]
        classifier.fit(X_train, Y_train)
        Predictions[key] = classifier.predict(test.values)
        score += classifier.score(X_validation, Y_validation)

    totalTime = time.time() - startTime
    score = score/NUMBER_OF_FOLD

    print("Training Time:", np.round(totalTime, 2))
    print("Model End:", key, '\n')

Model Start: CatBoost
Training Time: 9.54
Model End: CatBoost 

Model Start: XGBoost
Training Time: 2.63
Model End: XGBoost 

Model Start: LGBoost
Training Time: 0.56
Model End: LGBoost 



In [84]:
d = datetime.datetime.now()
timestamp = "%02d%02d%02d-%02d%02d%02d" % (
    d.year, d.month, d.day, d.hour, d.minute, d.second)

submission = pd.read_csv('sample_submission.csv')

for key in Predictions:
    submission['Transported'] = Predictions[key]
    submission = submission.replace({0: False, 1: True})
    submission.to_csv('submission_for_spaceship_titanic-%s-%s.csv' %
                      (key, timestamp), index=False)