<a href="https://www.kaggle.com/code/mhb757/spaceship-titanic-cleaning-engineering-modelling?scriptVersionId=218445425" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
#IMPORT MODULES#
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import optuna

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
#LOAD DATA
traindata  = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
testdata = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

In [3]:
#INITIAL CHECK
traindata.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
traindata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


<br />
<br />
<br />

-----
-----


# MISSING VALUES

In [5]:
#CHECK MISSING VALUES
traindata.isnull().sum().sort_values(ascending=False)

CryoSleep       217
ShoppingMall    208
VIP             203
HomePlanet      201
Name            200
Cabin           199
VRDeck          188
FoodCourt       183
Spa             183
Destination     182
RoomService     181
Age             179
PassengerId       0
Transported       0
dtype: int64

<br />
<br />

-----



#### First, examining the Expense Features...

In [6]:
traindata[traindata.RoomService.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
25,0020_05,Earth,True,E/0/S,PSO J318.5-22,1.0,False,,0.0,0.0,0.0,0.0,Mael Brantuarez,False
35,0031_03,Mars,False,F/9/P,TRAPPIST-1e,20.0,False,,0.0,1750.0,990.0,0.0,Dontch Datie,True
83,0091_01,Earth,True,G/16/S,TRAPPIST-1e,26.0,False,,0.0,0.0,0.0,0.0,Deanne Yorkland,True
132,0141_01,Mars,False,F/30/P,TRAPPIST-1e,31.0,False,,0.0,97.0,0.0,0.0,Pyrohs Harte,False
170,0193_02,Mars,False,F/41/P,TRAPPIST-1e,23.0,False,,0.0,8.0,1072.0,0.0,Frook Raf,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8425,8998_02,Earth,False,E/591/S,TRAPPIST-1e,47.0,False,,1.0,0.0,967.0,5.0,Jonaye Englence,False
8450,9026_01,Earth,True,G/1463/P,TRAPPIST-1e,58.0,False,,0.0,0.0,0.0,0.0,Mathy Boyers,True
8525,9101_01,Earth,False,F/1865/P,TRAPPIST-1e,21.0,False,,0.0,496.0,430.0,0.0,Gera Frazie,False
8534,9112_01,Mars,False,D/290/P,TRAPPIST-1e,28.0,False,,0.0,0.0,0.0,0.0,Wealke Brin,False


* Immediately we can see that if CryoSleep is true, all expenses should be zero....
* Aside from this, there is no clear pattern in expenses, so a fill based on any of the other expense values doesn't seem credible.
* So, we will go for a simple zero fill for all missing values

In [7]:
#FILLING SOME MISSING EXPENSE VALUES USING THIS ASSUMPTION
traindata.RoomService.fillna(0, inplace=True)
traindata.FoodCourt.fillna(0, inplace=True)
traindata.ShoppingMall.fillna(0, inplace=True)
traindata.Spa.fillna(0, inplace=True)
traindata.VRDeck.fillna(0, inplace=True)

#FIRST ITERATION/TRIAL - FILL ALL MISSING WITH ZERO#

In [8]:
#CREATING TOTAL SPEND FEATURE FOR EASE OF USE
traindata['TotalSpend'] = traindata['FoodCourt'] + traindata['RoomService'] + traindata['Spa'] + traindata['VRDeck'] + traindata['ShoppingMall']


<br />
<br />

----------

#### Now, Examining the CryoSleep feature...

In [9]:
traindata[traindata.CryoSleep.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,TotalSpend
92,0099_02,Earth,,G/12/P,TRAPPIST-1e,2.0,False,0.0,0.0,0.0,0.0,0.0,Thewis Connelson,True,0.0
98,0105_01,Earth,,F/21/P,TRAPPIST-1e,27.0,False,0.0,0.0,570.0,2.0,131.0,Carry Cleachrand,False,703.0
104,0110_02,Europa,,B/5/P,TRAPPIST-1e,40.0,False,0.0,331.0,0.0,0.0,1687.0,Aldeba Bootious,False,2018.0
111,0115_01,Mars,,F/24/P,TRAPPIST-1e,26.0,False,0.0,0.0,0.0,0.0,0.0,Rohs Pead,True,0.0
152,0173_01,Earth,,E/11/S,TRAPPIST-1e,58.0,False,0.0,985.0,0.0,5.0,0.0,Hilip Grifford,True,990.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8620,9197_01,Europa,,C/308/P,55 Cancri e,44.0,False,0.0,0.0,0.0,0.0,0.0,Bellus Platch,True,0.0
8651,9227_05,Earth,,G/1498/P,TRAPPIST-1e,8.0,False,0.0,0.0,0.0,0.0,0.0,Hard Hinglendez,False,0.0
8664,9246_01,Earth,,G/1490/S,TRAPPIST-1e,32.0,False,0.0,0.0,0.0,0.0,0.0,,True,0.0
8675,9259_01,Earth,,F/1893/P,TRAPPIST-1e,44.0,False,1030.0,1015.0,0.0,11.0,0.0,Annah Gilleyons,True,2056.0


In [10]:
traindata[traindata.TotalSpend == 0].CryoSleep.describe()

count     3555
unique       2
top       True
freq      3037
Name: CryoSleep, dtype: object

* Total Spend seems a logical feature to use to fill CryoSleep. 
* We can see above that 3037 out of 3555 passengers with zero spend are in CryoSleep (85.4%). 
* For our imputation, if total spend is zero, we can assume that the passenger is in Cryosleep.


In [11]:
#FILLING MISSING CRYOSLEEP VALUES USING THIS METHOD
#traindata.CryoSleep.fillna(lambda x: 'True' if traindata.TotalSpend == 0 else 'False', inplace = True)
def fill_cryo(row):
    if type(row.CryoSleep) == float:
        if row.TotalSpend == 0:
            row.CryoSleep = True
        else:
            row.CryoSleep = False
    return row

traindata = traindata.apply(fill_cryo, axis=1)

In [12]:
traindata.CryoSleep[8620]

True

<br />
<br />

------


#### Examining the Age column we can see some interesting interactions with other features....

In [13]:
traindata[(traindata.Age < 18) & (traindata.VIP == True)]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,TotalSpend


We can see that there are no VIPS under the age of 18.

In [14]:
traindata[(traindata.Age < 13) & (traindata.TotalSpend != 0)]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,TotalSpend


Anyone under the age of 13 has zero spend.

In [15]:
traindata[traindata.TotalSpend > 0].Age.describe()

count    4950.000000
mean       31.658182
std        12.591726
min        13.000000
25%        22.000000
50%        29.000000
75%        39.000000
max        79.000000
Name: Age, dtype: float64

In [16]:
traindata[traindata.TotalSpend == 0].Age.describe()

count    3564.000000
mean       24.897026
std        15.961601
min         0.000000
25%        14.000000
50%        24.000000
75%        35.000000
max        78.000000
Name: Age, dtype: float64

<br />
<br />

-----

#### Examining the Name column....

In [17]:
traindata[traindata.Name.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,TotalSpend
27,0022_01,Mars,False,D/0/P,TRAPPIST-1e,21.0,False,980.0,2.0,69.0,0.0,0.0,,False,1051.0
58,0064_01,Mars,True,F/14/S,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,,True,0.0
65,0069_01,Earth,False,F/16/S,TRAPPIST-1e,42.0,False,887.0,0.0,9.0,6.0,0.0,,True,902.0
77,0082_03,Mars,False,F/16/P,TRAPPIST-1e,8.0,False,0.0,0.0,0.0,0.0,0.0,,True,0.0
101,0108_02,Earth,False,G/19/S,TRAPPIST-1e,31.0,False,562.0,0.0,326.0,0.0,0.0,,False,888.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8629,9205_02,Europa,True,B/300/P,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,,True,0.0
8631,9208_01,Earth,True,G/1485/S,TRAPPIST-1e,35.0,False,0.0,0.0,0.0,0.0,0.0,,True,0.0
8636,9218_01,Europa,True,B/353/S,55 Cancri e,43.0,False,0.0,0.0,0.0,0.0,0.0,,True,0.0
8652,9230_01,Europa,False,C/342/S,TRAPPIST-1e,36.0,True,0.0,5600.0,715.0,2868.0,971.0,,True,10154.0


For now, the only method we can use to fill any meaningful data in the Name column is checking if there is anyone else travelling on the same ticket, and filling their surname to the missing entry. If there is nobody else travelling on the ticket, then a generic placeholder fill can be used.

In [18]:
#FIRSTLY, CREATING NEW FEATURES FROM CABIN COLUMN FOR THIS PURPOSE
traindata[['Ticket1','Ticket2']] = traindata['PassengerId'].str.split('_', expand = True)


In [19]:
#FOR THIS ITERATION, A SIMPLE PLACEHOLDER FILL IS USED

traindata.Name.fillna(value = 'Noname Noname', inplace = True)

<br />
<br />

-----

#### Now examining the HomePlanet and Destination features...

In [20]:
traindata[traindata.HomePlanet.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,TotalSpend,Ticket1,Ticket2
59,0064_02,,True,E/3/S,TRAPPIST-1e,33.0,False,0.0,0.0,0.0,0.0,0.0,Colatz Keen,True,0.0,0064,02
113,0119_01,,False,A/0/P,TRAPPIST-1e,39.0,False,0.0,2344.0,0.0,65.0,6898.0,Batan Coning,False,9307.0,0119,01
186,0210_01,,True,D/6/P,55 Cancri e,24.0,False,0.0,0.0,0.0,0.0,0.0,Arraid Inicont,True,0.0,0210,01
225,0242_01,,False,F/46/S,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,Almone Sté,False,1288.0,0242,01
234,0251_01,,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,True,0.0,0251,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8515,9084_01,,False,E/582/P,TRAPPIST-1e,25.0,False,1258.0,0.0,22.0,19.0,0.0,Jurs Mone,False,1299.0,9084,01
8613,9194_01,,False,E/603/S,55 Cancri e,53.0,False,0.0,4017.0,0.0,13.0,3147.0,Noname Noname,False,7177.0,9194,01
8666,9248_01,,False,F/1792/S,55 Cancri e,38.0,,28.0,1208.0,973.0,207.0,0.0,Gian Perle,True,2416.0,9248,01
8674,9257_01,,False,F/1892/P,TRAPPIST-1e,13.0,False,39.0,0.0,1085.0,24.0,0.0,Ties Apple,False,1148.0,9257,01


If there is anyone else travelling on the same ticket, then we can simply fill using their Home Planet. If not, then the overall mode can be used.


In [21]:
#FILLING USING THIS METHOD
homes = {}
def get_homes(row):
    if row.Ticket1 not in homes.keys() and type(row.HomePlanet) != float:
        homes.update({row.Ticket1: row.HomePlanet})
    return row
def fill_homes(row):
    if type(row.HomePlanet) == float:
        if homes.get(row.Ticket1) != None:
            row.HomePlanet = homes.get(row.Ticket1)
        else:
            row.HomePlanet = 'Earth'
    return row

traindata.apply(get_homes, axis=1)
traindata = traindata.apply(fill_homes, axis=1)
        
        

<br />
<br />

For destination, we will use a very similar method:

In [22]:
#USING THE SAME TICKET METHOD AS HOMEPLANET
dest = {}
def get_dest(row):
    if row.Ticket1 not in dest.keys() and type(row.Destination) != float:
        dest.update({row.Ticket1: row.Destination})
    return row
def fill_dest(row):
    if type(row.Destination) == float:
        if dest.get(row.Ticket1) != None:
            row.Destination = dest.get(row.Ticket1)
        else:
            row.Destination = 'TRAPPIST-1e'
    return row

traindata.apply(get_dest, axis=1)
traindata = traindata.apply(fill_dest, axis=1)
        

<br />
<br />

-----

#### Now to examine the Cabin feature...

In [23]:
traindata[traindata.Cabin.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,TotalSpend,Ticket1,Ticket2
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False,908.0,0012,01
93,0101_01,Mars,True,,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Book Trad,True,0.0,0101,01
103,0110_01,Europa,False,,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,Graviph Aloubtled,False,5109.0,0110,01
222,0239_01,Mars,False,,TRAPPIST-1e,37.0,False,637.0,0.0,0.0,92.0,319.0,Diedow Resty,False,1048.0,0239,01
227,0244_01,Mars,True,,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Froos Sad,True,0.0,0244,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8209,8772_02,Europa,False,,55 Cancri e,53.0,False,0.0,1127.0,0.0,3939.0,0.0,Naosura Motled,False,5066.0,8772,02
8475,9057_01,Europa,False,,55 Cancri e,36.0,True,132.0,3479.0,0.0,3786.0,0.0,Coxan Statch,False,7397.0,9057,01
8485,9069_03,Europa,True,,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,Bath Brakeng,True,0.0,9069,03
8509,9081_03,Earth,True,,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Beula Clemondsey,False,0.0,9081,03


Examining more of the data, we can see that some passengers travelling on the same ticket do not share the same cabin. This means a fill based on ticket would be unreliable, and so we will use a missing indicator fill here.

In [24]:
#FILLING ALL MISSING CABIN WITH PLACEHOLDER

traindata.Cabin.fillna('M/99999/M', inplace=True)

<br />
<br />

------

#### Examining Age...

In [25]:
traindata[traindata.Age.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,TotalSpend,Ticket1,Ticket2
50,0052_01,Earth,False,G/6/S,TRAPPIST-1e,,False,4.0,0.0,2.0,4683.0,0.0,Elaney Hubbarton,False,4689.0,0052,01
64,0068_01,Mars,False,E/4/S,TRAPPIST-1e,,False,793.0,0.0,2.0,253.0,0.0,Cinst Binie,False,1048.0,0068,01
137,0149_01,Earth,True,G/27/S,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Billya Hubbarrison,True,0.0,0149,01
181,0202_02,Europa,False,A/2/P,55 Cancri e,,False,0.0,2433.0,0.0,878.0,443.0,Vegas Embleng,True,3754.0,0202,02
184,0206_01,Europa,False,C/9/S,55 Cancri e,,False,2.0,1720.0,12.0,1125.0,122.0,Nuson Brugashed,True,2981.0,0206,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8274,8835_01,Earth,True,G/1425/S,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Shalle Bartines,False,0.0,8835,01
8301,8862_03,Europa,True,C/329/S,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Alchib Myling,True,0.0,8862,03
8374,8956_04,Earth,False,G/1453/P,TRAPPIST-1e,,False,194.0,1.0,10.0,629.0,0.0,Krisa Bonnondry,False,834.0,8956,04
8407,8988_01,Earth,True,G/1448/S,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Maen Fowlesterez,True,0.0,8988,01


Based on preliminary analysis, we know that total spend and home planet are the best predictors of age that we have available. Therefore, we will define a fill function based on these 2 features.

In [26]:
traindata[(traindata.TotalSpend == 0) & (traindata.HomePlanet == 'Earth')].Age.describe()

count    1748.000000
mean       20.243135
std        15.258574
min         0.000000
25%         8.000000
50%        18.000000
75%        29.000000
max        78.000000
Name: Age, dtype: float64

In [27]:
traindata[(traindata.TotalSpend > 0) & (traindata.HomePlanet == 'Europa')].Age.describe()

count    1103.000000
mean       36.513146
std        11.708791
min        13.000000
25%        28.000000
50%        34.000000
75%        43.000000
max        79.000000
Name: Age, dtype: float64

For example, manipulating these 2 features produces a median difference of 18 vs 34 here.

In [28]:
#CREATING FILL FUNCTION BASED ON TOTALSPEND, HOMEPLANET
def fill_age(row):
    if row.TotalSpend == 0:
        if row.HomePlanet == 'Earth':
            row.SubAge = 18
        elif row.HomePlanet == 'Europa':
            row.SubAge = 30
        else:
            row.SubAge = 25
    else:
        if row.HomePlanet == 'Earth':
            row.SubAge = 25
        elif row.HomePlanet == 'Europa':
            row.SubAge = 34
        else:
            row.SubAge = 30
    return row

In [29]:
traindata['SubAge'] = traindata.Age
traindata = traindata.apply(fill_age, axis=1)

In [30]:
traindata.Age.fillna(traindata.SubAge, inplace=True)
traindata.drop(columns = 'SubAge', inplace=True)

<br />
<br />
<br />

#### And finally, VIP...

In [31]:
traindata[(traindata.VIP == True) & (traindata.CryoSleep == False)].TotalSpend.describe()

count      178.000000
mean      4947.584270
std       5171.239613
min          0.000000
25%       1684.500000
50%       3142.000000
75%       6627.500000
max      31076.000000
Name: TotalSpend, dtype: float64

In [32]:
traindata[(traindata.VIP == False) & (traindata.CryoSleep == False)].TotalSpend.describe()

count     5256.000000
mean      2163.459855
std       3106.154435
min          0.000000
25%        739.000000
50%        995.500000
75%       2357.250000
max      35987.000000
Name: TotalSpend, dtype: float64

There is a clear difference in average spend based on VIP status. Using this, we can fill missing values in the VIP column based on a selected spend figure of 2500 (this figure can be tuned in testing).

In [33]:
#traindata.VIP.fillna(lambda x: 'True' if (traindata.TotalSpend>2500 and traindata.Age>17) else 'False', inplace=True)
def fill_VIP(row):
    if row.TotalSpend>2500 and row.Age>17:
        row.VIP = 'True'
    else:
        row.VIP = 'False'
    return row

traindata = traindata.apply(fill_VIP, axis=1)

In [34]:
traindata.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
TotalSpend      0
Ticket1         0
Ticket2         0
dtype: int64

All missing values are now filled.

#### Now all values can be filled in the test set using the same methods.

In [35]:
#Filling all testset missing values using the same principles
testdata.RoomService.fillna(0, inplace=True)
testdata.FoodCourt.fillna(0, inplace=True)
testdata.ShoppingMall.fillna(0, inplace=True)
testdata.Spa.fillna(0, inplace=True)
testdata.VRDeck.fillna(0, inplace=True)
testdata['TotalSpend'] = testdata['FoodCourt'] + testdata['RoomService'] + testdata['Spa'] + testdata['VRDeck'] + testdata['ShoppingMall']
testdata = testdata.apply(fill_cryo, axis=1)
testdata[['Ticket1','Ticket2']] = testdata['PassengerId'].str.split('_', expand = True)
testdata.Name.fillna(value = 'Noname Noname', inplace = True)
testdata.apply(get_homes, axis=1)
testdata = testdata.apply(fill_homes, axis=1)
testdata.apply(get_dest, axis=1)
testdata = testdata.apply(fill_dest, axis=1)
testdata.Cabin.fillna('M/99999/M', inplace=True)
testdata['SubAge'] = testdata.Age
testdata = testdata.apply(fill_age, axis=1)
testdata.Age.fillna(testdata.SubAge, inplace=True)
testdata.drop(columns = 'SubAge', inplace=True)
testdata = testdata.apply(fill_VIP, axis=1)

In [36]:
testdata.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
TotalSpend      0
Ticket1         0
Ticket2         0
dtype: int64

<br />
<br />
<br />
<br />
 
 ------
 ------

# FEATURE ENGINEERING

To start, the Cabin feature is unusable in its current format. We will split this into 3 new features: CabinDeck, CabinNumber and CabinSide.

In [37]:
#CREATING THE 3 NEW COLUMNS
traindata[['CabinDeck','CabinNumber','CabinSide']] = traindata['Cabin'].str.split('/', expand=True)
traindata.CabinNumber = traindata.CabinNumber.astype(int)
traindata.CabinNumber = pd.qcut(traindata.CabinNumber, 15)

<br />
<br />

-----

Next, Using the Ticket1 and 2 columns, we can create a GroupSize feature, based on the number of people travelling on the same ticket.

In [38]:
traindata['GroupSize'] = 0
groups = {}
def get_groups(row):
    if row.Ticket1 not in groups:
        groups.update({row.Ticket1:row.Ticket2})
    elif row.Ticket2 > groups.get(row.Ticket1):
        groups.update({row.Ticket1:row.Ticket2})
    return row
def give_groups(row):
    row.GroupSize = groups.get(row.Ticket1)
    return row

In [39]:
traindata.apply(get_groups, axis=1)
traindata = traindata.apply(give_groups, axis=1)

<br />
<br />

-----

As we know from earlier analysis that some age groups are more likely to be transported than others, we can create some age group markers as features.

In [40]:
traindata['Is_Child'] = False
traindata['Is_Teenager'] = False
traindata['Is_Adult'] = False


def age_markers(row):
    if row.Age<13:
        row.Is_Child = True
    elif row.Age<18:
        row.Is_Teenager = True
    else:
        row.Is_Adult = True
    return row

traindata = traindata.apply(age_markers, axis=1)

<br />
<br />

----

Also from earlier analysis, we know that exact trip (combination of HomePlanet and Destination) has a large effect on probability of being transported. 

In [41]:
traindata['Trip'] = 0
def give_trip(row):
    if row.HomePlanet == 'Earth':
        if row.Destination == 'PSO J318.5-22':
            row.Trip = 1
        elif row.Destination == '55 Cancri e':
            row.Trip = 2
    elif row.HomePlanet == 'Europa':
        if row.Destination == 'TRAPPIST-1e':
            row.Trip = 3
        elif row.Destination == 'PSO J318.5-22':
            row.Trip = 4
        else:
            row.Trip = 5
    else:
        if row.Destination == 'TRAPPIST-1e':
            row.Trip = 6
        elif row.Destination == 'PSO J318.5-22':
            row.Trip = 7
        else:
            row.Trip = 8
    return row
traindata = traindata.apply(give_trip, axis=1)
traindata.Trip = traindata.Trip.astype('category')            

In [42]:
traindata.head(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Ticket1,Ticket2,CabinDeck,CabinNumber,CabinSide,GroupSize,Is_Child,Is_Teenager,Is_Adult,Trip
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,1,1,B,"(-0.001, 44.0]",P,1,False,False,True,3
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,2,1,F,"(-0.001, 44.0]",S,1,False,False,True,0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,3,1,A,"(-0.001, 44.0]",S,2,False,False,True,3
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,True,0.0,1283.0,371.0,...,3,2,A,"(-0.001, 44.0]",S,2,False,False,True,3
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,4,1,F,"(-0.001, 44.0]",S,1,False,True,False,0
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,...,5,1,F,"(-0.001, 44.0]",P,1,False,False,True,1
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,...,6,1,F,"(-0.001, 44.0]",S,2,False,False,True,0
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,...,6,2,G,"(-0.001, 44.0]",S,2,False,False,True,0
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,...,7,1,F,"(-0.001, 44.0]",S,1,False,False,True,0
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,...,8,1,B,"(-0.001, 44.0]",P,3,False,True,False,5


In [43]:
traindata.CabinNumber.dtype

interval[float64, right]

<br />
<br />

----

#### Now to create all of the same features in the test dataset.

In [44]:
testdata[['CabinDeck','CabinNumber','CabinSide']] = testdata['Cabin'].str.split('/', expand=True)
testdata.CabinNumber = testdata.CabinNumber.astype(int)
testdata['GroupSize'] = 0
testdata.apply(get_groups, axis=1)
testdata = testdata.apply(give_groups, axis=1)
testdata['Is_Child'] = False
testdata['Is_Teenager'] = False
testdata['Is_Adult'] = False
testdata = testdata.apply(age_markers, axis=1)
testdata['Trip'] = 0
testdata = testdata.apply(give_trip, axis=1)
testdata.Trip = testdata.Trip.astype('category')
testdata.CabinNumber = pd.qcut(testdata.CabinNumber, 15)

<br />
<br />
<br />
<br />

----
----

# MODELLING

 #### First, we will discard some columns which will not be used for modelling.

##### Some notes on this: 
* PassengerId is an index column
* Cabin is unusable in its original state but has been transformed to other features
* Name is unusable in its original state
* Ticket1 and Ticket2 are a derivation of PassengerId that were used to engineer other features
* 

In [45]:
traindata_final = traindata.drop(columns = ['PassengerId','Cabin','Name','Ticket1','Ticket2','TotalSpend','CabinNumber'])
testdata_final = testdata.drop(columns = ['PassengerId','Cabin','Name','Ticket1','Ticket2','TotalSpend','CabinNumber'])
numerical_cols = [col for col in traindata_final.columns if traindata_final[col].dtype not in ['object','bool','category']]
categorical_cols = [col for col in traindata_final.columns if traindata_final[col].dtype in ['object','bool','category','interval']]
categorical_cols.remove('Transported')
X = traindata_final.drop(columns = ['Transported'])
y = traindata_final.Transported

<br />
<br />

#### Then, creating a Train-Test split.

In [46]:
X_train, X_valid, y_train, y_valid = train_test_split(pd.get_dummies(X), y, test_size=0.2, random_state=42)


<br />
<br />

----

#### To start the modelling process, We will trial _ models in the first instance and compare performance.

<br />
<br />

In [47]:
#FIRST MODEL - XGBOOST
model1 = XGBClassifier(n_estimators = 4000, 
                       learning_rate=0.0035, 
                       max_depth=3, 
                       subsample=0.5564641148999913, 
                       colsample_bytree=0.5833760248257248, 
                       reg_alpha=1, 
                       reg_lambda=2, 
                       min_split_loss=1, 
                       min_child_weight=2)

XGParams = {'n_estimators': 607, 'learning_rate': 0.08627114961011287, 'max_depth': 3, 'subsample': 0.5564641148999913, 'colsample_bytree': 0.5833760248257248}
modelXG = XGBClassifier(**XGParams)

XG2Params = {'n_estimators': 600,
 'learning_rate': 0.012722652199549763,
 'max_depth': 4,
 'subsample': 0.7330910239008844,
 'colsample_bytree': 0.7660345330986351}

XG3Params = {'n_estimators': 620,
 'learning_rate': 0.01123731443293645,
 'max_depth': 4,
 'subsample': 0.6885307223746568,
 'colsample_bytree': 0.7511271681266807}

modelXG2 = XGBClassifier(**XG2Params)

modelXG3 = XGBClassifier(**XG3Params)

In [48]:
#CREATING PIPELINE
numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

my_pipeline1 = Pipeline(steps = [('preprocessor', preprocessor), ('model', modelXG3)])

my_pipeline1.fit(X, y)

In [49]:
#CROSS VALIDATION SCORE
scores = cross_val_score(my_pipeline1, X, y, cv=5, scoring='accuracy')

print("Accuracy scores:\n", scores)

print("Average Accuracy score (across experiments):")
print(scores.mean())

Accuracy scores:
 [0.78550891 0.79815986 0.80161012 0.8164557  0.80667434]
Average Accuracy score (across experiments):
0.8016817860879266


In [50]:
#OPTIMISING HYPERPARAMETERS
def objectiveXG(trial):
    params = {
        'n_estimators' : trial.suggest_int('n_estimators', 100, 4000),
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.0001, 0.1),
        'max_depth' : trial.suggest_int('max_depth', 3, 10),
        'subsample' : trial.suggest_uniform('subsample', 0.5, 0.9),
        'colsample_bytree' : trial.suggest_uniform('colsample_bytree', 0.5, 0.9)
    }
    
    model = XGBClassifier(**params)
    
    model.fit(X_train, y_train)
    
    train_pred = model.predict(X_valid)
    
    score = accuracy_score(y_valid, train_pred)
    
    return score

#studyXG = optuna.create_study(direction='maximize')
#studyXG.optimize(objectiveXG)
#studyXG.best_params

In [51]:
#OPTIMISING BY KFOLD CV
def objectiveXG2(trial):
        X_local = pd.get_dummies(X)
        
        params = {
        'n_estimators' : trial.suggest_int('n_estimators', 550, 650,10),
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.02),
        'max_depth' : trial.suggest_int('max_depth', 3, 5),
        'subsample' : trial.suggest_uniform('subsample', 0.6, 0.8),
        'colsample_bytree' : trial.suggest_uniform('colsample_bytree', 0.6, 0.8)
    }
        
        model = XGBClassifier(**params)
        
        skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
        
        for train_index,test_index in skf.split(X_local,y):
            scores = []
            X_train, y_train = X_local.iloc[train_index], y.iloc[train_index]
            X_val, y_val = X_local.iloc[test_index], y.iloc[test_index]
        
            model.fit(X_train, y_train)
        
            pred = model.predict(X_val)
        
            score = accuracy_score(y_val, pred)
            scores.append(score)
        
            
        return (sum(scores)/len(scores))

#studyXG2 = optuna.create_study(direction='maximize')
#studyXG2.optimize(objectiveXG2, n_trials=100)

In [52]:
#SECOND MODEL - CATBOOST
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


cat_feature_indices = [X.columns.get_loc(col) for col in categorical_cols]

params2 = {'cat_features': cat_feature_indices,
           'eval_metric' : 'Accuracy',
           'depth' : 4
          }

model2 = CatBoostClassifier(**params2)


In [53]:
# CROSS VALIDATION

model2.fit(X_train, y_train,
          eval_set=(X_valid, y_valid),
          use_best_model=True,
          plot=True
         )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.051161
0:	learn: 0.7424504	test: 0.7441058	best: 0.7441058 (0)	total: 65.6ms	remaining: 1m 5s
1:	learn: 0.7497843	test: 0.7372053	best: 0.7441058 (0)	total: 74.9ms	remaining: 37.4s
2:	learn: 0.7451826	test: 0.7400805	best: 0.7441058 (0)	total: 83.2ms	remaining: 27.7s
3:	learn: 0.7453264	test: 0.7429557	best: 0.7441058 (0)	total: 91.5ms	remaining: 22.8s
4:	learn: 0.7592752	test: 0.7464060	best: 0.7464060 (4)	total: 99.9ms	remaining: 19.9s
5:	learn: 0.7582686	test: 0.7469810	best: 0.7469810 (5)	total: 108ms	remaining: 17.8s
6:	learn: 0.7575496	test: 0.7458309	best: 0.7469810 (5)	total: 117ms	remaining: 16.5s
7:	learn: 0.7576934	test: 0.7492812	best: 0.7492812 (7)	total: 124ms	remaining: 15.4s
8:	learn: 0.7575496	test: 0.7492812	best: 0.7492812 (7)	total: 133ms	remaining: 14.6s
9:	learn: 0.7634455	test: 0.7533065	best: 0.7533065 (9)	total: 141ms	remaining: 13.9s
10:	learn: 0.7653149	test: 0.7653824	best: 0.7653824 (10)	total: 150ms	remaining: 13.4s
11:	learn: 0.7748

<catboost.core.CatBoostClassifier at 0x7c078979fdc0>

In [54]:
#TUNING HYPERPARAMETERS

def objectiveCat(trial):
    params = {
        'iterations': 5000,
        'eval_metric':'accuracy',
        'depth': trial.suggest_int('depth',3,7),
    
    }

In [55]:
pred1 = my_pipeline1.predict(testdata_final)
pred2 = model2.predict(testdata_final)

In [56]:
#OUTPUTS PREDICTIONS TO CSV#

output = pd.DataFrame({"PassengerId":testdata.PassengerId,"Transported":pred2.astype('bool')})
output.to_csv('submission.csv', index=False)

______

____________

In [57]:
prob1 = my_pipeline1.predict_proba(testdata_final)
prob2 = model2.predict_proba(testdata_final)

In [58]:
prob1 = [i[0] for i in prob1]
prob2 = [i[0] for i in prob2]

In [59]:
probframe = pd.DataFrame({'PassengerId':testdata.PassengerId,'XGProb':prob1,'CatProb':prob2})

In [60]:
probframe[abs(probframe.XGProb-probframe.CatProb)>0.02]

Unnamed: 0,PassengerId,XGProb,CatProb
1,0018_01,0.959333,0.985868
4,0023_01,0.405765,0.339835
5,0027_01,0.469537,0.538428
9,0033_01,0.374471,0.308009
10,0037_01,0.714378,0.758555
...,...,...,...
4271,9266_01,0.334519,0.255888
4273,9269_01,0.601099,0.541049
4274,9271_01,0.078969,0.037612
4275,9273_01,0.308545,0.264264


In [61]:
probframe[((probframe.XGProb > 0.5) & (probframe.CatProb < 0.5)) | ((probframe.XGProb < 0.5) & (probframe.CatProb > 0.5))]

Unnamed: 0,PassengerId,XGProb,CatProb
5,0027_01,0.469537,0.538428
24,0054_03,0.464375,0.796928
48,0118_01,0.505332,0.495844
79,0175_03,0.439240,0.543580
80,0175_04,0.479834,0.547951
...,...,...,...
4045,8800_01,0.439240,0.543580
4047,8800_03,0.450585,0.642902
4049,8800_05,0.478431,0.514204
4133,8979_01,0.478979,0.513903


In [62]:
probframe['AvgProb'] = (probframe.XGProb + probframe.CatProb) / 2

In [63]:
def Decision(row):
    if row.AvgProb > 0.5:
        row.AvgDecision = False
    else:
        row.AvgDecision = True
    
    return row

In [64]:
#probframe = probframe.apply(Decision, axis=1)

#probframe[probframe.AvgDecision != pred2]

In [65]:
#output2 = pd.DataFrame({"PassengerId":testdata.PassengerId,"Transported":probframe.AvgDecision.astype('bool')})
#output2.to_csv('submission2.csv', index=False)