In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier


Read in Data

In [48]:
# Load the train and test CSVs from the local "spaceship-titanic/" folder.
# Both paths are relative to the notebook's working directory.
train = pd.read_csv(filepath_or_buffer="spaceship-titanic/train.csv")
test = pd.read_csv(filepath_or_buffer="spaceship-titanic/test.csv")

In [39]:
# Display the training frame.
# NOTE(review): the recorded output below already contains one-hot columns
# (e.g. HomePlanet_Earth, VIP_False, Cabin_P) even though this cell directly
# follows read_csv, so it appears to come from a different kernel state.
# Re-run with Restart Kernel -> Run All to refresh it.
train


Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,HomePlanet_Earth,...,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_unknown,VIP_False,VIP_True,VIP_unknown,Cabin_P,Cabin_S,Cabin_unknown
0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,False,...,False,False,True,False,True,False,False,True,False,False
1,0002_01,24.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,True,...,False,False,True,False,True,False,False,False,True,False
2,0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,False,...,False,False,True,False,False,True,False,False,True,False
3,0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,False,...,False,False,True,False,True,False,False,False,True,False
4,0004_01,16.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,True,...,False,False,True,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,41.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,False,...,True,False,False,False,False,True,False,True,False,False
8689,9278_01,18.0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,True,...,False,True,False,False,True,False,False,False,True,False
8690,9279_01,26.0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,True,...,False,False,True,False,True,False,False,False,True,False
8691,9280_01,32.0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,False,...,True,False,False,False,True,False,False,False,True,False


In [15]:
# Keep only the rows of `train` that have a missing value in at least one column.
missing_values = train.loc[train.isnull().any(axis="columns")]
missing_values

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
10,0008_02,Europa,True,B/1/P,TRAPPIST-1e,34.0,False,0.0,0.0,,0.0,0.0,Altardr Flatic,True
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False
16,0014_01,Mars,False,F/3/P,55 Cancri e,27.0,False,1286.0,122.0,,0.0,0.0,Flats Eccle,False
23,0020_03,Earth,True,E/0/S,55 Cancri e,29.0,False,0.0,0.0,,0.0,0.0,Mollen Mcfaddennon,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8667,9250_01,Europa,False,E/597/P,TRAPPIST-1e,29.0,False,0.0,2972.0,,28.0,188.0,Chain Reedectied,True
8674,9257_01,,False,F/1892/P,TRAPPIST-1e,13.0,False,39.0,0.0,1085.0,24.0,0.0,Ties Apple,False
8675,9259_01,Earth,,F/1893/P,TRAPPIST-1e,44.0,False,1030.0,1015.0,0.0,11.0,,Annah Gilleyons,True
8684,9274_01,,True,G/1508/P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Chelsa Bullisey,True


Handle missing values 

In [49]:
# Impute numerical values with the column median.
# Each median is computed from its own column only, so the order of the
# computations does not matter; the medians are kept as named variables.

median_RoomService = train["RoomService"].median()
median_FoodCourt = train["FoodCourt"].median()
median_ShoppingMall = train["ShoppingMall"].median()
median_Spa = train["Spa"].median()
median_VRDeck = train["VRDeck"].median()
median_Age = train["Age"].median()

# Fill all six columns in one call; `train` is modified in place.
train.fillna(
    {
        "RoomService": median_RoomService,
        "FoodCourt": median_FoodCourt,
        "ShoppingMall": median_ShoppingMall,
        "Spa": median_Spa,
        "VRDeck": median_VRDeck,
        "Age": median_Age,
    },
    inplace=True,
)

# Give every categorical variable an explicit "unknown" category for missing
# values; `train` is modified in place.

train.fillna(
    {column: "unknown" for column in ("HomePlanet", "CryoSleep", "Destination", "VIP")},
    inplace=True,
)

# Reduce Cabin to the side of the ship ("P" or "S"); the side may be related
# to the target (Transported). First remove the "/" separators; the side is
# then extracted by set_value, defined next.

train["Cabin"] = train["Cabin"].str.replace("/", "")

def set_value(x):
    """Map a cabin value to its side marker.

    The value is converted with ``str`` and searched for the substrings
    "P" and "S", in that order. The first marker found is returned; if
    neither occurs, the input is returned unchanged (so NaN stays NaN).
    """
    text = str(x)
    for side in ("P", "S"):
        if side in text:
            return side
    return x
    
# Collapse each cabin to its side marker. Values that set_value returns
# unchanged as NaN are then labelled "unknown".
train["Cabin"] = train["Cabin"].apply(set_value).fillna("unknown")

# One-hot encode the categorical variables (drop_first=True omits the first
# level of each variable), then drop the Name and PassengerId columns.
dummie_v = ["HomePlanet", "CryoSleep", "Destination", "VIP", "Cabin"]
train = (
    pd.get_dummies(train, columns=dummie_v, drop_first=True)
    .drop(columns=["Name", "PassengerId"])
)

Train a basic model without feature engineering

In [50]:
# Split the prepared frame into the target (y) and the feature matrix (X).
y = train["Transported"]
# DataFrame.drop defaults to axis=0 (row labels). Without `columns=`, pandas
# would look for a row labelled "Transported" and raise KeyError.
X = train.drop(columns="Transported")


In [51]:
# Display the encoded training frame; it still contains the Transported
# target column alongside the features.
train

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_unknown,CryoSleep_True,CryoSleep_unknown,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_unknown,VIP_True,VIP_unknown,Cabin_S,Cabin_unknown
0,39.0,0.0,0.0,0.0,0.0,0.0,False,True,False,False,False,False,False,True,False,False,False,False,False
1,24.0,109.0,9.0,25.0,549.0,44.0,True,False,False,False,False,False,False,True,False,False,False,True,False
2,58.0,43.0,3576.0,0.0,6715.0,49.0,False,True,False,False,False,False,False,True,False,True,False,True,False
3,33.0,0.0,1283.0,371.0,3329.0,193.0,False,True,False,False,False,False,False,True,False,False,False,True,False
4,16.0,303.0,70.0,151.0,565.0,2.0,True,False,False,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,41.0,0.0,6819.0,0.0,1643.0,74.0,False,True,False,False,False,False,False,False,False,True,False,False,False
8689,18.0,0.0,0.0,0.0,0.0,0.0,False,False,False,False,True,False,True,False,False,False,False,True,False
8690,26.0,0.0,0.0,1872.0,1.0,0.0,True,False,False,False,False,False,False,True,False,False,False,True,False
8691,32.0,0.0,1049.0,0.0,353.0,3235.0,False,True,False,False,False,False,False,False,False,False,False,True,False
