In [1]:
import re
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

In [2]:
# Location of the competition files; both CSV paths are derived from it.
BASE_PATH = "/kaggle/input/spaceship-titanic/"
TRAIN_CSV_PATH = f"{BASE_PATH}train.csv"
TEST_CSV_PATH = f"{BASE_PATH}test.csv"

In [3]:
# Read the training CSV into a DataFrame and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer=TRAIN_CSV_PATH)
train_df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Per-column profile of the training frame: count of distinct non-missing
# values, count of missing entries, and the pandas dtype.
unique_series = train_df.nunique().rename("unique_count")
na_series = train_df.isnull().sum().rename("na_count")
type_series = train_df.dtypes.rename("type")

# Place the three summaries side by side, one row per column of train_df.
profile_columns = (unique_series, na_series, type_series)
train_info_df = pd.concat(profile_columns, axis="columns")
train_info_df

Unnamed: 0,unique_count,na_count,type
PassengerId,8693,0,object
HomePlanet,3,201,object
CryoSleep,2,217,object
Cabin,6560,199,object
Destination,3,182,object
Age,80,179,float64
VIP,2,203,object
RoomService,1273,181,float64
FoodCourt,1507,183,float64
ShoppingMall,1115,208,float64


In [5]:
# Names of the True/False flag columns.
bool_features = ["CryoSleep", "VIP"]

# Store the target as integers (False -> 0, True -> 1).
transported_as_int = train_df["Transported"].astype(int)
train_df["Transported"] = transported_as_int
train_df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1


In [6]:
# A Cabin value looks like "B/0/P"; split it on "/" into three separate
# columns. The pieces are kept as strings (no numeric conversion here).
cabin_part_names = dict(enumerate(["CabinDeck", "CabinNumber", "CabinSide"]))
cabin_df = train_df["Cabin"].str.split("/", expand=True)
cabin_df = cabin_df.rename(columns=cabin_part_names)

# Swap the raw Cabin column for its three components, appended on the right.
train_without_cabin = train_df.drop(columns=["Cabin"])
train_df = pd.concat([train_without_cabin, cabin_df], axis="columns")
train_df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,CabinDeck,CabinNumber,CabinSide
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,F,1,S


In [7]:
# Columns to one-hot encode: the two categorical columns plus the flag columns.
encode_features = ["HomePlanet", "Destination"] + list(bool_features)

# Spaces inside category labels become "-" so that the generated column names
# (prefix and label are joined without a separator) contain no whitespace.
categorical_slice = train_df[encode_features].replace(" ", "-", regex=True)

# NOTE: get_dummies leaves dummy_na at its default (False), so a missing value
# gets no indicator column of its own; such rows are 0 in every indicator of
# that feature.
encoded_features_df = pd.get_dummies(categorical_slice, prefix_sep="")

# Replace the source columns with their indicator columns, appended on the right.
train_without_encoded = train_df.drop(columns=encode_features)
train_df = pd.concat([train_without_encoded, encoded_features_df], axis="columns")
train_df.head()

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,CabinDeck,...,HomePlanetEarth,HomePlanetEuropa,HomePlanetMars,Destination55-Cancri-e,DestinationPSO-J318.5-22,DestinationTRAPPIST-1e,CryoSleepFalse,CryoSleepTrue,VIPFalse,VIPTrue
0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,B,...,0,1,0,0,0,1,1,0,1,0
1,0002_01,24.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,F,...,1,0,0,0,0,1,1,0,1,0
2,0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,A,...,0,1,0,0,0,1,1,0,0,1
3,0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,A,...,0,1,0,0,0,1,1,0,1,0
4,0004_01,16.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,F,...,1,0,0,0,0,1,1,0,1,0


In [8]:
# Continuous columns: passenger age and the five amenity spend amounts.
numeric_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# Force each one to a numeric dtype; values that cannot be parsed become NaN.
for feature in numeric_features:
    train_df[feature] = pd.to_numeric(train_df[feature], errors="coerce")

# Missing-value count per column after the conversion.
train_df.isnull().sum()

PassengerId                   0
Age                         179
RoomService                 181
FoodCourt                   183
ShoppingMall                208
Spa                         183
VRDeck                      188
Name                        200
Transported                   0
CabinDeck                   199
CabinNumber                 199
CabinSide                   199
HomePlanetEarth               0
HomePlanetEuropa              0
HomePlanetMars                0
Destination55-Cancri-e        0
DestinationPSO-J318.5-22      0
DestinationTRAPPIST-1e        0
CryoSleepFalse                0
CryoSleepTrue                 0
VIPFalse                      0
VIPTrue                       0
dtype: int64

In [9]:
# Heatmap of the pairwise correlations between numeric and boolean columns.
# train_df still contains string columns (PassengerId, Name, CabinDeck,
# CabinNumber, CabinSide). Since pandas 2.0, DataFrame.corr() no longer drops
# such columns silently and raises instead, so the numeric/boolean subset is
# selected explicitly. This gives the same result on older pandas versions.
px.imshow(train_df.select_dtypes(include=["number", "bool"]).corr())

In [10]:
# Absolute correlation of every numeric/boolean feature with the target,
# sorted ascending (weakest first), with the target's own row removed.
# The string columns are filtered out before calling corr(): since pandas 2.0
# DataFrame.corr() raises on non-numeric columns instead of skipping them.
(
    train_df.select_dtypes(include=["number", "bool"])
    .corr()["Transported"]
    .abs()
    .sort_values()
    .rename("TransportedCorr")
    .to_frame()
    .drop(index=["Transported"])
)

Unnamed: 0,TransportedCorr
DestinationPSO-J318.5-22,9.2e-05
ShoppingMall,0.010141
HomePlanetMars,0.019544
VIPFalse,0.024602
VIPTrue,0.037261
FoodCourt,0.046566
Age,0.075026
DestinationTRAPPIST-1e,0.0947
Destination55-Cancri-e,0.108722
HomePlanetEarth,0.169019
