See [Spaceship Titanic - Logistic Regression Baselines](https://www.kaggle.com/code/defcodeking/spaceship-titanic-logistic-regression-baselines) for more details.

# Imports

In [40]:
from src.feature_engineering import FeatureEngineer
from src.imputer import CategoricalImputer, numeric_imputer
from src.encoder import encode_features
import pandas as pd
import os

# Config

In [41]:
# Directory holding the dataset CSV files; the relative path is resolved against
# the notebook's working directory. Read by `filepath`.
DATA_DIR = "./data"

In [42]:
def filepath(filename):
    """Build the path of a dataset file located under ``DATA_DIR``.

    Args:
        filename: Name (or relative path) of the file inside the data directory.

    Returns:
        ``DATA_DIR`` and ``filename`` combined with ``os.path.join``.
    """
    path_parts = (DATA_DIR, filename)
    return os.path.join(*path_parts)

# Load Datasets

In [43]:
# Load both splits indexed by PassengerId, and mirror the index into a regular
# "PassengerId" column (appended as the last column). That column is dropped
# again in the "Drop Unnecessary Features" cell.
train_df = pd.read_csv(filepath("train.csv"), index_col="PassengerId").assign(
    PassengerId=lambda frame: frame.index
)
test_df = pd.read_csv(filepath("test.csv"), index_col="PassengerId").assign(
    PassengerId=lambda frame: frame.index
)

# Row counts of the train and test splits.
len(train_df), len(test_df)

# Feature Engineering

In [44]:
# NOTE(review): judging by the preview below, each column listed in
# `missing_value_cols` ends up with a `<col>_missing` indicator column -
# confirm against `FeatureEngineer`.
engineer = FeatureEngineer(
    missing_value_cols=[
        "RoomService",
        "FoodCourt",
        "ShoppingMall",
        "Cabin",
        "VIP",
    ],
)

# First pass: run every feature except "cabin". The Cabin-derived features are
# engineered in a later cell, after the missing values have been imputed.
train_df, test_df = (
    engineer(frame, exclude="cabin") for frame in (train_df, test_df)
)

In [45]:
# Preview the training split after the first feature-engineering pass.
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,GroupId,GroupSize,Alone,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,...,1,1,True,False,False,False,False,False,False,0.0
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,...,2,1,True,False,False,False,False,False,False,736.0
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,...,3,2,False,False,False,False,False,False,False,10383.0
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,...,3,2,False,False,False,False,False,False,False,5176.0
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,...,4,1,True,False,False,False,False,False,False,1091.0


In [46]:
# Preview the test split after the first feature-engineering pass.
test_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,GroupId,GroupSize,Alone,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,...,13,1,True,False,False,False,False,False,False,0.0
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,...,18,1,True,False,False,False,False,False,False,2832.0
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,...,19,1,True,False,False,False,False,False,False,0.0
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,...,21,1,True,False,False,False,False,False,False,7418.0
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,...,23,1,True,False,False,False,False,False,False,645.0


# Impute Categorical Missing Values

In [47]:
categorical_imputer = CategoricalImputer(
    feature_mode_cols=["HomePlanet", "CryoSleep", "Destination"],
)

# NOTE(review): each split is passed through the imputer on its own - confirm
# whether the fill values used for the test split are meant to come from the
# training data instead.
train_df, test_df = categorical_imputer(train_df), categorical_imputer(test_df)

In [48]:
# Preview the training split after categorical imputation.
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,GroupId,GroupSize,Alone,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,...,1,1,True,False,False,False,False,False,False,0.0
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,...,2,1,True,False,False,False,False,False,False,736.0
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,...,3,2,False,False,False,False,False,False,False,10383.0
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,...,3,2,False,False,False,False,False,False,False,5176.0
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,...,4,1,True,False,False,False,False,False,False,1091.0


In [49]:
# Per-column flag: does the training split still contain any NaN?
# Numeric columns are only imputed in a later cell, so they may still show True.
train_df.isna().any()

In [50]:
# Preview the test split after categorical imputation.
test_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,GroupId,GroupSize,Alone,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,...,13,1,True,False,False,False,False,False,False,0.0
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,...,18,1,True,False,False,False,False,False,False,2832.0
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,...,19,1,True,False,False,False,False,False,False,0.0
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,...,21,1,True,False,False,False,False,False,False,7418.0
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,...,23,1,True,False,False,False,False,False,False,645.0


# Engineer Features From `Cabin`

In [51]:
# Second pass: exclude every supported feature except "cabin", so only the
# Cabin-derived features are engineered this time (the preview below shows the
# new CabinDeck, CabinNum and CabinSide columns).
exclude = set(engineer.supported_features)
exclude.discard("cabin")

train_df, test_df = (
    engineer(frame, exclude=exclude) for frame in (train_df, test_df)
)

In [52]:
# Preview the training split with the Cabin-derived columns added.
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense,CabinDeck,CabinNum,CabinSide
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,0.0,B,0,P
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,...,False,False,False,False,False,False,736.0,F,0,S
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,...,False,False,False,False,False,False,10383.0,A,0,S
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,...,False,False,False,False,False,False,5176.0,A,0,S
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,...,False,False,False,False,False,False,1091.0,F,1,S


In [53]:
# Re-check which training columns still contain NaNs after the Cabin pass.
# Only the training split is inspected here; the test split is not checked.
train_df.isna().any()

In [54]:
# Preview the test split with the Cabin-derived columns added.
test_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense,CabinDeck,CabinNum,CabinSide
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,0.0,G,3,S
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,...,False,False,False,False,False,False,2832.0,F,4,S
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,0.0,C,0,S
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,...,False,False,False,False,False,False,7418.0,C,1,S
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,...,False,False,False,False,False,False,645.0,F,5,S


# Feature Encoding

In [55]:
# Columns to expand into one indicator column per category. The previews below
# show the resulting names, e.g. `CabinDeck_A` ... `CabinDeck_T`, `GroupSize_8`.
one_hot_cols = ["HomePlanet", "Destination", "GroupSize", "CabinDeck"]

# Both splits are encoded in a single call, and both come back re-bound.
# NOTE(review): presumably this keeps the encoded columns aligned between train
# and test - confirm in `encode_features`.
# NOTE(review): CryoSleep and VIP appear as 0/1 in the previews after this cell
# although they are not listed here - confirm where that conversion happens.
train_df, test_df = encode_features(
    train_df=train_df,
    test_df=test_df,
    one_hot_cols=one_hot_cols,
    label_encoding_cols=["CabinSide"],
)

In [56]:
# Preview the training split after encoding.
train_df.head()

Unnamed: 0_level_0,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,...,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0,B/0/P,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,...,0,0,1,0,0,0,0,0,0,False
0002_01,0,F/0/S,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,...,0,0,0,0,0,0,1,0,0,True
0003_01,0,A/0/S,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,...,0,1,0,0,0,0,0,0,0,False
0003_02,0,A/0/S,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,...,0,1,0,0,0,0,0,0,0,False
0004_01,0,F/1/S,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,...,0,0,0,0,0,0,1,0,0,True


In [57]:
# Preview the test split after encoding.
test_df.head()

Unnamed: 0_level_0,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,...,GroupSize_7,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,1,G/3/S,27.0,0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,...,0,0,0,0,0,0,0,0,1,0
0018_01,0,F/4/S,19.0,0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,...,0,0,0,0,0,0,0,1,0,0
0019_01,1,C/0/S,31.0,0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,...,0,0,0,0,1,0,0,0,0,0
0021_01,0,C/1/S,38.0,0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,...,0,0,0,0,1,0,0,0,0,0
0023_01,0,F/5/S,20.0,0,10.0,0.0,635.0,0.0,0.0,Brence Harperez,...,0,0,0,0,0,0,0,1,0,0


# Drop Unnecessary Features

In [58]:
# Columns removed from both splits: the PassengerId copy, the raw Cabin string,
# Name, CabinNum and GroupId. The PassengerId index itself is kept.
drop = ["PassengerId", "Cabin", "Name", "CabinNum", "GroupId"]
train_df = train_df.drop(columns=drop)
test_df = test_df.drop(columns=drop)

# Impute Numeric Missing Values

In [59]:
# Numeric columns handed to `numeric_imputer`.
numeric_cols = [
    "Age",
    "TotalExpense",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

# The test split has no label column, hence `has_labels=False` on the second call.
# NOTE(review): the previews below show these columns rescaled as well as
# imputed, and the same raw value maps to different numbers per split
# (RoomService 0.0 -> -0.337025 in train, -0.361266 in test). That suggests the
# transform is fitted separately on each split - confirm whether the test split
# should reuse the statistics learned from train.
train_df = numeric_imputer(df=train_df, numeric_cols=numeric_cols)
test_df = numeric_imputer(df=test_df, numeric_cols=numeric_cols, has_labels=False)

In [60]:
# Preview the training split after numeric imputation.
train_df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [61]:
# Preview the test split after numeric imputation.
test_df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,GroupSize_7,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,1.0,-0.116957,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
0018_01,0.0,-0.681236,0.0,-0.361266,-0.281826,-0.316173,2.255891,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
0019_01,1.0,0.165182,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
0021_01,0.0,0.658927,0.0,-0.361266,4.067167,-0.316173,-0.109263,0.219987,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
0023_01,0.0,-0.610702,0.0,-0.34479,-0.287719,0.81623,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
