See [Spaceship Titanic - Logistic Regression Baselines](https://www.kaggle.com/code/defcodeking/spaceship-titanic-logistic-regression-baselines) for more details.

# Imports

In [1]:
import random
import os

import numpy as np
import pandas as pd
from sklearn import linear_model, metrics

from src import make_folds
from src.encoder import encode_features
from src.feature_engineering import FeatureEngineer
from src.imputer import CategoricalImputer, numeric_imputer


In [2]:
def seed_everything(seed=42):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and
    stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE(review): Python reads ``PYTHONHASHSEED`` at interpreter start-up,
    so setting it here presumably only affects child processes — confirm.

    Args:
        seed: Seed value handed to every generator.
    """
    random.seed(seed)
    os.environ.update({"PYTHONHASHSEED": str(seed)})
    np.random.seed(seed)


# Seed once, with the default value, before any data is touched.
seed_everything()

# Config

In [3]:
# Directory containing train.csv and test.csv; `filepath` joins file names onto it.
DATA_DIR = "./data"

In [4]:
def filepath(filename):
    """Return the path of ``filename`` inside the data directory.

    Args:
        filename: Name of a file expected to live under ``DATA_DIR``.

    Returns:
        ``DATA_DIR`` and ``filename`` joined with ``os.path.join``.
    """
    path_parts = (DATA_DIR, filename)
    return os.path.join(*path_parts)

# Load Datasets

In [5]:
# Read both files indexed by passenger id, then append the id again as a
# regular column (presumably so later steps can read it as a feature — confirm).
train_df = pd.read_csv(filepath("train.csv"), index_col="PassengerId").assign(
    PassengerId=lambda frame: frame.index
)
test_df = pd.read_csv(filepath("test.csv"), index_col="PassengerId").assign(
    PassengerId=lambda frame: frame.index
)

# Row counts of the two splits.
len(train_df), len(test_df)

# Feature Engineering

In [6]:
# NOTE(review): `missing_value_cols` presumably lists the columns that get a
# `<col>_missing` indicator (see the `*_missing` columns in the output
# below) — confirm in src.feature_engineering.
engineer = FeatureEngineer(
    missing_value_cols=["RoomService", "FoodCourt", "ShoppingMall", "Cabin", "VIP"]
)

# First pass: skip the Cabin-derived features. They are engineered in a
# second pass, once the missing values have been imputed.
train_df = engineer(train_df, exclude="cabin")
test_df = engineer(test_df, exclude="cabin")

In [7]:
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,GroupId,GroupSize,Alone,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,...,1,1,True,False,False,False,False,False,False,0.0
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,...,2,1,True,False,False,False,False,False,False,736.0
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,...,3,2,False,False,False,False,False,False,False,10383.0
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,...,3,2,False,False,False,False,False,False,False,5176.0
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,...,4,1,True,False,False,False,False,False,False,1091.0


In [8]:
test_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,GroupId,GroupSize,Alone,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,...,13,1,True,False,False,False,False,False,False,0.0
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,...,18,1,True,False,False,False,False,False,False,2832.0
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,...,19,1,True,False,False,False,False,False,False,0.0
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,...,21,1,True,False,False,False,False,False,False,7418.0
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,...,23,1,True,False,False,False,False,False,False,645.0


# Impute Categorical Missing Values

In [9]:
# NOTE(review): going by the `feature_mode_cols` name, these three columns are
# presumably filled with their most frequent value — confirm in src.imputer.
categorical_imputer = CategoricalImputer(feature_mode_cols=["HomePlanet", "CryoSleep", "Destination"])

# The same imputer object is applied to each frame in a separate call.
train_df = categorical_imputer(train_df)
test_df = categorical_imputer(test_df)

In [10]:
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,GroupId,GroupSize,Alone,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,...,1,1,True,False,False,False,False,False,False,0.0
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,...,2,1,True,False,False,False,False,False,False,736.0
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,...,3,2,False,False,False,False,False,False,False,10383.0
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,...,3,2,False,False,False,False,False,False,False,5176.0
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,...,4,1,True,False,False,False,False,False,False,1091.0


In [11]:
train_df.isna().any()

In [12]:
test_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,GroupId,GroupSize,Alone,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,...,13,1,True,False,False,False,False,False,False,0.0
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,...,18,1,True,False,False,False,False,False,False,2832.0
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,...,19,1,True,False,False,False,False,False,False,0.0
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,...,21,1,True,False,False,False,False,False,False,7418.0
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,...,23,1,True,False,False,False,False,False,False,645.0


# Engineer Features From `Cabin`

In [13]:
# Second pass: exclude every supported feature except "cabin", so that only
# the Cabin-derived columns are added (CabinDeck, CabinNum and CabinSide in
# the output below).
exclude = set(engineer.supported_features) - {"cabin"}
train_df = engineer(train_df, exclude=exclude)
test_df = engineer(test_df, exclude=exclude)

In [14]:
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense,CabinDeck,CabinNum,CabinSide
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,0.0,B,0,P
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,...,False,False,False,False,False,False,736.0,F,0,S
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,...,False,False,False,False,False,False,10383.0,A,0,S
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,...,False,False,False,False,False,False,5176.0,A,0,S
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,...,False,False,False,False,False,False,1091.0,F,1,S


In [15]:
train_df.isna().any()

In [16]:
test_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense,CabinDeck,CabinNum,CabinSide
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,0.0,G,3,S
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,...,False,False,False,False,False,False,2832.0,F,4,S
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,0.0,C,0,S
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,...,False,False,False,False,False,False,7418.0,C,1,S
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,...,False,False,False,False,False,False,645.0,F,5,S


# Feature Encoding

In [17]:
# Columns expanded into one indicator column per category
# (e.g. CabinDeck -> CabinDeck_A ... CabinDeck_T in the output below).
one_hot_cols = ["HomePlanet", "Destination", "GroupSize", "CabinDeck"]

# Both frames are encoded in a single call; CabinSide is label-encoded
# rather than one-hot expanded.
# NOTE(review): CryoSleep and VIP show up as 0/1 afterwards, so the helper
# presumably converts boolean columns as well — confirm in src.encoder.
train_df, test_df = encode_features(
    train_df=train_df,
    test_df=test_df,
    one_hot_cols=one_hot_cols,
    label_encoding_cols=["CabinSide"],
)

In [18]:
train_df.head()

Unnamed: 0_level_0,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,...,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0,B/0/P,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,...,0,0,1,0,0,0,0,0,0,False
0002_01,0,F/0/S,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,...,0,0,0,0,0,0,1,0,0,True
0003_01,0,A/0/S,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,...,0,1,0,0,0,0,0,0,0,False
0003_02,0,A/0/S,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,...,0,1,0,0,0,0,0,0,0,False
0004_01,0,F/1/S,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,...,0,0,0,0,0,0,1,0,0,True


In [19]:
test_df.head()

Unnamed: 0_level_0,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,...,GroupSize_7,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,1,G/3/S,27.0,0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,...,0,0,0,0,0,0,0,0,1,0
0018_01,0,F/4/S,19.0,0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,...,0,0,0,0,0,0,0,1,0,0
0019_01,1,C/0/S,31.0,0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,...,0,0,0,0,1,0,0,0,0,0
0021_01,0,C/1/S,38.0,0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,...,0,0,0,0,1,0,0,0,0,0
0023_01,0,F/5/S,20.0,0,10.0,0.0,635.0,0.0,0.0,Brence Harperez,...,0,0,0,0,0,0,0,1,0,0


# Drop Unnecessary Features

In [20]:
# Columns left out of the baseline feature set.
drop = ["PassengerId", "Cabin", "Name", "CabinNum", "GroupId"]

# Set CabinNum and GroupId aside first: the later experiments add them back.
train_cabinnum, test_cabinnum = train_df["CabinNum"], test_df["CabinNum"]
train_groupid, test_groupid = train_df["GroupId"], test_df["GroupId"]

train_df = train_df.drop(columns=drop)
test_df = test_df.drop(columns=drop)

# Impute Numeric Missing Values

In [21]:
# Continuous columns handed to the numeric imputer.
numeric_cols = [
    "Age",
    "TotalExpense",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

# Each frame is processed in its own call; `has_labels=False` marks the test
# frame, which has no `Transported` column.
# NOTE(review): in the outputs below the numeric columns come back
# standardised, and the same raw value maps to different results per frame
# (RoomService 0.0 -> -0.337025 in train but -0.361266 in test). The helper
# therefore presumably fits its scaling statistics on each frame separately —
# confirm in src.imputer, and consider reusing the train statistics for test.
train_df = numeric_imputer(df=train_df, numeric_cols=numeric_cols)
test_df = numeric_imputer(df=test_df, numeric_cols=numeric_cols, has_labels=False)

In [22]:
train_df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [23]:
test_df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,GroupSize_7,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,1.0,-0.116957,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
0018_01,0.0,-0.681236,0.0,-0.361266,-0.281826,-0.316173,2.255891,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
0019_01,1.0,0.165182,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
0021_01,0.0,0.658927,0.0,-0.361266,4.067167,-0.316173,-0.109263,0.219987,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
0023_01,0.0,-0.610702,0.0,-0.34479,-0.287719,0.81623,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Create Folds

In [24]:
# Adds a `kfold` column (last column in the output below) that the training
# loop uses to split rows into training and validation sets.
train_df = make_folds(df=train_df, n_folds=5)
train_df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,Transported,kfold
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,4


# Logistic Regression

## Training Loop

In [25]:
def train(df):
    """Cross-validate a logistic regression model using the folds stored in ``df``.

    For every distinct value in the ``kfold`` column, a fresh
    ``LogisticRegression`` is fit on the rows of all other folds and used to
    predict the held-out rows. The accuracy of each fold and the overall
    out-of-fold accuracy are printed.

    The fold ids are read from the data instead of being assumed to be
    ``0..4``, so frames built with a different number of folds work as well.

    Args:
        df: DataFrame with the feature columns plus ``Transported`` (target)
            and ``kfold`` (fold assignment). It is modified in place: a
            ``preds`` column is added and ``Transported``, ``preds`` and
            ``kfold`` are cast to ``int``. Pass a copy to keep the caller's
            frame untouched.

    Returns:
        The same ``df`` object, with the out-of-fold predictions in ``preds``.
    """
    df["preds"] = pd.NA
    # Bookkeeping columns that must never reach the model as features.
    drop = ["Transported", "preds", "kfold"]

    # Take the fold ids from the data: a hard-coded range(5) yields empty
    # validation sets with fewer folds and leaves rows unpredicted with more.
    for fold in sorted(df["kfold"].unique()):
        # Named *_rows so the locals do not shadow this function's own name.
        train_rows = df[df["kfold"] != fold]
        val_rows = df[df["kfold"] == fold]

        y_train = train_rows["Transported"].values
        X_train = train_rows.drop(drop, axis=1).values

        y_val = val_rows["Transported"].values
        X_val = val_rows.drop(drop, axis=1).values

        model = linear_model.LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)

        preds = model.predict(X_val)
        # Store the out-of-fold predictions next to the rows they belong to.
        df.loc[val_rows.index, "preds"] = preds

        acc = metrics.accuracy_score(y_val, preds)
        print(f"Fold {fold + 1} - Accuracy = {acc: .4f}")

    # `preds` started as an object column of pd.NA; every row now has a
    # prediction, so all three bookkeeping columns can be made integer.
    df[drop] = df[drop].astype(int)

    acc = metrics.accuracy_score(df["Transported"].values, df["preds"].values)
    print(f"Overall accuracy = {acc: .4f}")

    return df

## Experiment 1: Without `CabinNum` and `GroupId`

In [26]:
# Pass a copy: train() adds a `preds` column to, and recasts columns of, the frame it receives.
train_df_exp1 = train(train_df.copy())

Fold 1 - Accuracy =  0.7867
Fold 2 - Accuracy =  0.7861
Fold 3 - Accuracy =  0.7936
Fold 4 - Accuracy =  0.7975
Fold 5 - Accuracy =  0.7969
Overall accuracy =  0.7921


## Experiment 2: With `CabinNum` One-Hot Encoded

In [27]:
# Make copies so that the original dataframes do not change
train_df_with_cabinnum_onehot = train_df.copy()
test_df_with_cabinnum_onehot = test_df.copy()


# Re-attach the CabinNum values that were set aside before the column was dropped
train_df_with_cabinnum_onehot["CabinNum"] = train_cabinnum
test_df_with_cabinnum_onehot["CabinNum"] = test_cabinnum

# One-hot encode CabinNum only; this yields one indicator column per cabin
# number (e.g. CabinNum_999 in the output below).
# NOTE(review): `bool_exclude` presumably keeps the already-processed columns
# from being treated as booleans again, and `drop_and_restore` presumably sets
# `kfold` aside during encoding and puts it back — confirm in src.encoder.
train_df_with_cabinnum_onehot, test_df_with_cabinnum_onehot = encode_features(
    train_df=train_df_with_cabinnum_onehot,
    test_df=test_df_with_cabinnum_onehot,
    bool_exclude=test_df.columns,
    one_hot_cols=["CabinNum"],
    drop_and_restore=["kfold"],
)

In [28]:
train_df_with_cabinnum_onehot.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,CabinNum_992,CabinNum_993,CabinNum_994,CabinNum_995,CabinNum_996,CabinNum_997,CabinNum_998,CabinNum_999,Transported,kfold
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0,0,0,0,0,0,0,0,1.0,3
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,4
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,4
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0,0,0,0,0,0,0,0,1.0,4


In [29]:
test_df_with_cabinnum_onehot.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,CabinNum_990,CabinNum_991,CabinNum_992,CabinNum_993,CabinNum_994,CabinNum_995,CabinNum_996,CabinNum_997,CabinNum_998,CabinNum_999
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,1.0,-0.116957,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0018_01,0.0,-0.681236,0.0,-0.361266,-0.281826,-0.316173,2.255891,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0019_01,1.0,0.165182,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0021_01,0.0,0.658927,0.0,-0.361266,4.067167,-0.316173,-0.109263,0.219987,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0023_01,0.0,-0.610702,0.0,-0.34479,-0.287719,0.81623,-0.271297,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Pass a copy: train() adds a `preds` column to, and recasts columns of, the frame it receives.
train_df_exp2 = train(train_df_with_cabinnum_onehot.copy())

Fold 1 - Accuracy =  0.7832
Fold 2 - Accuracy =  0.7872
Fold 3 - Accuracy =  0.7941
Fold 4 - Accuracy =  0.7992
Fold 5 - Accuracy =  0.7831
Overall accuracy =  0.7894


## Experiment 3: With `CabinNum` Label Encoded

In [31]:
# Make copies so that the original dataframes do not change
train_df_with_cabinnum_le = train_df.copy()
test_df_with_cabinnum_le = test_df.copy()


# Re-attach the CabinNum values that were set aside before the column was dropped
train_df_with_cabinnum_le["CabinNum"] = train_cabinnum
test_df_with_cabinnum_le["CabinNum"] = test_cabinnum

# Label-encode CabinNum: it stays a single integer-coded column (see the
# output below) instead of being expanded into indicator columns.
# NOTE(review): `bool_exclude` presumably keeps the already-processed columns
# from being treated as booleans again, and `drop_and_restore` presumably sets
# `kfold` aside during encoding and puts it back — confirm in src.encoder.
train_df_with_cabinnum_le, test_df_with_cabinnum_le = encode_features(
    train_df=train_df_with_cabinnum_le,
    test_df=test_df_with_cabinnum_le,
    bool_exclude=test_df.columns,
    label_encoding_cols=["CabinNum"],
    drop_and_restore=["kfold"],
)

In [32]:
train_df_with_cabinnum_le.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinNum,Transported,kfold
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,14,0.0,0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,14,1.0,3
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14,0.0,4
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14,0.0,4
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,88,1.0,4


In [33]:
test_df_with_cabinnum_le.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinNum
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,1.0,-0.116957,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,60
0018_01,0.0,-0.681236,0.0,-0.361266,-0.281826,-0.316173,2.255891,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3
0019_01,1.0,0.165182,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,14
0021_01,0.0,0.658927,0.0,-0.361266,4.067167,-0.316173,-0.109263,0.219987,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,88
0023_01,0.0,-0.610702,0.0,-0.34479,-0.287719,0.81623,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,77


In [34]:
# Pass a copy: train() adds a `preds` column to, and recasts columns of, the frame it receives.
train_df_exp3 = train(train_df_with_cabinnum_le.copy())

Fold 1 - Accuracy =  0.7849
Fold 2 - Accuracy =  0.7907
Fold 3 - Accuracy =  0.7959
Fold 4 - Accuracy =  0.7975
Fold 5 - Accuracy =  0.8021
Overall accuracy =  0.7942


## Experiment 4: With `GroupId` One-Hot Encoded

In [35]:
# Make copies so that the original dataframes do not change
train_df_with_groupid_onehot = train_df.copy()
test_df_with_groupid_onehot = test_df.copy()


# Re-attach the GroupId values that were set aside before the column was dropped
train_df_with_groupid_onehot["GroupId"] = train_groupid
test_df_with_groupid_onehot["GroupId"] = test_groupid

# One-hot encode GroupId only; this yields one indicator column per group id
# (e.g. GroupId_9280 in the output below).
# NOTE(review): `bool_exclude` presumably keeps the already-processed columns
# from being treated as booleans again, and `drop_and_restore` presumably sets
# `kfold` aside during encoding and puts it back — confirm in src.encoder.
train_df_with_groupid_onehot, test_df_with_groupid_onehot = encode_features(
    train_df=train_df_with_groupid_onehot,
    test_df=test_df_with_groupid_onehot,
    bool_exclude=test_df.columns,
    one_hot_cols=["GroupId"],
    drop_and_restore=["kfold"],
)

In [36]:
train_df_with_groupid_onehot.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,GroupId_9273,GroupId_9274,GroupId_9275,GroupId_9276,GroupId_9277,GroupId_9278,GroupId_9279,GroupId_9280,Transported,kfold
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0,0,0,0,0,0,0,0,1.0,3
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,4
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,4
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0,0,0,0,0,0,0,0,1.0,4


In [37]:
test_df_with_groupid_onehot.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,GroupId_9271,GroupId_9272,GroupId_9273,GroupId_9274,GroupId_9275,GroupId_9276,GroupId_9277,GroupId_9278,GroupId_9279,GroupId_9280
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,1.0,-0.116957,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0018_01,0.0,-0.681236,0.0,-0.361266,-0.281826,-0.316173,2.255891,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0019_01,1.0,0.165182,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0021_01,0.0,0.658927,0.0,-0.361266,4.067167,-0.316173,-0.109263,0.219987,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0023_01,0.0,-0.610702,0.0,-0.34479,-0.287719,0.81623,-0.271297,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Pass a copy: train() adds a `preds` column to, and recasts columns of, the frame it receives.
train_df_exp4 = train(train_df_with_groupid_onehot.copy())

Fold 1 - Accuracy =  0.7821
Fold 2 - Accuracy =  0.7878
Fold 3 - Accuracy =  0.7959
Fold 4 - Accuracy =  0.7957
Fold 5 - Accuracy =  0.7940
Overall accuracy =  0.7911


## Experiment 5: With `GroupId` Label Encoded

In [39]:
# Make copies so that the original dataframes do not change
train_df_with_groupid_le = train_df.copy()
test_df_with_groupid_le = test_df.copy()

# Re-attach the GroupId values that were set aside before the column was dropped
train_df_with_groupid_le["GroupId"] = train_groupid
test_df_with_groupid_le["GroupId"] = test_groupid

# Label-encode GroupId: it stays a single integer-coded column (see the
# output below) instead of being expanded into indicator columns.
# NOTE(review): `bool_exclude` presumably keeps the already-processed columns
# from being treated as booleans again, and `drop_and_restore` presumably sets
# `kfold` aside during encoding and puts it back — confirm in src.encoder.
train_df_with_groupid_le, test_df_with_groupid_le = encode_features(
    train_df=train_df_with_groupid_le,
    test_df=test_df_with_groupid_le,
    bool_exclude=test_df.columns,
    label_encoding_cols=["GroupId"],
    drop_and_restore=["kfold"],
)

In [40]:
train_df_with_groupid_le.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,GroupId,Transported,kfold
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3271,0.0,0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8087,1.0,3
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2127,0.0,4
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2127,0.0,4
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6590,1.0,4


In [41]:
test_df_with_groupid_le.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,GroupId
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,1.0,-0.116957,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3144
0018_01,0.0,-0.681236,0.0,-0.361266,-0.281826,-0.316173,2.255891,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3118
0019_01,1.0,0.165182,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3117
0021_01,0.0,0.658927,0.0,-0.361266,4.067167,-0.316173,-0.109263,0.219987,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3116
0023_01,0.0,-0.610702,0.0,-0.34479,-0.287719,0.81623,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3115


In [42]:
# Pass a copy: train() adds a `preds` column to, and recasts columns of, the frame it receives.
train_df_exp5 = train(train_df_with_groupid_le.copy())

Fold 1 - Accuracy =  0.7907
Fold 2 - Accuracy =  0.7884
Fold 3 - Accuracy =  0.7964
Fold 4 - Accuracy =  0.7940
Fold 5 - Accuracy =  0.7980
Overall accuracy =  0.7935


## Experiment 6: With `CabinNum` and `GroupId` One-Hot Encoded

In [43]:
# Make copies so that the original dataframes do not change
train_df_with_both_onehot = train_df.copy()
test_df_with_both_onehot = test_df.copy()


# Re-attach both columns that were set aside before they were dropped
train_df_with_both_onehot["CabinNum"] = train_cabinnum
test_df_with_both_onehot["CabinNum"] = test_cabinnum

train_df_with_both_onehot["GroupId"] = train_groupid
test_df_with_both_onehot["GroupId"] = test_groupid

# One-hot encode CabinNum and GroupId together in a single call.
# NOTE(review): `bool_exclude` presumably keeps the already-processed columns
# from being treated as booleans again, and `drop_and_restore` presumably sets
# `kfold` aside during encoding and puts it back — confirm in src.encoder.
train_df_with_both_onehot, test_df_with_both_onehot = encode_features(
    train_df=train_df_with_both_onehot,
    test_df=test_df_with_both_onehot,
    bool_exclude=test_df.columns,
    one_hot_cols=["CabinNum", "GroupId"],
    drop_and_restore=["kfold"],
)

In [44]:
# Preview the first five rows of the one-hot encoded training frame
train_df_with_both_onehot.head(n=5)

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,GroupId_9273,GroupId_9274,GroupId_9275,GroupId_9276,GroupId_9277,GroupId_9278,GroupId_9279,GroupId_9280,Transported,kfold
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0,0,0,0,0,0,0,0,1.0,3
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,4
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,4
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0,0,0,0,0,0,0,0,1.0,4


In [45]:
# Preview the first five rows of the one-hot encoded test frame
test_df_with_both_onehot.head(n=5)

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,GroupId_9271,GroupId_9272,GroupId_9273,GroupId_9274,GroupId_9275,GroupId_9276,GroupId_9277,GroupId_9278,GroupId_9279,GroupId_9280
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,1.0,-0.116957,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0018_01,0.0,-0.681236,0.0,-0.361266,-0.281826,-0.316173,2.255891,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0019_01,1.0,0.165182,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0021_01,0.0,0.658927,0.0,-0.361266,4.067167,-0.316173,-0.109263,0.219987,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0023_01,0.0,-0.610702,0.0,-0.34479,-0.287719,0.81623,-0.271297,-0.249197,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Train on a deep copy so the encoded frame shown above is left as-is
train_df_exp6 = train(train_df_with_both_onehot.copy(deep=True))

Fold 1 - Accuracy =  0.7832
Fold 2 - Accuracy =  0.7895
Fold 3 - Accuracy =  0.7878
Fold 4 - Accuracy =  0.8003
Fold 5 - Accuracy =  0.7854
Overall accuracy =  0.7893


## Experiment 7: With `CabinNum` and `GroupId` Label Encoded

In [47]:
# assign() returns new frames, so train_df / test_df themselves stay unchanged;
# the previously saved CabinNum and GroupId columns are attached to the copies
train_df_with_both_le = train_df.assign(
    CabinNum=train_cabinnum,
    GroupId=train_groupid,
)
test_df_with_both_le = test_df.assign(
    CabinNum=test_cabinnum,
    GroupId=test_groupid,
)

# Encode with both CabinNum and GroupId label encoded
(
    train_df_with_both_le,
    test_df_with_both_le,
) = encode_features(
    train_df=train_df_with_both_le,
    test_df=test_df_with_both_le,
    bool_exclude=test_df.columns,
    label_encoding_cols=["CabinNum", "GroupId"],
    drop_and_restore=["kfold"],
)

In [48]:
# Preview the first five rows of the label-encoded training frame
train_df_with_both_le.head(n=5)

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinNum,GroupId,Transported,kfold
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,14,3271,0.0,0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,14,8087,1.0,3
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,14,2127,0.0,4
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,14,2127,0.0,4
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,88,6590,1.0,4


In [49]:
# Preview the first five rows of the label-encoded test frame
test_df_with_both_le.head(n=5)

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinNum,GroupId
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,1.0,-0.116957,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,60,3144
0018_01,0.0,-0.681236,0.0,-0.361266,-0.281826,-0.316173,2.255891,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3,3118
0019_01,1.0,0.165182,0.0,-0.361266,-0.287719,-0.316173,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,14,3117
0021_01,0.0,0.658927,0.0,-0.361266,4.067167,-0.316173,-0.109263,0.219987,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,88,3116
0023_01,0.0,-0.610702,0.0,-0.34479,-0.287719,0.81623,-0.271297,-0.249197,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,77,3115


In [50]:
# Train on a copy, as the other experiments do, so that train() cannot
# modify the encoded frame that was displayed above
train_df_exp7 = train(train_df_with_both_le.copy())

Fold 1 - Accuracy =  0.7815
Fold 2 - Accuracy =  0.7907
Fold 3 - Accuracy =  0.8033
Fold 4 - Accuracy =  0.7952
Fold 5 - Accuracy =  0.8061
Overall accuracy =  0.7954


# Experiment Conclusion

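The experiments indicate that the best overall accuracy (0.7954) is obtained when both `CabinNum` and `GroupId` are included in the model and label encoded. Note that the margin over the baseline without these features (0.7921) is small.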

|                   **Experiment**                   | **Fold 1** | **Fold 2** | **Fold 3** | **Fold 4** | **Fold 5** | **Overall** |
|:--------------------------------------------------:|:----------:|:----------:|:----------:|:----------:|:----------:|:-----------:|
|          Without `CabinNum` and `GroupId`          |   0.7867   |   0.7861   |   0.7936   |   0.7975   |   0.7969   |    0.7921   |
|           With only `CabinNum` (One-Hot)           |   0.7832   |   0.7872   |   0.7941   |   0.7992   |   0.7831   |    0.7894   |
|        With only `CabinNum` (Label Encoded)        |   0.7849   | **0.7907** |   0.7959   |   0.7975   |   0.8021   |    0.7942   |
|            With only `GroupId` (One-Hot)           |   0.7821   |   0.7878   |   0.7959   |   0.7957   |   0.7940   |    0.7911   |
|         With only `GroupId` (Label Encoded)        | **0.7907** |   0.7884   |   0.7964   |   0.7940   |   0.7980   |    0.7935   |
|    With both `CabinNum` and `GroupId` (One-Hot)    |   0.7832   |   0.7895   |   0.7878   | **0.8003** |   0.7854   |    0.7893   |
| With both `CabinNum` and `GroupId` (Label Encoded) |   0.7815   | **0.7907** | **0.8033** |   0.7952   | **0.8061** |  **0.7954** |