In [152]:
import re
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

In [175]:
BASE_PATH = "/kaggle/input/spaceship-titanic/"
TRAIN_CSV_PATH = BASE_PATH + "train.csv"
TEST_CSV_PATH = BASE_PATH + "test.csv"
OUTPUT_PATH = "/kaggle/working"

In [154]:
train_df = pd.read_csv(TRAIN_CSV_PATH, )
test_df = pd.read_csv(TEST_CSV_PATH)

train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [155]:
unique_series = train_df.nunique().rename('unique_count')
na_series = train_df.isna().sum().rename('na_count')
type_series = train_df.dtypes.rename('type')

train_info_df = pd.concat([unique_series, na_series, type_series], axis=1)
train_info_df

Unnamed: 0,unique_count,na_count,type
PassengerId,8693,0,object
HomePlanet,3,201,object
CryoSleep,2,217,object
Cabin,6560,199,object
Destination,3,182,object
Age,80,179,float64
VIP,2,203,object
RoomService,1273,181,float64
FoodCourt,1507,183,float64
ShoppingMall,1115,208,float64


In [156]:
bool_features = ["CryoSleep", "VIP"]

train_df["Transported"] = train_df["Transported"].astype(int)
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1


In [157]:
def split_cabin(df):
    cabin_df = df["Cabin"].str.split("/", expand=True).rename(columns={
        0: "CabinDeck",
        1: "CabinNumber",
        2: "CabinSide"
    })
    return pd.concat([df.drop(["Cabin"], axis=1), cabin_df], axis=1)

train_df = split_cabin(train_df)
test_df = split_cabin(test_df)

train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,CabinDeck,CabinNumber,CabinSide
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,F,1,S


In [158]:
def encode_features(df):
    features_to_encode = ["HomePlanet", "Destination", "CabinDeck", "CabinSide", *bool_features]
    encoded_features_df = pd.get_dummies(df[features_to_encode].replace(" ", "-", regex=True), prefix_sep="")
    
    return pd.concat([df.drop(features_to_encode, axis=1), encoded_features_df], axis=1)

train_df = encode_features(train_df)
test_df = encode_features(test_df)

train_df.head()

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,CabinNumber,...,CabinDeckE,CabinDeckF,CabinDeckG,CabinDeckT,CabinSideP,CabinSideS,CryoSleepFalse,CryoSleepTrue,VIPFalse,VIPTrue
0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,0,...,0,0,0,0,1,0,1,0,1,0
1,0002_01,24.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,0,...,0,1,0,0,0,1,1,0,1,0
2,0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,0,...,0,0,0,0,0,1,1,0,0,1
3,0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,0,...,0,0,0,0,0,1,1,0,1,0
4,0004_01,16.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,1,...,0,1,0,0,0,1,1,0,1,0


In [159]:
def cast_numerics(df):
    numeric_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    df[numeric_features] = df[numeric_features].apply(pd.to_numeric, errors="coerce")

cast_numerics(train_df)
cast_numerics(test_df)

train_df.isna().sum()

PassengerId                   0
Age                         179
RoomService                 181
FoodCourt                   183
ShoppingMall                208
Spa                         183
VRDeck                      188
Name                        200
Transported                   0
CabinNumber                 199
HomePlanetEarth               0
HomePlanetEuropa              0
HomePlanetMars                0
Destination55-Cancri-e        0
DestinationPSO-J318.5-22      0
DestinationTRAPPIST-1e        0
CabinDeckA                    0
CabinDeckB                    0
CabinDeckC                    0
CabinDeckD                    0
CabinDeckE                    0
CabinDeckF                    0
CabinDeckG                    0
CabinDeckT                    0
CabinSideP                    0
CabinSideS                    0
CryoSleepFalse                0
CryoSleepTrue                 0
VIPFalse                      0
VIPTrue                       0
dtype: int64

In [160]:
px.imshow(train_df.corr())

In [161]:
train_df.corr()['Transported'].abs().sort_values().rename("TransportedCorr").to_frame().drop(["Transported"])

Unnamed: 0,TransportedCorr
DestinationPSO-J318.5-22,9.2e-05
CabinDeckA,0.002623
ShoppingMall,0.010141
CabinDeckT,0.014568
CabinDeckG,0.016269
HomePlanetMars,0.019544
VIPFalse,0.024602
CabinDeckD,0.034046
VIPTrue,0.037261
FoodCourt,0.046566


In [162]:
drop_features = ["PassengerId", "Name"]

test_series_passenger_ids = test_df["PassengerId"]

train_df = train_df.drop(drop_features, axis=1)
test_df = test_df.drop(drop_features, axis=1)

train_df.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,CabinNumber,HomePlanetEarth,HomePlanetEuropa,...,CabinDeckE,CabinDeckF,CabinDeckG,CabinDeckT,CabinSideP,CabinSideS,CryoSleepFalse,CryoSleepTrue,VIPFalse,VIPTrue
0,39.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,...,0,0,0,0,1,0,1,0,1,0
1,24.0,109.0,9.0,25.0,549.0,44.0,1,0,1,0,...,0,1,0,0,0,1,1,0,1,0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,1
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0,0,0,1,...,0,0,0,0,0,1,1,0,1,0
4,16.0,303.0,70.0,151.0,565.0,2.0,1,1,1,0,...,0,1,0,0,0,1,1,0,1,0


In [163]:
continuous_features = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck"
]

scaler = MinMaxScaler()

X_train = train_df.drop(['Transported'], axis=1)
y_train = train_df['Transported']

X_train_encoded = scaler.fit_transform(X_train[continuous_features])
X_train_cont = scaler.fit_transform(X_train.drop(continuous_features, axis=1))

X_train = np.concatenate([X_train_encoded, X_train_cont], axis=1)

In [164]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

X_train = imp.fit_transform(X_train)
X_train

array([[4.93670886e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [3.03797468e-01, 7.60801284e-03, 3.01881729e-04, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [7.34177215e-01, 3.00132617e-03, 1.19947674e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       ...,
       [3.29113924e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [4.05063291e-01, 0.00000000e+00, 3.51859927e-02, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [5.56962025e-01, 8.79458365e-03, 1.57246839e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

In [165]:
# model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(128, 64, 32, 16, 8, 4, 2), random_state=1)
model = SVC()

model.fit(X_train, y_train)

SVC()

In [166]:
X_test_encoded = scaler.fit_transform(test_df[continuous_features])
X_test_cont = scaler.fit_transform(test_df.drop(continuous_features, axis=1))

X_test = np.concatenate([X_test_encoded, X_test_cont], axis=1)

X_test = imp.fit_transform(X_test)

X_test

array([[3.41772152e-01, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [2.40506329e-01, 0.00000000e+00, 3.56111265e-04, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [3.92405063e-01, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [3.62761344e-01, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [3.62761344e-01, 0.00000000e+00, 1.06042021e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [5.44303797e-01, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

In [167]:
pred = model.predict(X_test)
pred

array([1, 0, 1, ..., 1, 0, 1])

In [168]:
pred_df = pd.DataFrame(pred, columns=["Transported"])
pred_df["Transported"] = pred_df["Transported"].astype(bool)
pred_df.head()

Unnamed: 0,Transported
0,True
1,False
2,True
3,True
4,False


In [176]:
submission_df = pd.concat([test_series_passenger_ids, pred_df], axis=1)
submission_df

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,False


In [177]:
submission_df.to_csv(OUTPUT_PATH + "/submission.csv", index=False)