# Spaceship Titanic


In [None]:
%conda install kaggle pandas matplotlib seaborn scikit-learn tensorflow=2.12

In [1]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '../..'
data_dir = '../../data/spaceship-titanic'
!chmod 600 ../../kaggle.json

In [None]:
!kaggle competitions download -c spaceship-titanic -p {data_dir}
!unzip -o {data_dir}/spaceship-titanic.zip -d {data_dir}

In [2]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Normalization, InputLayer, Dropout
from tensorflow.keras.regularizers import l2

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
2024-02-26 02:38:18.347388: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-26 02:38:18.347456: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-26 02:38:18.470692: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory 

In [3]:
data = pd.read_csv(f'{data_dir}/train.csv', index_col='Transported')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
data = data.drop(columns=['Name', 'PassengerId'])
data = data.dropna(axis=0, how='any')
data.head()

<class 'pandas.core.frame.DataFrame'>
Index: 6764 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    6764 non-null   object 
 1   CryoSleep     6764 non-null   object 
 2   Cabin         6764 non-null   object 
 3   Destination   6764 non-null   object 
 4   Age           6764 non-null   float64
 5   VIP           6764 non-null   object 
 6   RoomService   6764 non-null   float64
 7   FoodCourt     6764 non-null   float64
 8   ShoppingMall  6764 non-null   float64
 9   Spa           6764 non-null   float64
 10  VRDeck        6764 non-null   float64
 11  Transported   6764 non-null   bool   
dtypes: bool(1), float64(6), object(5)
memory usage: 640.7+ KB


In [5]:
def embed(data, column):
    data[column] = pd.Categorical(data[column])
    data[column] = data[column].cat.codes
    return data

def embed_all(data):
    for column in data.columns:
        if data[column].dtype == 'object' or data[column].dtype == 'bool':
            data = embed(data, column)
    return data

In [6]:
data = embed_all(data)
data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,1,0,138,2,39.0,0,0.0,0.0,0.0,0.0,0.0,0
1,0,0,1854,2,24.0,0,109.0,9.0,25.0,549.0,44.0,1
2,1,0,1,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0
3,1,0,1,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0
4,0,0,1856,2,16.0,0,303.0,70.0,151.0,565.0,2.0,1


In [7]:
train, valid = train_test_split(data, test_size=0.2, random_state=42)
train_x = train.drop(columns=['Transported'], axis=1)
train_y = train['Transported']
valid_x = valid.drop(columns=['Transported'], axis=1)
valid_y = valid['Transported']

In [12]:
train_x.shape

(5411, 11)

In [64]:
spending = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# get the sum of spending and create a new column
train_x['TotalSpending'] = train_x[spending].sum(axis=1)
valid_x['TotalSpending'] = valid_x[spending].sum(axis=1)
# drop the original columns
train_x = train_x.drop(columns=spending)
valid_x = valid_x.drop(columns=spending)

In [68]:
model = Sequential(
    [
        InputLayer(input_shape=(train_x.shape[1],)),
        Normalization(),
        Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.5),
        Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.5),
        Dense(16, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.5),
        Dense(1)
    ]
)

model.summary()
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    0.00001,
    decay_steps=train_x.shape[0] / 270 * 1000,
    decay_rate=1,
    staircase=False
)
optimizer = tf.keras.optimizers.Adam(lr_schedule)
model.compile(optimizer=optimizer, loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=tf.keras.metrics.BinaryAccuracy())

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_14 (Normaliz  (None, 7)                 15        
 ation)                                                          
                                                                 
 dense_56 (Dense)            (None, 64)                512       
                                                                 
 dropout_42 (Dropout)        (None, 64)                0         
                                                                 
 dense_57 (Dense)            (None, 32)                2080      
                                                                 
 dropout_43 (Dropout)        (None, 32)                0         
                                                                 
 dense_58 (Dense)            (None, 16)                528       
                                                     

In [66]:
train_ds = tf.data.Dataset.from_tensor_slices((train_x.values, train_y.values)).batch(270, drop_remainder=True)
valid_ds = tf.data.Dataset.from_tensor_slices((valid_x.values, valid_y.values)).batch(270)
print(train_ds.element_spec)

(TensorSpec(shape=(270, 7), dtype=tf.float64, name=None), TensorSpec(shape=(270,), dtype=tf.int8, name=None))


In [71]:
metrics = model.fit(train_ds, validation_data=valid_ds, epochs=4400, steps_per_epoch=20)

Epoch 1/4400
Epoch 2/4400
Epoch 3/4400
Epoch 4/4400
Epoch 5/4400
Epoch 6/4400
Epoch 7/4400
Epoch 8/4400
Epoch 9/4400
Epoch 10/4400
Epoch 11/4400
Epoch 12/4400
Epoch 13/4400
Epoch 14/4400
Epoch 15/4400
Epoch 16/4400
Epoch 17/4400
Epoch 18/4400
Epoch 19/4400
Epoch 20/4400
Epoch 21/4400
Epoch 22/4400
Epoch 23/4400
Epoch 24/4400
Epoch 25/4400
Epoch 26/4400
Epoch 27/4400
Epoch 28/4400
Epoch 29/4400
Epoch 30/4400
Epoch 31/4400
Epoch 32/4400
Epoch 33/4400
Epoch 34/4400
Epoch 35/4400
Epoch 36/4400
Epoch 37/4400
Epoch 38/4400
Epoch 39/4400
Epoch 40/4400
Epoch 41/4400
Epoch 42/4400
Epoch 43/4400
Epoch 44/4400
Epoch 45/4400
Epoch 46/4400
Epoch 47/4400
Epoch 48/4400
Epoch 49/4400
Epoch 50/4400
Epoch 51/4400
Epoch 52/4400
Epoch 53/4400
Epoch 54/4400
Epoch 55/4400
Epoch 56/4400
Epoch 57/4400
Epoch 58/4400
Epoch 59/4400
Epoch 60/4400
Epoch 61/4400
Epoch 62/4400
Epoch 63/4400
Epoch 64/4400
Epoch 65/4400
Epoch 66/4400
Epoch 67/4400
Epoch 68/4400
Epoch 69/4400
Epoch 70/4400
Epoch 71/4400
Epoch 72/4400
E

In [55]:
test = pd.read_csv(f'{data_dir}/test.csv')
test.drop(columns=['Name', 'PassengerId'], inplace=True)
test = embed_all(test)

In [56]:
submission = pd.read_csv(f'{data_dir}/sample_submission.csv')
submission['Transported'] = model.predict(test)
submission.head()



Unnamed: 0,PassengerId,Transported
0,0013_01,0.141508
1,0018_01,0.085787
2,0019_01,0.190118
3,0021_01,0.037451
4,0023_01,0.158816


In [57]:
for i in range(submission.shape[0]):
    if submission.loc[i, 'Transported'] > 0.5:
        submission.loc[i, 'Transported'] = 'True'
    else:
        submission.loc[i, 'Transported'] = 'False'

submission.head()

  submission.loc[i, 'Transported'] = 'False'


Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


In [50]:
submission.to_csv(f'{data_dir}/submission.csv', index=False)

In [51]:
!kaggle competitions submit -c spaceship-titanic -f {data_dir}/submission.csv -m "Larger nn with tf."

100%|███████████████████████████████████████| 54.9k/54.9k [00:00<00:00, 109kB/s]
Successfully submitted to Spaceship Titanic