# Spaceship Titanic


In [None]:
# Install the packages this notebook uses: the Kaggle CLI, the data and
# plotting stack, scikit-learn, and TensorFlow.
%pip install kaggle
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install scikit-learn
%pip install tensorflow[and-cuda] #  remove [and-cuda] if you don't have a GPU

In [None]:
# Fetch the competition data with the Kaggle CLI and unpack it into data_dir.
import os
# Tell the Kaggle CLI to look for its configuration in the parent directory
# (the same place as the kaggle.json whose permissions are set below).
os.environ['KAGGLE_CONFIG_DIR'] = '..'
# Directory the competition files are downloaded to and later read from.
data_dir = '../data/spaceship-titanic'
# Restrict the credentials file to owner read/write.
!chmod 600 ../kaggle.json
# Download the competition archive into data_dir.
!kaggle competitions download -c spaceship-titanic -p {data_dir}
# Extract the archive in place; -o overwrites existing files without prompting.
!unzip -o {data_dir}/spaceship-titanic.zip -d {data_dir}

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Normalization, InputLayer, Dropout
from tensorflow.keras.regularizers import l2

In [None]:
# Read the labelled training set and print its column/dtype/null summary.
train_csv_path = f'{data_dir}/train.csv'
data = pd.read_csv(train_csv_path)
data.info()

In [None]:
# Remove the Name and PassengerId columns, then keep only the rows that have
# no missing value in any remaining column.
data = data.drop(columns=['Name', 'PassengerId']).dropna(axis=0, how='any')
data.info()

In [None]:
def embed(data, column):
    """Replace one column of *data* with integer category codes.

    The column is converted to a pandas categorical and overwritten with the
    categorical's codes; missing values receive the code -1.

    Args:
        data: DataFrame to modify. It is changed in place.
        column: Label of the column to encode.

    Returns:
        The same DataFrame object that was passed in.
    """
    data[column] = data[column].astype('category').cat.codes
    return data

def embed_all(data):
    """Encode every non-numeric column of *data* as integer category codes.

    Columns with object, boolean, or string dtype are passed through
    ``embed``; all other columns are left untouched. The dtype is tested with
    the ``pd.api.types`` predicates rather than by comparing against the
    literal names 'object'/'bool', so columns stored with pandas' dedicated
    string dtype (e.g. when ``future.infer_string`` is enabled) or the
    nullable boolean dtype are encoded as well.

    Args:
        data: DataFrame to encode. It is modified in place by ``embed``.

    Returns:
        The encoded DataFrame.
    """
    for column in data.columns:
        dtype = data[column].dtype
        needs_encoding = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_bool_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if needs_encoding:
            data = embed(data, column)
    return data

In [None]:
# Replace every object/bool column of the training frame with integer
# category codes, then preview the first rows of the result.
data = embed_all(data)
data.head()

In [None]:
# Hold out 20% of the rows for validation (fixed seed for repeatability) and
# separate the 'Transported' target from the feature columns in each part.
train, valid = train_test_split(data, test_size=0.2, random_state=42)
train_x, train_y = train.drop(columns='Transported'), train['Transported']
valid_x, valid_y = valid.drop(columns='Transported'), valid['Transported']

In [None]:
# Print the column/dtype/null summary of the training feature frame.
train_x.info()

In [None]:
# A Normalization layer only standardises its inputs with statistics from the
# data once adapt() has been called on it; without that call it carries no
# information about the feature means/variances. Fit it on the training
# features before wiring it into the network.
normalizer = Normalization()
normalizer.adapt(train_x.to_numpy(dtype='float32'))

# Small fully connected binary classifier: two L2-regularised ReLU layers,
# each followed by 50% dropout, and a single sigmoid output unit.
model = Sequential(
    [
        InputLayer(input_shape=(train_x.shape[1],)),
        normalizer,
        Dense(11, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.5),
        Dense(5, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ]
)

model.summary()

In [None]:
# Inverse-time learning-rate decay starting at 0.001. decay_steps is the
# number of 270-row batches in one pass over train_x, multiplied by 1000.
batches_per_pass = train_x.shape[0] / 270
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=batches_per_pass * 1000,
    decay_rate=1,
    staircase=False,
)

# Adam driven by the schedule above; binary cross-entropy loss with accuracy
# reported as the metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Train on the training split and evaluate on the validation split after each
# epoch; the object returned by fit() is kept in `metrics`.
# NOTE(review): steps_per_epoch is hard-coded to 20 — presumably chosen to
# match the number of 270-row batches in train_x; confirm if the split changes.
metrics = model.fit(
    x=train_x,
    y=train_y,
    batch_size=270,
    epochs=5411,
    steps_per_epoch=20,
    validation_data=(valid_x, valid_y),
)

In [None]:
# Load the unlabelled test set and apply the same column removal and
# categorical encoding that were applied to the training frame.
test = pd.read_csv(f'{data_dir}/test.csv')
test = test.drop(columns=['Name', 'PassengerId'])
test = embed_all(test)
# NOTE(review): the category codes are derived from this frame's own values,
# so they line up with the training codes only where both frames contain the
# same set of categories — confirm for high-cardinality columns.

# Unlike the training frame, test rows cannot be dropped: every row of the
# sample submission needs a prediction. Missing numeric values would reach
# the network as NaN and yield NaN predictions, so fill them with the
# per-column medians of the training features instead.
test = test.fillna(train_x.median())

In [None]:
# Start from the sample submission so the id column and row order are the
# ones the competition expects.
submission = pd.read_csv(f'{data_dir}/sample_submission.csv')

# predict() returns one probability per row as an (n, 1) array; flatten it and
# threshold at 0.5 in a single vectorised step. This yields a boolean column
# (written to CSV as True/False) and avoids writing strings row by row into a
# float column, which is slow and an incompatible-dtype assignment in pandas.
probabilities = model.predict(test).ravel()
submission['Transported'] = probabilities > 0.5

submission.head()

In [None]:
# Write the predictions into data_dir as submission.csv, omitting the
# DataFrame index column.
submission.to_csv(f'{data_dir}/submission.csv', index=False)

In [None]:
# Upload the generated submission file to the competition via the Kaggle CLI,
# with the given submission message.
!kaggle competitions submit -c spaceship-titanic -f {data_dir}/submission.csv -m "First submission with tf."