# Final Project
## Members
#### - Michael Conner
#### - Jackson Rolando
#### - Ryan Kruk
## Dataset
#### - Spaceship Titanic - https://www.kaggle.com/competitions/spaceship-titanic/data
## Research Question
#### - Predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly using a set of personal records recovered from the ship's damaged computer system.
## Hypothesis
#### - TODO: state the hypothesis to be tested.

In [1]:
import tensorflow as tf
from tqdm import tqdm
import pandas as pd
import numpy as np

In [22]:
# Load the Spaceship Titanic training split from the local data directory
# and preview its first ten rows.
df = pd.read_csv('./data/train.csv')
df.head(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


- We could either bucket the ages or keep them as a continuous feature.
- We could add indicator variables such as "used_room_service" and "used_food_court", or bucket those spending columns as well.

### Data Preprocessing

In [95]:
# Work on a copy so the raw frame `df` is not modified by the preprocessing cells.
train = df.copy()

Split Passenger ID into the group and number within that group.

In [96]:
# A PassengerId has the form "<group>_<number>". Break every id on the
# underscore and transpose the pieces into one tuple per component.
passenger_ids = train['PassengerId']
id_parts = (pid.split('_') for pid in passenger_ids)
group, num = zip(*id_parts)

# Both components are kept as strings, exactly as they appear in the id.
train['PassengerId_Group'], train['PassengerId_Num'] = group, num

Split Cabin into the deck, number, and side.

In [97]:
def _cabin_parts(cabin):
    """Return the (deck, num, side) pieces of a 'deck/num/side' cabin string.

    A cabin whose type is exactly `float` (the missing-value marker) is not
    split; the same object is repeated in all three positions instead.
    """
    if type(cabin) is float:
        return (cabin, cabin, cabin)
    return cabin.split('/')


cabins = train['Cabin']
# Transpose the per-row triples into three parallel tuples.
deck, num, side = zip(*map(_cabin_parts, cabins))
train['Cabin_Deck'], train['Cabin_Num'], train['Cabin_Side'] = deck, num, side

Split Name into first and last name.

In [98]:
def _name_parts(name):
    """Split a full name on single spaces.

    A name whose type is exactly `float` (the missing-value marker) is not
    split; the same object is returned for both positions instead.
    """
    if type(name) is float:
        return (name, name)
    return name.split(' ')


names = train['Name']
# NOTE(review): the two-way unpacking assumes every name splits into at least
# two space-separated tokens -- confirm against the data.
first, last = zip(*map(_name_parts, names))
train['FirstName'], train['LastName'] = first, last

In [74]:
# Columns that are integer-encoded and then one-hot encoded further down.
# NOTE(review): 'Cabin' has thousands of distinct values (its integer codes run
# past 6500 in the preprocessed frame), so one-hot encoding it produces a very
# wide block; 'Cabin_Deck' / 'Cabin_Side' may be sufficient on their own.
categorical_columns = [
    'HomePlanet',
    'CryoSleep',
    'Cabin',
    'Destination',
    'VIP',
    'Cabin_Deck',
    'Cabin_Side',
]

In [75]:
# Replace missing values in the categorical columns before they are encoded.
train['HomePlanet'] = train['HomePlanet'].fillna('Unknown')
train['CryoSleep'] = train['CryoSleep'].fillna(False)
train['Destination'] = train['Destination'].fillna('TRAPPIST-1e')
# Fill with the boolean False, not the string 'False': the string would become
# a third VIP category next to False/True and add a spurious one-hot column.
train['VIP'] = train['VIP'].fillna(False)
# NOTE(review): 'Cabin', 'Cabin_Deck' and 'Cabin_Side' are also listed in
# `categorical_columns` but are not filled here, so their missing values reach
# the encoding step as NaN.

In [76]:
# Factorize each categorical column in turn.
# NOTE(review): `codes` and `categories` are overwritten on every iteration and
# `train` is not modified, so only the last column's result survives this cell;
# the encoding that is actually applied is built in the following cells.
for cat in categorical_columns:
    column_values = train[cat]
    codes, categories = pd.factorize(column_values)

In [77]:
# Per-column lookup tables between raw category values and integer codes.
# Codes follow the order in which values first appear in the column.
categories_to_int = {}
int_to_categories = {}
for cat in categorical_columns:
    distinct_values = train[cat].unique()
    # value -> code
    categories_to_int[cat] = {value: code for code, value in enumerate(distinct_values)}
    # code -> value (the inverse table)
    int_to_categories[cat] = {code: value for code, value in enumerate(distinct_values)}

In [78]:
# Overwrite every categorical column with its integer codes.
# The columns are replaced in place, so this cell expects `train` to still
# hold the raw category values that `categories_to_int` was built from.
for cat in categorical_columns:
    lookup = categories_to_int[cat]
    train[cat] = train[cat].apply(lambda value: lookup[value])

In [79]:
# One-hot encode each integer-coded column: indexing an identity matrix by the
# codes selects, for every sample, the row with a single 1 at that code.
ohe_arrs = {
    cat: np.eye(len(train[cat].unique()))[train[cat].to_numpy()]
    for cat in categorical_columns
}

In [80]:
# Numeric columns that are scaled and fed to the model as continuous features.
continuous_columns = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [81]:
# Fill missing values with 0 and scale each continuous column by its maximum.
for col in continuous_columns:
    # `fillna` returns a new Series; the result has to be kept, otherwise the
    # NaNs stay in `train`.
    filled = train[col].fillna(0)
    col_max = filled.max()
    # Skip the division when the maximum is 0 so an all-zero column does not
    # turn into NaN (0 / 0).
    train[col] = filled / col_max if col_max else filled

In [82]:
# Feature matrix: the one-hot blocks of all categorical columns followed by
# one column per continuous feature, stacked side by side.
categorical_features = np.hstack([ohe_arrs[cat] for cat in categorical_columns])
continuous_features = np.hstack([
    # nan_to_num guards against any NaN left in a continuous column;
    # expand_dims turns each 1-D column into an (n, 1) block for hstack.
    np.expand_dims(np.nan_to_num(train[col]), axis=1) for col in continuous_columns
])
samples = np.hstack([categorical_features, continuous_features])

# Labels are a flat 0/1 vector. The earlier one-hot `np.eye(2)[...]` version
# was immediately overwritten by this assignment, so it has been dropped.
labels = train['Transported'].astype(int).to_numpy()

In [83]:
# Show the preprocessed frame (the rendered output elides the middle rows).
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,PassengerId_Group,PassengerId_Num,Cabin_Deck,Cabin_Num,Cabin_Side
0,0001_01,0,0,0,0,0.493671,0,0.000000,0.000000,0.000000,0.000000,0.000000,Maham Ofracculy,False,0001,01,0,0,0
1,0002_01,1,0,1,0,0.303797,0,0.007608,0.000302,0.001064,0.024500,0.001823,Juanna Vines,True,0002,01,1,0,1
2,0003_01,0,0,2,0,0.734177,1,0.003001,0.119948,0.000000,0.299670,0.002030,Altark Susent,False,0003,01,2,0,1
3,0003_02,0,0,2,0,0.417722,0,0.000000,0.043035,0.015793,0.148563,0.007997,Solam Susent,False,0003,02,2,0,1
4,0004_01,1,0,3,0,0.202532,0,0.021149,0.002348,0.006428,0.025214,0.000083,Willy Santantines,True,0004,01,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0,0,6557,2,0.518987,1,0.000000,0.228726,0.000000,0.073322,0.003066,Gravior Noxnuther,False,9276,01,2,98,0
8689,9278_01,1,1,6558,1,0.227848,0,0.000000,0.000000,0.000000,0.000000,0.000000,Kurta Mondalley,False,9278,01,3,1499,1
8690,9279_01,1,0,6559,0,0.329114,0,0.000000,0.000000,0.079687,0.000045,0.000000,Fayey Connon,True,9279,01,3,1500,1
8691,9280_01,0,0,6560,2,0.405063,0,0.000000,0.035186,0.000000,0.015753,0.134049,Celeon Hontichre,False,9280,01,5,608,1


#### Make a Model

In [None]:
# Fully connected binary classifier over the flat feature vector in `samples`.
# The input shape is passed as a tuple, the form the Keras `Input` API
# documents, rather than as a bare integer.
inputs = tf.keras.layers.Input(shape=(samples.shape[1],), dtype=float)

# Hidden stack: five ReLU layers, widening to 256 units and narrowing to 32.
x = inputs
for units in (128, 256, 128, 64, 32):
    x = tf.keras.layers.Dense(units, activation='relu')(x)

# A single sigmoid unit, matching the flat 0/1 `labels` vector and the binary
# cross-entropy loss below.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    # `metrics` is documented as a list of metrics; wrap the single metric.
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

#### Train it

In [None]:
# Train for a single epoch in mini-batches of 4 with shuffling enabled,
# reserving 10% of the data for validation.
model.fit(
    x=samples,
    y=labels,
    batch_size=4,
    epochs=1,
    shuffle=True,
    validation_split=0.1,
)