# Spaceship Titanic

In [2]:
import pandas as pd
import plotly

In [3]:
# Relative paths to the competition's training and test CSV files.
train_csv, test_csv = "../data/train.csv", "../data/test.csv"

In [4]:
# Load the training CSV into a DataFrame and preview its first rows.
train_df = pd.read_csv(train_csv)
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
train_df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [6]:
# Number of distinct values per column; dropna=False counts NaN as a value of its own.
train_df.nunique(dropna=False)

PassengerId     8693
HomePlanet         4
CryoSleep          3
Cabin           6561
Destination        4
Age               81
VIP                3
RoomService     1274
FoodCourt       1508
ShoppingMall    1116
Spa             1328
VRDeck          1307
Name            8474
Transported        2
dtype: int64

In [7]:
# How often each destination occurs (value_counts skips NaN rows by default).
train_df["Destination"].value_counts()

Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: count, dtype: int64

In [8]:
# For every column with fewer than 10 distinct values (NaN counted as a value),
# show how often each value occurs, NaN included.
cardinality = train_df.nunique(dropna=False)
for col in train_df.columns:
    if cardinality[col] >= 10:
        continue
    display(train_df[col].value_counts(dropna=False).to_frame())

Unnamed: 0_level_0,count
HomePlanet,Unnamed: 1_level_1
Earth,4602
Europa,2131
Mars,1759
,201


Unnamed: 0_level_0,count
CryoSleep,Unnamed: 1_level_1
False,5439
True,3037
,217


Unnamed: 0_level_0,count
Destination,Unnamed: 1_level_1
TRAPPIST-1e,5915
55 Cancri e,1800
PSO J318.5-22,796
,182


Unnamed: 0_level_0,count
VIP,Unnamed: 1_level_1
False,8291
,203
True,199


Unnamed: 0_level_0,count
Transported,Unnamed: 1_level_1
True,4378
False,4315


### Initial ideas
* Most columns need to be cleaned of NaN values
    * Try dropping them, or replacing them with 'None'
    * Some models can interpret these values
    * But also want to try Predictive Imputing of missing values
* How does PassengerId work?
    * Would be interesting to look at how many groups there are; split this into an extra column so that you have GroupId and PassengerId
* How does Cabin work? 
    * Would be interesting to see how the Deck, Number and Side (Port or Starboard) affected the results too.
* Should Age be made discrete?
* Should money spent on RoomService, FoodCourt, ShoppingMall, Spa, VRDeck be made discrete?
    * Do some people spend nothing?
    * Should each column be put into a range, or should the sum of the columns be put into a range?
* How many people were transported, as a percentage?


In [9]:
## "How does PassengerId work?"
# PassengerId values look like "0003_01" / "0003_02" (see the preview above).
# The part before the underscore is extracted as GroupId so passengers sharing
# that prefix can be analysed together.
# The vectorised .str accessor replaces a row-wise .apply(lambda ...), and
# .assign returns a new frame instead of mutating train_df in place.
train_df = train_df.assign(
    GroupId=train_df["PassengerId"].str.split("_").str[0]
)

desired_column_order = [
    "Name", "GroupId", "PassengerId",
    "HomePlanet", "CryoSleep", "Cabin", "Destination",
    "Age", "VIP", "RoomService", "FoodCourt",
    "ShoppingMall", "Spa", "VRDeck", "Transported",
]
train_df = train_df[desired_column_order]

# Only a cell's last expression is rendered automatically, so a bare
# `train_df.nunique(...)` in the middle of the cell is silently discarded.
# display() makes the per-column distinct counts (including GroupId, i.e. the
# number of groups) visible alongside the preview row.
display(train_df.nunique(dropna=False))
train_df.head(1)

Unnamed: 0,Name,GroupId,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Maham Ofracculy,1,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False


In [10]:
## "How does Cabin work?"
# Cabin is a "/"-separated string (e.g. "B/0/P"); split it into its three
# components and name them Deck, Number and Side.
extra_cabin_cols = (
    train_df["Cabin"]
    .str.split("/", expand=True)
    .rename(columns={0: "Deck", 1: "Number", 2: "Side"})
)

# Attach the three components to the main frame as new columns.
train_df = train_df.assign(
    **{part: extra_cabin_cols[part] for part in ("Deck", "Number", "Side")}
)

# Place the new columns directly after Cabin; everything else keeps its order.
desired_column_order = [
    "Name", "GroupId", "PassengerId", "HomePlanet", "CryoSleep",
    "Cabin", "Deck", "Number", "Side",
    "Destination", "Age", "VIP",
    "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
    "Transported",
]

train_df = train_df[desired_column_order]
train_df.head(1)

Unnamed: 0,Name,GroupId,PassengerId,HomePlanet,CryoSleep,Cabin,Deck,Number,Side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Maham Ofracculy,1,0001_01,Europa,False,B/0/P,B,0,P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False


In [11]:
## "How many people were Transported?"
# TODO: this cell is still empty. A possible starting point (untested here):
# train_df["Transported"].value_counts(normalize=True)


### Unrelated questions

* Is there a relationship between VIP status and $ spent on amenities? 
* Is there a relationship between location on the ship and Transported?

# Conversation with Matt

* Whether someone is in cryosleep or not is important -- passengers in cryosleep aren't moving around, so if they are transported it gives some indication of where the anomaly intersected the ship. 
* Determine if FoodCourt, ShoppingMall, Spa, VRDeck intersect the anomaly independently (might be in different areas of the ship)
* If a non-cryo passenger was transported, you can estimate their probable location at the time of the accident from their spending -- if they spent the majority of their money on RoomService they were probably in their cabin. 
* Could determine if the passengers who were likely to be at each activity were transported, and then these activities/areas also likely intersect the anomaly. 
* For NaN values, you are really assigning a risk score for each passenger, e.g. if you have a row with only a 10% intersection with the anomaly, and the cabin number of the passenger is not present, then they are likely a survivor. 
* For cabin number/row/side you can assign a set of values that would be valid by determining which cabins are unoccupied.
