# Space Titanic

https://www.youtube.com/watch?v=BV03sQ0srcU

## Data Preprocess

### 1. Imports and load data

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer 

In [31]:
df_train = pd.read_csv("data\\spaceship_titanic\\train.csv")
df_test = pd.read_csv("data\\spaceship_titanic\\test.csv")

In [32]:
df_train.iloc[0:3]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False


In [33]:
df_test.iloc[0:3]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus


### 2. Adding the prediction collumn into the test df_train

In [34]:
df_test['Transported'] = False

In [36]:
df_test.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,False
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,False


### 3. Check that the train and test data are in the same format like has the same features and such.

In [45]:
print(pd.concat([df_train,df_test]).shape[0] == df_test.shape[0] + df_train.shape[0])

True


### 4. Identify all of the **None** Values 

In [47]:
df = pd.concat([df_train,df_test])

In [50]:
df.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
Transported       0
dtype: int64

### 5. Filled in the Nan Value In Cabin

In [54]:
df[['Deck','Num','Side']] = df['Cabin'].str.split('/', expand = True)

In [None]:
df = df.drop(columns='Cabin')

In [58]:
df.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S


In [59]:
df.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
Transported       0
Deck            299
Num             299
Side            299
dtype: int64

In [60]:
df["Deck"] = df["Deck"].fillna('U')

In [74]:
df["Deck"].value_counts()

Deck
F    4239
G    3781
E    1323
B    1141
C    1102
D     720
A     354
U     299
T      11
Name: count, dtype: int64

In [65]:
df["HomePlanet"].value_counts()

HomePlanet
Earth     6865
Europa    3133
Mars      2684
Name: count, dtype: int64

In [66]:
df.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
Transported       0
Deck              0
Num             299
Side            299
dtype: int64

In [68]:
df['Num'] = df['Num'].fillna(-1)
df["Side"] = df["Side"].fillna('U')

In [69]:
df.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
Transported       0
Deck              0
Num               0
Side              0
dtype: int64

### 6. Label Encoding new columns

In [75]:
df['Deck'] = df["Deck"].map({'G':0 , "F":1, "E":2, "D":3, "C":4, "B":5, "A":6, "U":7, "T":8})
df["Side"] = df["Side"].map({'U':-1 , "P":1, "S":2})

### 7. Drop Name column, since it will not give us any values 

In [81]:
df.drop(columns="Name", axis= 1, inplace=True)

In [89]:
impute_list = ['Age','VIP','RoomService' ,'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported','Deck' ,'Num', 'Side']
rest = list(set(df.columns)-set(impute_list))

In [99]:
df_rest = df[rest]
imp = KNNImputer(n_neighbors= 5)
df_imputed = imp.fit_transform(df[impute_list])
df_imputed.shape

(12970, 11)

In [100]:
df_imputed = pd.DataFrame(df_imputed, columns = impute_list)

In [101]:
df_imputed

Unnamed: 0,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0
1,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1.0,1.0,0.0,2.0
2,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,6.0,0.0,2.0
3,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,6.0,0.0,2.0
4,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
12965,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1496.0,2.0
12966,42.0,0.0,0.0,847.0,17.0,10.0,144.0,0.0,7.0,-1.0,-1.0
12967,15.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,296.0,1.0
12968,30.2,0.0,0.0,2680.0,0.0,0.0,523.0,0.0,3.0,297.0,1.0


In [None]:
df = pd.concat([df_imputed.reset_index(drop=True), df_rest], axis=1)
df

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [104]:
df_rest

Unnamed: 0,CryoSleep,Destination,PassengerId,HomePlanet
0,False,TRAPPIST-1e,0001_01,Europa
1,False,TRAPPIST-1e,0002_01,Earth
2,False,TRAPPIST-1e,0003_01,Europa
3,False,TRAPPIST-1e,0003_02,Europa
4,False,TRAPPIST-1e,0004_01,Earth
...,...,...,...,...
4272,True,TRAPPIST-1e,9266_02,Earth
4273,False,TRAPPIST-1e,9269_01,Earth
4274,True,55 Cancri e,9271_01,Mars
4275,False,,9273_01,Europa


In [105]:
df_imputed

Unnamed: 0,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0
1,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1.0,1.0,0.0,2.0
2,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,6.0,0.0,2.0
3,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,6.0,0.0,2.0
4,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
12965,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1496.0,2.0
12966,42.0,0.0,0.0,847.0,17.0,10.0,144.0,0.0,7.0,-1.0,-1.0
12967,15.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,296.0,1.0
12968,30.2,0.0,0.0,2680.0,0.0,0.0,523.0,0.0,3.0,297.0,1.0


In [1]:
import tensorflow as tf




In [2]:
import tensorflow_decision_forests as tfdf

ModuleNotFoundError: No module named 'tensorflow_decision_forests'