# Spaceship Titanic Kaggle competition

## Initialisation

In [1]:
import numpy as np
import pandas as pd
import kaggle
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [2]:
# Directory, relative to the notebook, that holds the competition files.
REPO_DATA = "data"

In [3]:
# Create the data directory when it does not exist yet. `exist_ok=True` removes
# the check-then-create race, and `makedirs` also copes with nested paths.
os.makedirs(REPO_DATA, exist_ok=True)

## Load data

In [4]:
import zipfile

if not os.path.exists(os.path.join(REPO_DATA, 'train.csv')) \
or not os.path.exists(os.path.join(REPO_DATA, 'test.csv')) \
or not os.path.exists(os.path.join(REPO_DATA, 'sample_submission.csv')):
    
    !kaggle competitions download -c spaceship-titanic -p $REPO_DATA
    
    with zipfile.ZipFile(os.path.join(REPO_DATA, 'spaceship-titanic.zip'), 'r') as zip_ref:
        zip_ref.extractall(REPO_DATA)
        
df_train = pd.read_csv(os.path.join(REPO_DATA, 'train.csv'))
df_test = pd.read_csv(os.path.join(REPO_DATA, 'test.csv'))

In [5]:
# Preview the first four rows of the training set.
df_train.head(n=4)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False


In [6]:
# Preview the first four rows of the test set.
df_test.head(n=4)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter


In [7]:
# Report the dimensions of both dataframes.
n_rows_train, n_cols_train = df_train.shape
n_rows_test, n_cols_test = df_test.shape

print("There is {n_rows_train} rows and {n_cols_train} columns in the train dataframe."
      .format(n_rows_train=n_rows_train, n_cols_train=n_cols_train))
print("There is {n_rows_test} rows and {n_cols_test} columns in the test dataframe."
      .format(n_rows_test=n_rows_test, n_cols_test=n_cols_test))

There is 8693 rows and 14 columns in the train dataframe.
There is 4277 rows and 13 columns in the test dataframe.


## Preprocessing

### Split features

In [8]:
# Split PassengerId on the underscore: the first part is stored as GroupId and
# the second as PassengerGroupNumber (both kept as strings).
# The vectorised `.str` accessor replaces four copy-pasted row-wise lambdas and
# applies the exact same logic to the train and test frames.
for frame in (df_train, df_test):
    id_parts = frame["PassengerId"].str.split("_")
    frame["GroupId"] = id_parts.str[0]
    frame["PassengerGroupNumber"] = id_parts.str[1]

In [9]:
# Split Cabin on "/" into its three parts: CabinDeck, CabinNum and CabinSide.
# The `.str` accessor propagates missing cabins as missing values, so the
# per-row `pd.notna` guards and the six copy-pasted lambdas are not needed.
cabin_part_names = ["CabinDeck", "CabinNum", "CabinSide"]
for frame in (df_train, df_test):
    cabin_parts = frame["Cabin"].str.split("/")
    for position, part_name in enumerate(cabin_part_names):
        frame[part_name] = cabin_parts.str[position]

### Fill missing values

In [10]:
# Map every GroupId to the first non-missing HomePlanet found among its members,
# looking at train and test together (`GroupBy.first` skips missing values).
# The deprecated `axis=0` argument of `groupby` is dropped and the dictionary is
# built directly with `to_dict` instead of iterating over rows.
group_home_planet_mapping = (
    pd.concat([df_train, df_test])
    .groupby("GroupId")["HomePlanet"]
    .first()
    .to_dict()
)

# Fill missing home planets from the passenger's group. Known values are left
# untouched, and rows whose group has no known planet stay missing.
for frame in (df_train, df_test):
    group_planet = frame["GroupId"].map(group_home_planet_mapping)
    frame["HomePlanet"] = frame["HomePlanet"].fillna(group_planet)

In [11]:
def complete_missing_home_planet_using_deck(row):
    """Return the row's home planet, inferring it from the cabin deck when missing.

    A known ``HomePlanet`` is returned unchanged. When it is missing, decks
    A, B, C and T give ``'Europa'`` and deck G gives ``'Earth'``. For any other
    deck, or when the deck itself is missing, the original (missing) value is
    returned.
    """
    deck_to_planet = {'A': 'Europa', 'B': 'Europa', 'C': 'Europa', 'T': 'Europa', 'G': 'Earth'}

    current_planet = row.HomePlanet
    # Nothing to infer when the planet is known or the deck is unknown.
    if pd.notna(current_planet) or pd.isna(row.CabinDeck):
        return current_planet
    return deck_to_planet.get(row.CabinDeck, current_planet)

# Run the deck-based inference row by row on both datasets.
for frame in (df_train, df_test):
    frame["HomePlanet"] = frame.apply(complete_missing_home_planet_using_deck, axis=1)

In [12]:
# Passengers with an unknown CryoSleep status are treated as not in cryo-sleep.
# The explicit cast guarantees a boolean column instead of relying on the silent
# object -> bool downcast that `fillna` performs and that pandas has deprecated.
df_train["CryoSleep"] = df_train["CryoSleep"].fillna(False).astype(bool)
df_test["CryoSleep"] = df_test["CryoSleep"].fillna(False).astype(bool)

In [13]:
# Impute missing ages with the mean age computed over train and test together.
mean_age = pd.concat([df_train["Age"], df_test["Age"]]).mean()
for frame in (df_train, df_test):
    frame["Age"] = frame["Age"].fillna(mean_age)

In [14]:
# Passengers with an unknown VIP status are treated as non-VIP.
# The explicit cast guarantees a boolean column instead of relying on the silent
# object -> bool downcast that `fillna` performs and that pandas has deprecated.
df_train["VIP"] = df_train["VIP"].fillna(False).astype(bool)
df_test["VIP"] = df_test["VIP"].fillna(False).astype(bool)

In [19]:
# A missing bill is replaced by 0 for every amenity, in both datasets.
bill_columns = ["RoomService", "FoodCourt", "Spa", "ShoppingMall", "VRDeck"]
for frame in (df_train, df_test):
    frame[bill_columns] = frame[bill_columns].fillna(0)

In [20]:
# Count the missing values left in each training column after imputation.
df_train.isnull().sum()

PassengerId               0
HomePlanet               63
CryoSleep                 0
Cabin                   199
Destination             182
Age                       0
VIP                       0
RoomService               0
FoodCourt                 0
ShoppingMall              0
Spa                       0
VRDeck                    0
Name                    200
Transported               0
GroupId                   0
PassengerGroupNumber      0
CabinDeck               199
CabinNum                199
CabinSide               199
dtype: int64

### One-hot encoding

In [21]:
# One-hot encode the categorical features of both datasets.
categorical_columns = ["HomePlanet", "Destination", "CabinDeck", "CabinSide"]
df_train = pd.get_dummies(df_train, columns=categorical_columns)
df_test = pd.get_dummies(df_test, columns=categorical_columns)

# Encoding each frame on its own only creates columns for the categories that
# frame contains. Align the test columns on the train ones (target excluded) so
# both frames expose the same features: a category absent from the test set
# becomes an all-zero column, and the column order matches the training frame.
feature_columns = df_train.columns.drop("Transported", errors="ignore")
df_test = df_test.reindex(columns=feature_columns, fill_value=0)

In [23]:
# Remove the raw Cabin and Name columns and the CabinNum part from both frames.
columns_to_drop = ["Cabin", "Name", "CabinNum"]
df_train = df_train.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [24]:
# Count the missing values per training column once encoding and dropping are done.
df_train.isnull().sum()

PassengerId                  0
CryoSleep                    0
Age                          0
VIP                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
Transported                  0
GroupId                      0
PassengerGroupNumber         0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
CabinDeck_A                  0
CabinDeck_B                  0
CabinDeck_C                  0
CabinDeck_D                  0
CabinDeck_E                  0
CabinDeck_F                  0
CabinDeck_G                  0
CabinDeck_T                  0
CabinSide_P                  0
CabinSide_S                  0
dtype: int64

### Transformation

In [26]:
from scipy.stats import yeojohnson

# Replace every bill x by log(1 + x). `np.log1p` is applied to whole columns at
# once instead of calling a Python lambda per value, and the same column list is
# used for both frames so train and test cannot drift apart.
bill_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
for frame in (df_train, df_test):
    frame[bill_columns] = np.log1p(frame[bill_columns])

# Fit the Yeo-Johnson transform on the training ages only, then reuse the fitted
# lambda on the test ages so both sets go through the same mapping.
age_transformed, lmbda = yeojohnson(df_train["Age"])
df_train["Age"] = age_transformed
df_test["Age"] = yeojohnson(df_test["Age"], lmbda=lmbda)

In [27]:
# Keep only the two features used in the scaling step below.
selected_features = ["RoomService", "CryoSleep"]
X = df_train.loc[:, selected_features]

In [29]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X, then standardise X with the fitted statistics.
scaler = StandardScaler()
scaler.fit(X)
X_new = scaler.transform(X)

In [31]:
# Wrap the scaled array in a DataFrame for a readable, truncated display.
pd.DataFrame(data=X_new)

Unnamed: 0,0,1
0,-0.680981,-0.732770
1,1.344645,-0.732770
2,1.140711,-0.732770
3,-0.680981,-0.732770
4,1.535617,-0.732770
...,...,...
8688,-0.680981,-0.732770
8689,-0.680981,1.364685
8690,-0.680981,-0.732770
8691,-0.680981,-0.732770
