# Kaggle - Spaceship Titanic
DATA PREPROCESSING

In [60]:
#import libraries
import numpy as np
import pandas as pd
import sklearn 
from sklearn.model_selection import train_test_split
from random import shuffle
import os
from os import path
import time
import warnings

### Preprocess data for Training and Model Selection (Train & Validation & Test)

TRAIN.SCV

In [61]:
#import dataset
X = pd.read_csv('../data/raw/train.csv')

#split dataset into train and test
X_train_input, X_test_input = train_test_split(X, test_size=0.2, random_state=42, shuffle=True)

#add spendings column
'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'
X_train_input['Spendings'] = X_train_input['RoomService'] + X_train_input['FoodCourt'] + X_train_input['ShoppingMall'] + X_train_input['Spa'] + X_train_input['VRDeck']
X_test_input['Spendings'] = X_test_input['RoomService'] + X_test_input['FoodCourt'] + X_test_input['ShoppingMall'] + X_test_input['Spa'] + X_test_input['VRDeck']

#get parameters for normalization --> avoid data leekage
norm_paras = X_train_input.agg(['mean', 'std', 'median'])
print(norm_paras)

        PassengerId  CryoSleep        Age       VIP  RoomService    FoodCourt  \
mean            inf   0.354139  28.828093  0.022968   222.822935   456.714705   
std             NaN   0.478287  14.446399  0.149813   674.900407  1574.797221   
median     461251.0   0.000000  27.000000  0.000000     0.000000     0.000000   

        ShoppingMall          Spa       VRDeck  Transported    Spendings  
mean      175.499043   315.693842   304.189769     0.503307  1480.307361  
std       613.969158  1118.074541  1170.639327     0.500025  2816.389160  
median      0.000000     0.000000     0.000000     1.000000   739.000000  


Preprocessing pipeline

In [62]:
def preprocess(X, norm_paras=norm_paras):

    warnings.filterwarnings('ignore')


    #extract y
    if 'Transported' in X.columns:
        y = X['Transported'].to_numpy()
        X = X.drop(labels="Transported", axis=1)
    else:    y = None

    length = X.shape[0]
    print(f'sample size: {length}')

    #get familiy size
    X['Name'] = X['Name'].fillna('Unknown')
    X['Name'] = [name.split()[-1] for name in X['Name']]
    name_occurence = X['Name'].value_counts(dropna=False)
    X['FamilySize'] = [name_occurence[x]/20 for x in X['Name']] # divide by 20 to get values between 0 and 1

    #cabin details
    X['Cabin'] = X['Cabin'].fillna('F/X/S') # fill missing values with most common value
    X['Cabin'] = [x.split('/') for x in X['Cabin']]

    X['C_deck'] = [0 for x in range(length)] #group B/C and rest --> binary
    X['C_side'] = [0 for x in range(length)] # --> binary

    for sample in range(length):
        X['C_deck'][sample] = X['Cabin'].iloc[sample][0]
        X['C_side'][sample] = X['Cabin'].iloc[sample][2]

    X['C_deck'] = X['C_deck'].replace(['B', 'C'], 0)
    X['C_deck'] = X['C_deck'].replace(['A', 'D', 'E', 'F', 'G', 'T'], 1)
    X['C_side'] = X['C_side'].replace(['S'], 0)
    X['C_side'] = X['C_side'].replace(['P'], 1)

    #one-hot-encode categorical variables
    X['HomePlanet'].fillna('Earth', inplace=True)
    X['Destination'].fillna('TRAPPIST-1e', inplace=True)
    X = pd.get_dummies(X, columns=['HomePlanet', 'Destination'])

    #fill missing values for binary variables
    binaries = ['CryoSleep', 'VIP', 'C_deck', 'C_side']
    for col in binaries:
        X[col].fillna(False, inplace=True)
        X[col] = np.int32(X[col])

    #normalize numerical variables and fill missing values
    numerical = ['Age', 'Spendings', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in numerical:
        X[col].fillna(norm_paras[col]['median'], inplace=True)
        X[col] = (X[col] - norm_paras[col]['mean']) / norm_paras[col]['std']
        X[col] = np.float32(X[col])

    #drop columns
    drops = ['PassengerId', 'Name', 'Cabin']
    X = X.drop(labels=drops, axis=1)

    #change to numpy array
    X_final = X.to_numpy()

    #info
    print("This is an extract of df after preprocessing:")
    print(f"Shape X: {X_final.shape}, Shape y: {y.shape if y is not None else None}")
    print(X.info())

    return X_final, y

In [63]:
# preprocess train.csv
X_train, y_train = preprocess(X_train_input)

sample size: 6954
This is an extract of df after preprocessing:
Shape X: (6954, 18), Shape y: (6954,)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6954 entries, 2333 to 7270
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  6954 non-null   int32  
 1   Age                        6954 non-null   float32
 2   VIP                        6954 non-null   int32  
 3   RoomService                6954 non-null   float32
 4   FoodCourt                  6954 non-null   float32
 5   ShoppingMall               6954 non-null   float32
 6   Spa                        6954 non-null   float32
 7   VRDeck                     6954 non-null   float32
 8   Spendings                  6954 non-null   float32
 9   FamilySize                 6954 non-null   float64
 10  C_deck                     6954 non-null   int32  
 11  C_side                     6954 non-null   int32  
 12 

In [64]:
X_test, y_test = preprocess(X_test_input)

sample size: 1739
This is an extract of df after preprocessing:
Shape X: (1739, 18), Shape y: (1739,)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1739 entries, 304 to 6093
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  1739 non-null   int32  
 1   Age                        1739 non-null   float32
 2   VIP                        1739 non-null   int32  
 3   RoomService                1739 non-null   float32
 4   FoodCourt                  1739 non-null   float32
 5   ShoppingMall               1739 non-null   float32
 6   Spa                        1739 non-null   float32
 7   VRDeck                     1739 non-null   float32
 8   Spendings                  1739 non-null   float32
 9   FamilySize                 1739 non-null   float64
 10  C_deck                     1739 non-null   int32  
 11  C_side                     1739 non-null   int32  
 12  

In [65]:
# preprocess test.csv

X_prediction = pd.read_csv('../data/raw/test.csv')
X_prediction['Spendings'] = X_prediction['RoomService'] + X_prediction['FoodCourt'] + X_prediction['ShoppingMall'] + X_prediction['Spa'] + X_prediction['VRDeck']
X_prediction, _ = preprocess(X_prediction)

sample size: 4277
This is an extract of df after preprocessing:
Shape X: (4277, 18), Shape y: None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  4277 non-null   int32  
 1   Age                        4277 non-null   float32
 2   VIP                        4277 non-null   int32  
 3   RoomService                4277 non-null   float32
 4   FoodCourt                  4277 non-null   float32
 5   ShoppingMall               4277 non-null   float32
 6   Spa                        4277 non-null   float32
 7   VRDeck                     4277 non-null   float32
 8   Spendings                  4277 non-null   float32
 9   FamilySize                 4277 non-null   float64
 10  C_deck                     4277 non-null   int32  
 11  C_side                     4277 non-null   int32  
 12  HomeP

In [66]:
np.info(X_test)

class:  ndarray
shape:  (1739, 18)
strides:  (8, 13912)
itemsize:  8
aligned:  True
contiguous:  False
fortran:  True
data pointer: 0x562d80b81020
byteorder:  little
byteswap:  False
type: float64


---------------

# Save dataset --version

In [67]:
# prepare folder and info file for dataset versions

version_ID = 'v1_4'
version_note = 'name and cabin added'
timestamp = time.strftime("on %Y_%m_%d at %H_%M")

overview = pd.read_csv('../data/preprocessed/dataset_overview.csv')

if not os.path.exists(f'../data/preprocessed/{version_ID}'):
        os.mkdir(f'../data/preprocessed/{version_ID}')

#append to overview logs
overview = overview.append({'version_ID': version_ID, 'version_note': version_note, 'timestamp': timestamp}, 
                                ignore_index=True)
overview = overview[{'version_ID', 'version_note', 'timestamp'}]
overview.drop_duplicates(inplace=True)
overview.to_csv('../data/preprocessed/dataset_overview.csv')

# save preprocessed data
np.save(f'../data/preprocessed/{version_ID}/X_train.npy', X_train)
np.save(f'../data/preprocessed/{version_ID}/X_test.npy', X_test)
np.save(f'../data/preprocessed/{version_ID}/y_train.npy', y_train)
np.save(f'../data/preprocessed/{version_ID}/y_test.npy', y_test)
np.save(f'../data/preprocessed/{version_ID}/X_predict.npy', X_prediction)

In [68]:
# see overview of existant datasets

overview = pd.read_csv('../data/preprocessed/dataset_overview.csv')
overview

Unnamed: 0.1,Unnamed: 0,version_note,version_ID,timestamp
0,0,Initial version,v1_0,on 2022_09_20 at 17_04
1,1,no more booleans,v1_1,on 2022_09_21 at 21_10
2,2,dtype 32s,v1_2,on 2022_09_21 at 21_38
3,3,dtype 32s,v1_2,on 2022_09_21 at 21_51
4,4,name and cabin added,v1_3,on 2022_09_22 at 04_08
5,5,name and cabin added,v1_4,on 2022_09_22 at 04_43
6,6,name and cabin added,v1_4,on 2022_09_22 at 04_57
7,7,name and cabin added,v1_4,on 2022_09_22 at 05_02


In [69]:
############# use to create new oversion and DELETE OLD VERSIONS #############

# overview = pd.DataFrame(columns=['version_ID', 'version_note', 'timestamp'])
# overview.to_csv('../data/preprocessed/dataset_overview.csv')