# Kaggle - Spaceship Titanic
DATA PREPROCESSING

In [27]:
#import libraries
import numpy as np
import pandas as pd
import sklearn 
from sklearn.model_selection import train_test_split
from random import shuffle
import os
from os import path
import time

### Preprocess data for Training and Model Selection (Train & Validation & Test)

TRAIN.SCV

In [28]:
#import dataset
X = pd.read_csv('../data/raw/train.csv')

#split dataset into train and test
X_train_input, X_test_input = train_test_split(X, test_size=0.2, random_state=42, shuffle=True)

#get parameters for normalization --> avoid data leekage
norm_paras = X_train_input.agg(['mean', 'std', 'median'])
print(norm_paras)

        PassengerId  CryoSleep        Age       VIP  RoomService    FoodCourt  \
mean            inf   0.354139  28.828093  0.022968   222.822935   456.714705   
std             NaN   0.478287  14.446399  0.149813   674.900407  1574.797221   
median     461251.0   0.000000  27.000000  0.000000     0.000000     0.000000   

        ShoppingMall          Spa       VRDeck  Transported  
mean      175.499043   315.693842   304.189769     0.503307  
std       613.969158  1118.074541  1170.639327     0.500025  
median      0.000000     0.000000     0.000000     1.000000  


  norm_paras = X_train_input.agg(['mean', 'std', 'median'])


Preprocessing pipeline

In [29]:
def preprocess(X, norm_paras=norm_paras):

    #extract y
    if 'Transported' in X.columns:
        y = X['Transported'].to_numpy()
        X = X.drop(labels="Transported", axis=1)
    else:    y = None
    
    #drop columns
    drops = ['PassengerId', 'Name', 'Cabin']
    X_reduced = X.drop(labels=drops, axis=1)

    #one-hot-encode categorical variables
    one_hot = ['HomePlanet', 'Destination']
    for col in one_hot:
        X_reduced[col].fillna('unknown', inplace=True)
    X_encoded = pd.get_dummies(X_reduced, columns=one_hot)

    #fill missing values for binary variables
    binaries = ['CryoSleep', 'VIP']
    for col in binaries:
        X_encoded[col].fillna(False, inplace=True)
        X_encoded[col] = np.int32(X_encoded[col])

    #normalize numerical variables and fill missing values
    numerical = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in numerical:
        X_encoded[col].fillna(norm_paras[col]['median'], inplace=True)
        X_encoded[col] = (X_encoded[col] - norm_paras[col]['mean']) / norm_paras[col]['std']
        X_encoded[col] = np.float32(X_encoded[col])

    #change to numpy array
    X_final = X_encoded.to_numpy()

    #info
    print("This is an extract of df after preprocessing:")
    print(f"Shape X: {X_final.shape}, Shape y: {y.shape if y is not None else None}")
    print(X_encoded.info())

    return X_final, y

In [30]:
# preprocess train.csv
X_train, y_train = preprocess(X_train_input)

This is an extract of df after preprocessing:
Shape X: (6954, 16), Shape y: (6954,)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6954 entries, 2333 to 7270
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  6954 non-null   int32  
 1   Age                        6954 non-null   float32
 2   VIP                        6954 non-null   int32  
 3   RoomService                6954 non-null   float32
 4   FoodCourt                  6954 non-null   float32
 5   ShoppingMall               6954 non-null   float32
 6   Spa                        6954 non-null   float32
 7   VRDeck                     6954 non-null   float32
 8   HomePlanet_Earth           6954 non-null   uint8  
 9   HomePlanet_Europa          6954 non-null   uint8  
 10  HomePlanet_Mars            6954 non-null   uint8  
 11  HomePlanet_unknown         6954 non-null   uint8  
 12  Destination_55 Ca

In [31]:
X_test, y_test = preprocess(X_test_input)

This is an extract of df after preprocessing:
Shape X: (1739, 16), Shape y: (1739,)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1739 entries, 304 to 6093
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  1739 non-null   int32  
 1   Age                        1739 non-null   float32
 2   VIP                        1739 non-null   int32  
 3   RoomService                1739 non-null   float32
 4   FoodCourt                  1739 non-null   float32
 5   ShoppingMall               1739 non-null   float32
 6   Spa                        1739 non-null   float32
 7   VRDeck                     1739 non-null   float32
 8   HomePlanet_Earth           1739 non-null   uint8  
 9   HomePlanet_Europa          1739 non-null   uint8  
 10  HomePlanet_Mars            1739 non-null   uint8  
 11  HomePlanet_unknown         1739 non-null   uint8  
 12  Destination_55 Can

In [32]:
# preprocess test.csv

X_prediction = pd.read_csv('../data/raw/test.csv')
X_prediction, _ = preprocess(X_prediction)

This is an extract of df after preprocessing:
Shape X: (4277, 16), Shape y: None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  4277 non-null   int32  
 1   Age                        4277 non-null   float32
 2   VIP                        4277 non-null   int32  
 3   RoomService                4277 non-null   float32
 4   FoodCourt                  4277 non-null   float32
 5   ShoppingMall               4277 non-null   float32
 6   Spa                        4277 non-null   float32
 7   VRDeck                     4277 non-null   float32
 8   HomePlanet_Earth           4277 non-null   uint8  
 9   HomePlanet_Europa          4277 non-null   uint8  
 10  HomePlanet_Mars            4277 non-null   uint8  
 11  HomePlanet_unknown         4277 non-null   uint8  
 12  Destination_55 Cancri e

In [39]:
np.info(X_test)

class:  ndarray
shape:  (1739, 16)
strides:  (8, 13912)
itemsize:  8
aligned:  True
contiguous:  False
fortran:  True
data pointer: 0x55d7d6086820
byteorder:  little
byteswap:  False
type: float64


---------------

# Save dataset --version

In [33]:
# prepare folder and info file for dataset versions

version_ID = 'v1_2'
version_note = 'dtype 32s'
timestamp = time.strftime("on %Y_%m_%d at %H_%M")

overview = pd.read_csv('../data/preprocessed/dataset_overview.csv')

if not os.path.exists(f'../data/preprocessed/{version_ID}'):
        os.mkdir(f'../data/preprocessed/{version_ID}')

#append to overview logs
overview = overview.append({'version_ID': version_ID, 'version_note': version_note, 'timestamp': timestamp}, 
                                ignore_index=True)
overview = overview[{'version_ID', 'version_note', 'timestamp'}]
overview.drop_duplicates(inplace=True)
overview.to_csv('../data/preprocessed/dataset_overview.csv')

# save preprocessed data
np.save(f'../data/preprocessed/{version_ID}/X_train.npy', X_train)
np.save(f'../data/preprocessed/{version_ID}/X_test.npy', X_test)
np.save(f'../data/preprocessed/{version_ID}/y_train.npy', y_train)
np.save(f'../data/preprocessed/{version_ID}/y_test.npy', y_test)
np.save(f'../data/preprocessed/{version_ID}/X_predict.npy', X_prediction)

  overview = overview.append({'version_ID': version_ID, 'version_note': version_note, 'timestamp': timestamp},
  overview = overview[{'version_ID', 'version_note', 'timestamp'}]


In [34]:
# see overview of existant datasets

overview = pd.read_csv('../data/preprocessed/dataset_overview.csv')
overview

Unnamed: 0.1,Unnamed: 0,timestamp,version_note,version_ID
0,0,on 2022_09_20 at 17_04,Initial version,v1_0
1,1,on 2022_09_21 at 21_10,no more booleans,v1_1
2,2,on 2022_09_21 at 21_38,dtype 32s,v1_2
3,3,on 2022_09_21 at 21_51,dtype 32s,v1_2


In [35]:
############# use to create new oversion and DELETE OLD VERSIONS #############

# overview = pd.DataFrame(columns=['version_ID', 'version_note', 'timestamp'])
# overview.to_csv('../data/preprocessed/dataset_overview.csv')