# Data Preparation

In [1]:
import numpy as np
import pandas as pd

## Load CSV files and any other tools

In [2]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
tfMap = { np.nan: -1, False: 0, True: 1 }

df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


## Mapping HomePlanet

In [3]:
homePlanetMap = { np.nan: 0, 'Earth': 1, 'Mars': 2, 'Europa': 3 }
# Maps all HomePlanet options into their respective values
df_train['HomePlanet'] = [homePlanetMap[i] for i in df_train['HomePlanet']]
df_test['HomePlanet'] = [homePlanetMap[i] for i in df_test['HomePlanet']]

## Mapping CryoSleep

In [4]:
df_train['CryoSleep'] = [tfMap[i] for i in df_train['CryoSleep']]
df_test['CryoSleep'] = [tfMap[i] for i in df_test['CryoSleep']]

## Splitting Cabin into Deck, Num, and Side + Mapping 

In [5]:
deckDict = { 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8 }
sideDict = { 'P': 1, 'S': 2 }

deckList = []
numList = []
sideList = []

def sortCabinInfo(cabin):
    if cabin is not np.nan:
        info = cabin.split('/')
        deckList.append(deckDict[info[0]])
        numList.append(int(info[1]))
        sideList.append(sideDict[info[2]])
    else:
        # Null value case
        deckList.append(0)
        numList.append(0)
        sideList.append(0)

# Map train values
for cabin in df_train['Cabin']:
    sortCabinInfo(cabin)

df_train.drop(columns='Cabin', inplace=True)
df_train.insert(3, column='Side', value=sideList)
df_train.insert(3, column='Num', value=numList)
df_train.insert(3, column='Deck', value=deckList)

# Map test values
deckList = []
numList = []
sideList = []

for cabin in df_test['Cabin']:
    sortCabinInfo(cabin)

df_test.drop(columns='Cabin', inplace=True)
df_test.insert(3, column='Side', value=sideList)
df_test.insert(3, column='Num', value=numList)
df_test.insert(3, column='Deck', value=deckList)

## Mapping Destination

In [6]:
destinationMap = { np.nan: 0, '55 Cancri e': 1, 'PSO J318.5-22': 2, 'TRAPPIST-1e': 3 }
df_train['Destination'] = [destinationMap[i] for i in df_train['Destination']]
df_test['Destination'] = [destinationMap[i] for i in df_test['Destination']]

## Mapping VIP

In [7]:
df_train['VIP'] = [tfMap[i] for i in df_train['VIP']]
df_test['VIP'] = [tfMap[i] for i in df_test['VIP']]

## Splitting Name into FirstName and LastName + Mapping

In [8]:
trainFirstNameList = []
trainLastNameList = []
testFirstNameList = []
testLastNameList = []

def getNameInfo(name, firstNameList, lastNameList):
    if name is not np.nan:
        info = name.split(' ')
        firstNameList.append(info[0])
        lastNameList.append(info[1])
    else:
        # Null value case
        firstNameList.append(np.nan)
        lastNameList.append(np.nan)

for name in df_train['Name']:
    getNameInfo(name, trainFirstNameList, trainLastNameList)

for name in df_test['Name']:
    getNameInfo(name, testFirstNameList, testLastNameList)

combinedFirstNameList = trainFirstNameList + testFirstNameList
combinedLastNameList = trainLastNameList + testLastNameList

x = list(set(combinedFirstNameList))
firstNameDict = dict(zip(x, list(range(1, len(x)+1))))
x = list(set(combinedLastNameList))
lastNameDict = dict(zip(x, list(range(1, len(x)+1))))

df_train.drop(columns='Name', inplace=True)
df_train.insert(12, column='LastName', value=[lastNameDict[i] for i in trainLastNameList])
df_train.insert(12, column='FirstName', value=[firstNameDict[i] for i in trainFirstNameList])

df_test.drop(columns='Name', inplace=True)
df_test.insert(12, column='LastName', value=[lastNameDict[i] for i in testLastNameList])
df_test.insert(12, column='FirstName', value=[firstNameDict[i] for i in testFirstNameList])

## Mapping Transported

In [9]:
df_train['Transported'] = [tfMap[i] for i in df_train['Transported']]
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Deck,Num,Side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,FirstName,LastName,Spa,VRDeck,Transported
0,0001_01,3,0,2,0,1,3,39.0,0,0.0,0.0,0.0,954,354,0.0,0.0,0
1,0002_01,1,0,6,0,2,3,24.0,0,109.0,9.0,25.0,2765,320,549.0,44.0,1
2,0003_01,3,0,1,0,2,3,58.0,1,43.0,3576.0,0.0,1279,1122,6715.0,49.0,0
3,0003_02,3,0,1,0,2,3,33.0,0,0.0,1283.0,371.0,362,1122,3329.0,193.0,0
4,0004_01,1,0,6,1,2,3,16.0,0,303.0,70.0,151.0,939,246,565.0,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,3,0,1,98,1,1,41.0,1,0.0,6819.0,0.0,107,840,1643.0,74.0,0
8689,9278_01,1,1,7,1499,2,2,18.0,0,0.0,0.0,0.0,595,1502,0.0,0.0,0
8690,9279_01,1,0,7,1500,2,3,26.0,0,0.0,0.0,1872.0,2123,1652,1.0,0.0,1
8691,9280_01,3,0,5,608,2,1,32.0,0,0.0,1049.0,0.0,772,1933,353.0,3235.0,0


## Check if there are remaining null values

In [10]:
df_train.isna().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Deck              0
Num               0
Side              0
Destination       0
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
FirstName         0
LastName          0
Spa             183
VRDeck          188
Transported       0
dtype: int64

## Fill null values

In [11]:
df_train.fillna(value=-1, inplace=True)
df_test.fillna(value=-1, inplace=True)
df_train.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Deck            0
Num             0
Side            0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
FirstName       0
LastName        0
Spa             0
VRDeck          0
Transported     0
dtype: int64

In [12]:
df_train.to_csv('data/trainClean.csv', index=False)
df_test.to_csv('data/testClean.csv', index=False)