In [72]:
#Imports
import pandas as pd

In [73]:
# Read the three competition CSVs in one pass; the unpacking order matches
# the order of the paths below.
sample_submission, test, train = (
    pd.read_csv(csv_path)
    for csv_path in (
        './spaceship-titanic/sample_submission.csv',
        './spaceship-titanic/test.csv',
        './spaceship-titanic/train.csv',
    )
)

In [74]:
# Sanity-check the loads: print a title followed by the first rows of each frame
print("Sample Submission:", sample_submission.head(), sep="\n")
print("Test:", test.head(), sep="\n")
print("Train:", train.head(), sep="\n")

Sample Submission:
  PassengerId  Transported
0     0013_01        False
1     0018_01        False
2     0019_01        False
3     0021_01        False
4     0023_01        False
Test:
  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0013_01      Earth      True  G/3/S  TRAPPIST-1e  27.0  False   
1     0018_01      Earth     False  F/4/S  TRAPPIST-1e  19.0  False   
2     0019_01     Europa      True  C/0/S  55 Cancri e  31.0  False   
3     0021_01     Europa     False  C/1/S  TRAPPIST-1e  38.0  False   
4     0023_01      Earth     False  F/5/S  TRAPPIST-1e  20.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck              Name  
0          0.0        0.0           0.0     0.0     0.0   Nelly Carsoning  
1          0.0        9.0           0.0  2823.0     0.0    Lerome Peckers  
2          0.0        0.0           0.0     0.0     0.0   Sabih Unhearfus  
3          0.0     6652.0           0.0   181.0   585.0  Meratz Caltilter  
4     

In [75]:
# Work on copies so the frames loaded from disk stay untouched
train_clean, test_clean = train.copy(), test.copy()

In [76]:
# Inspect the simple categorical/boolean columns before simplifying (encoding) them

In [77]:
# Share of each HomePlanet value among rows where it is present (NaN rows excluded)
train_clean['HomePlanet'].value_counts(normalize=True, dropna=True)

HomePlanet
Earth     0.541922
Europa    0.250942
Mars      0.207136
Name: proportion, dtype: float64

In [78]:
# Share of each CryoSleep value among rows where it is present (NaN rows excluded)
train_clean['CryoSleep'].value_counts(normalize=True, dropna=True)

CryoSleep
False    0.641694
True     0.358306
Name: proportion, dtype: float64

In [79]:
# Share of each Destination value among rows where it is present (NaN rows excluded)
train_clean['Destination'].value_counts(normalize=True, dropna=True)

Destination
TRAPPIST-1e      0.694983
55 Cancri e      0.211491
PSO J318.5-22    0.093526
Name: proportion, dtype: float64

In [80]:
# Share of each VIP value among rows where it is present (NaN rows excluded)
train_clean['VIP'].value_counts(normalize=True, dropna=True)

VIP
False    0.976561
True     0.023439
Name: proportion, dtype: float64

In [81]:
# Count missing values per column before any imputation, largest count first
(
    train_clean
    .isna()
    .sum()
    .sort_values(ascending=False)
)

CryoSleep       217
ShoppingMall    208
VIP             203
HomePlanet      201
Name            200
Cabin           199
VRDeck          188
FoodCourt       183
Spa             183
Destination     182
RoomService     181
Age             179
PassengerId       0
Transported       0
dtype: int64

In [82]:
# Encode the boolean-like columns as integers: True -> 1, False -> 0, missing -> -1.
#
# Each column is re-derived from the untouched `train` frame rather than from
# `train_clean` itself, so re-running this cell always yields the same result
# instead of pushing already-encoded integers through the boolean mapping again.
# Mapping first and filling afterwards also avoids planting a temporary
# 'Unknown' string inside an otherwise boolean column.
#
# `Transported` is the target and has no missing values (see the NaN count
# above), so in practice it only receives the 0/1 encoding.
fill_nan_cols = ['CryoSleep', 'VIP', 'Transported']

for col in fill_nan_cols:
    train_clean[col] = (
        train[col]
        .map({True: 1, False: 0})  # missing entries stay NaN at this point
        .fillna(-1)                # -1 is the sentinel for "unknown"
        .astype('int64')           # back to integers after the NaN detour
    )

In [83]:
# One-hot encode HomePlanet and Destination. Missing values are first labelled
# 'Unknown' so they get their own indicator column; drop_first=True omits the
# first category of each column.
fill_nan_cols = ['HomePlanet', 'Destination']

train_clean = pd.get_dummies(
    train_clean.fillna({name: 'Unknown' for name in fill_nan_cols}),
    columns=fill_nan_cols,
    drop_first=True,
)

In [84]:
# Preview the first five rows after the encoding steps
train_clean.head(n=5)

Unnamed: 0,PassengerId,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Unknown,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_Unknown
0,0001_01,0,B/0/P,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,True,False,False,False,True,False
1,0002_01,0,F/0/S,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,False,False,False,False,True,False
2,0003_01,0,A/0/S,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,True,False,False,False,True,False
3,0003_02,0,A/0/S,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,True,False,False,False,True,False
4,0004_01,0,F/1/S,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,False,False,False,False,True,False


In [85]:
# Remaining missing values per column after the categorical/boolean encoding
(
    train_clean
    .isna()
    .sum()
    .sort_values(ascending=False)
)

ShoppingMall                 208
Name                         200
Cabin                        199
VRDeck                       188
FoodCourt                    183
Spa                          183
RoomService                  181
Age                          179
HomePlanet_Mars                0
Destination_TRAPPIST-1e        0
Destination_PSO J318.5-22      0
HomePlanet_Unknown             0
PassengerId                    0
HomePlanet_Europa              0
Transported                    0
CryoSleep                      0
VIP                            0
Destination_Unknown            0
dtype: int64

In [86]:
# Spend columns: replace missing amounts with 0, all five columns at once
fill_nan_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

train_clean[fill_nan_cols] = train_clean[fill_nan_cols].fillna(0)

In [87]:
# Remaining missing values per column after filling the spend columns
(
    train_clean
    .isna()
    .sum()
    .sort_values(ascending=False)
)

Name                         200
Cabin                        199
Age                          179
PassengerId                    0
Destination_TRAPPIST-1e        0
Destination_PSO J318.5-22      0
HomePlanet_Unknown             0
HomePlanet_Mars                0
HomePlanet_Europa              0
Transported                    0
VRDeck                         0
CryoSleep                      0
Spa                            0
ShoppingMall                   0
FoodCourt                      0
RoomService                    0
VIP                            0
Destination_Unknown            0
dtype: int64

In [88]:
# Impute missing ages with the median of the Age column
train_clean = train_clean.assign(
    Age=train_clean['Age'].fillna(train_clean['Age'].median())
)

In [89]:
# Split Cabin ("deck/num/side") into three columns. A missing cabin is
# replaced by a placeholder with three 'Unknown' parts so the split always
# yields exactly three pieces. The raw Cabin column is removed afterwards.
train_clean[['Deck', 'CabinNum', 'Side']] = (
    train_clean['Cabin']
    .fillna('Unknown/Unknown/Unknown')
    .str.split('/', expand=True)
)
train_clean = train_clean.drop(columns='Cabin')

In [90]:
# Remove the Name column from the working frame
train_clean = train_clean.drop(columns='Name')

In [91]:
# Preview the first five rows of the cleaned frame
train_clean.head(n=5)

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Unknown,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_Unknown,Deck,CabinNum,Side
0,0001_01,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,True,False,False,False,True,False,B,0,P
1,0002_01,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,False,False,False,False,True,False,F,0,S
2,0003_01,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,True,False,False,False,True,False,A,0,S
3,0003_02,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,True,False,False,False,True,False,A,0,S
4,0004_01,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,False,False,False,False,True,False,F,1,S


In [92]:
# Final check: missing values per column after all cleaning steps
(
    train_clean
    .isna()
    .sum()
    .sort_values(ascending=False)
)

PassengerId                  0
HomePlanet_Europa            0
CabinNum                     0
Deck                         0
Destination_Unknown          0
Destination_TRAPPIST-1e      0
Destination_PSO J318.5-22    0
HomePlanet_Unknown           0
HomePlanet_Mars              0
Transported                  0
CryoSleep                    0
VRDeck                       0
Spa                          0
ShoppingMall                 0
FoodCourt                    0
RoomService                  0
VIP                          0
Age                          0
Side                         0
dtype: int64