In [1]:
import zipfile

import kaggle
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

In [2]:
# Import data files from the Kaggle API
!kaggle competitions download -c spaceship-titanic

# Unzip the data files
with zipfile.ZipFile("spaceship-titanic.zip", 'r') as zip_ref:
    zip_ref.extractall()

spaceship-titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# Read the training CSV (train.csv) into a pandas DataFrame
df_train = pd.read_csv("train.csv")

# Preview the first rows to check that the file was parsed as expected
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Dimensions of the training frame as (rows, columns)
df_train.shape

(8693, 14)

In [5]:
# Summary (count / unique / top / freq) restricted to the object-dtype columns
df_train.describe(include=[object])

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name
count,8693,8492,8476,8494,8511,8490,8493
unique,8693,3,2,6560,3,2,8473
top,0001_01,Earth,False,G/734/S,TRAPPIST-1e,False,Gollux Reedall
freq,1,4602,5439,8,5915,8291,2


In [6]:
# Count the null entries in every column
missing_values_count = df_train.isna().sum()

missing_values_count

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

## Home Planet column cleaning

Categorical feature

In [7]:
# Number of passengers per home planet (missing values are not counted)
df_train["HomePlanet"].value_counts()

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64

In [8]:
# Mean of the boolean Transported flag (i.e. the transported share) per home
# planet, sorted from the highest share to the lowest
(
    df_train[['HomePlanet', 'Transported']]
    .groupby(['HomePlanet'], as_index=False)
    .mean()
    .sort_values('Transported', ascending=False)
)

Unnamed: 0,HomePlanet,Transported
1,Europa,0.658846
2,Mars,0.523024
0,Earth,0.423946


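The share of transported passengers differs noticeably between home planets (Europa ≈ 0.66, Mars ≈ 0.52, Earth ≈ 0.42), which suggests that HomePlanet is informative about the target. We therefore include this feature in our model.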

In [9]:
# Encode HomePlanet as a binary indicator: Europa -> 0, Earth or Mars -> 1.
# NOTE(review): Earth and Mars share the same code, so they can no longer be
# told apart after this cell -- confirm that merging them is intentional.
home_planet_codes = {'Europa': 0, 'Earth': 1, 'Mars': 1}

# Cast to float explicitly instead of relying on `replace` to downcast the
# object column implicitly (deprecated in pandas 2.2). Missing values stay NaN,
# and the column is guaranteed to be numeric for the later numeric-only selection.
df_train['HomePlanet'] = df_train['HomePlanet'].replace(home_planet_codes).astype(float)
df_train['HomePlanet'].value_counts()

1.0    6361
0.0    2131
Name: HomePlanet, dtype: int64

## VIP column cleaning

Categorical feature

In [10]:
# Number of passengers per VIP status (missing values are not counted)
df_train["VIP"].value_counts()

False    8291
True      199
Name: VIP, dtype: int64

In [11]:
# Mean of the boolean Transported flag (i.e. the transported share) per VIP
# status, sorted from the highest share to the lowest
(
    df_train[['VIP', 'Transported']]
    .groupby(['VIP'], as_index=False)
    .mean()
    .sort_values('Transported', ascending=False)
)

Unnamed: 0,VIP,Transported
0,False,0.506332
1,True,0.38191


#### Chi-square test

In [12]:
# Contingency table of VIP status (rows) against Transported (columns).
# Passengers with a missing VIP value do not appear in the table.
CrosstabResult = pd.crosstab(index=df_train['VIP'], columns=df_train['Transported'])
print(CrosstabResult)

# Chi-square test of independence on the contingency table
# (chi2_contingency is imported in the notebook's import cell).
ChiSqResult = chi2_contingency(CrosstabResult)

# H0: VIP and Transported are independent (not associated with each other).
# The p-value is the probability, assuming H0 is true, of obtaining a test
# statistic at least as extreme as the observed one; it is NOT the probability
# that H0 itself is true.
# If the p-value is > 0.05 we fail to reject H0; otherwise we reject it.

# Index 1 of the result is the p-value.
print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

Transported  False  True
VIP                     
False         4093  4198
True           123    76
The P-Value of the ChiSq Test is: 0.0006804064556968345


The p-value (≈ 0.0007) is below 0.05, so we reject H0: VIP status and Transported are statistically associated. We therefore include this feature in our model.

In [13]:
# Encode VIP as an integer flag: 1 where the value equals True, 0 otherwise.
# Missing values compare unequal to True, so they are encoded as 0 as well.
df_train['VIP'] = df_train['VIP'].eq(True).astype(int)

### Missing values

In [14]:
# Keep only the numeric columns. This discards the remaining text columns and
# also the boolean `Transported` target.
# NOTE(review): the labels are no longer part of df_train after this cell --
# confirm they are obtained separately before model training.
numeric_features = df_train.select_dtypes(include=[np.number])

# Fill each gap with its column's median. Linear interpolation would instead
# fill from the neighbouring rows, which treats independent passengers as an
# ordered series and can yield fractional values such as 0.5 for the 0/1
# HomePlanet code. The trailing dropna() preserves the "no missing values"
# guarantee for any column that could not be imputed.
df_train = numeric_features.fillna(numeric_features.median()).dropna()
df_train.shape

(8693, 8)

In [15]:
# Preview the first rows of the numeric-only, gap-free feature frame
df_train.head()

Unnamed: 0,HomePlanet,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0.0,39.0,0,0.0,0.0,0.0,0.0,0.0
1,1.0,24.0,0,109.0,9.0,25.0,549.0,44.0
2,0.0,58.0,1,43.0,3576.0,0.0,6715.0,49.0
3,0.0,33.0,0,0.0,1283.0,371.0,3329.0,193.0
4,1.0,16.0,0,303.0,70.0,151.0,565.0,2.0
