# Spaceship Titanic Competition
### By: MaryGrace Kane

In [125]:
import pandas as pd
import numpy as np
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


### Checking which columns have null values

In [126]:
train_data.isnull().any()

PassengerId     False
HomePlanet       True
CryoSleep        True
Cabin            True
Destination      True
Age              True
VIP              True
RoomService      True
FoodCourt        True
ShoppingMall     True
Spa              True
VRDeck           True
Name             True
Transported     False
dtype: bool

### Splitting X & y

In [127]:
X = train_data.drop('Transported', axis=1)
y = train_data['Transported']

### Splitting Cabin Column into 3 Columns

In [137]:
X[['deck', 'cabin_num', 'side']] = X_train['Cabin'].str.split('/', expand=True)
X.drop('Cabin', axis=1, inplace=True)

KeyError: "['Cabin'] not found in axis"

### Splitting into X_train, y_train, X_valid, y_valid

In [138]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 1)

### Separating Numerical & Categorical Columns

In [139]:
cols_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
cols_missing_cat = [col for col in X_train.columns if (X_train[col].dtypes == object) & (col in cols_missing)]
cols_missing_num = [col for col in X_train.columns if (X_train[col].dtypes != object) & (col in cols_missing)]

### Imputing Numerical Columns

In [140]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()

imputer_X_train = pd.DataFrame(imputer.fit_transform(X_train[cols_missing_num]))
imputer_X_train.columns = cols_missing_num

imputer_X_valid = pd.DataFrame(imputer.transform(X_valid[cols_missing_num]))
imputer_X_valid.columns = cols_missing_num

### Checking Number of Unique Values per Categorical Column

In [141]:
object_nunique = list(map(lambda col: X_train[col].nunique(), cols_missing_cat))
d = dict(zip(cols_missing_cat, object_nunique))

sorted(d.items(), key=lambda x: x[1])

[('CryoSleep', 2),
 ('VIP', 2),
 ('side', 2),
 ('HomePlanet', 3),
 ('Destination', 3),
 ('deck', 8),
 ('cabin_num', 1714),
 ('Name', 6358)]

### Removing categorical columns that can't be imputed

In [142]:
cols_missing_cat.remove('Name')

In [143]:
cols_missing_cat

['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'cabin_num', 'side']

### OneHotEncoding Categorical Columns

In [144]:
from sklearn.preprocessing import OneHotEncoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[cols_missing_cat]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[cols_missing_cat]))

OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

OH_cols_train.columns = OH_encoder.get_feature_names_out()
OH_cols_valid.columns = OH_encoder.get_feature_names_out()

# num_X_train = X_train.drop(cols_missing_cat, axis=1)
# num_X_valid = X_valid.drop(cols_missing_cat, axis=1)

### Concatenating Imputed & OneHotEncoded Dataframe

In [145]:
OH_X_train = pd.concat([imputer_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([imputer_X_valid, OH_cols_valid], axis=1)

In [146]:
OH_X_train

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_nan,...,cabin_num_993,cabin_num_994,cabin_num_995,cabin_num_997,cabin_num_998,cabin_num_999,cabin_num_nan,side_P,side_S,side_nan
0,28.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,12.0,0.0,0.0,0.0,0.0,308.308127,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,27.0,994.0,0.0,26.0,1.0,0.000000,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,40.0,0.0,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6542,,,,,,,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8093,,,,,,,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8444,,,,,,,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7751,,,,,,,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [147]:
imputer_X_train

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,28.0,0.000000,0.0,0.000000,0.0,0.000000
1,12.0,0.000000,0.0,0.000000,0.0,308.308127
2,27.0,994.000000,0.0,26.000000,1.0,0.000000
3,2.0,0.000000,0.0,0.000000,0.0,0.000000
4,40.0,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...
6514,38.0,886.000000,0.0,618.000000,0.0,0.000000
6515,39.0,0.000000,261.0,170.119792,1831.0,1797.000000
6516,24.0,229.216969,32.0,8.000000,0.0,0.000000
6517,47.0,0.000000,0.0,0.000000,0.0,0.000000
