# Spaceship Titanic
## Ian Kimura

### Import Libraries

In [50]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import zscore
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

### Read the CSV

In [51]:
train = pd.read_csv('/users/iankimura/Desktop/BSDS200/SpaceshipTitanic/train.csv')
test = pd.read_csv('/users/iankimura/Desktop/BSDS200/SpaceshipTitanic/test.csv')

In [52]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [53]:
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


### Train Preprocessing

In [54]:
# split cabin into deck,num,side
train[['Deck','Num','Side']]=train['Cabin'].str.split('/',expand=True,)
train=train[['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Deck', 'Num', 'Side', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported']]

train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Deck,Num,Side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,B,0,P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,F,0,S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,A,0,S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,A,0,S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,F,1,S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [55]:
# columns that aren't necessary for the model
train=train.drop(columns=['PassengerId','Cabin','Name'])
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Deck,Num,Side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B,0,P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F,0,S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A,0,S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A,0,S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F,1,S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [56]:
# check for null
train.isnull().sum()

HomePlanet      201
CryoSleep       217
Deck            199
Num             199
Side            199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

In [57]:
# fill in null
# for categorical columns use mode and for numerical use mean
# Imputing null values

for i in train.columns:
    if train[i].dtypes in ['object','bool']:
        train[i].fillna(train[i].mode()[0],inplace=True)
    else :
        train[i].fillna(train[i].mean(),inplace=True)

train['Num']=train['Num'].astype(int)

train.isnull().sum()

HomePlanet      0
CryoSleep       0
Deck            0
Num             0
Side            0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64

In [58]:
#remove outliers
temp_df1=pd.DataFrame()

for i in train.columns:
    if train[i].dtype not in ('object','bool'):
        temp_df1[i]=train[i]

outlier_df=zscore(temp_df1)
outlier_df['abs_zscore']=outlier_df.abs().max(axis=1)

train=train[train.index.isin(outlier_df[outlier_df['abs_zscore']<=3].index)]

train.head()

Unnamed: 0,HomePlanet,CryoSleep,Deck,Num,Side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B,0,P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F,0,S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
3,Europa,False,A,0,S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F,1,S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
5,Earth,False,F,0,P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,True


In [59]:
# which columns for one hot encoding

temp_df2=pd.DataFrame()

for i in train.columns:
    if train[i].dtype in ('object','bool') and i!='Transported':
        temp_df2[i]=train[i]
        
temp_df2
        


Unnamed: 0,HomePlanet,CryoSleep,Deck,Side,Destination,VIP
0,Europa,False,B,P,TRAPPIST-1e,False
1,Earth,False,F,S,TRAPPIST-1e,False
3,Europa,False,A,S,TRAPPIST-1e,False
4,Earth,False,F,S,TRAPPIST-1e,False
5,Earth,False,F,P,PSO J318.5-22,False
...,...,...,...,...,...,...
8687,Europa,False,A,P,TRAPPIST-1e,False
8689,Earth,True,G,S,PSO J318.5-22,False
8690,Earth,False,G,S,TRAPPIST-1e,False
8691,Europa,False,E,S,55 Cancri e,False


In [60]:
# one hot encoding
enc=OneHotEncoder()
encoded=pd.DataFrame(enc.fit_transform(temp_df2).toarray(),columns=enc.get_feature_names_out())

temp_df3=pd.DataFrame()

for i in train.columns:
    if train[i].dtype not in ('object','bool') and i!='Transported':
        temp_df3[i]=train[i]
temp_df3.reset_index(inplace=True)

encoded=pd.concat([encoded,temp_df3],axis=1)

encoded.head()

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,...,VIP_False,VIP_True,index,Num,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0,0,39.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1,0,24.0,109.0,9.0,25.0,549.0,44.0
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3,0,33.0,0.0,1283.0,371.0,3329.0,193.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4,1,16.0,303.0,70.0,151.0,565.0,2.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5,0,44.0,0.0,483.0,0.0,291.0,0.0


In [61]:
# splitting into training test
# x_train: This contains the features (encoded independent variables) for the training set.
# x_test: This contains the features (encoded independent variables) for the test set.
# y_train: This contains the labels (dependent variable) corresponding to x_train.
# y_test: This contains the labels (dependent variable) corresponding to x_test.

x_train,x_test,y_train,y_test=train_test_split(encoded,train['Transported'],test_size=.2,shuffle=True,random_state=42)

In [62]:
# Modeling the Decision Tree Classifier

d_tree=DecisionTreeClassifier(max_depth=10)
d_tree.fit(x_train,y_train)
y_pred_dt=pd.DataFrame(d_tree.predict(x_test),columns=['Transported'])
print('classification report for Decision Tree Classifier \n',classification_report(y_pred_dt,y_test))

classification report for Decision Tree Classifier 
               precision    recall  f1-score   support

       False       0.77      0.74      0.75       820
        True       0.73      0.77      0.75       768

    accuracy                           0.75      1588
   macro avg       0.75      0.75      0.75      1588
weighted avg       0.75      0.75      0.75      1588



In [63]:
# Modeling the Logistic Regression

lr=LogisticRegression()
lr.fit(x_train,y_train)
y_pred_lr=pd.DataFrame(lr.predict(x_test),columns=['Transported'])
print('classification report for Logistic Regression \n',classification_report(y_pred_lr,y_test))

classification report for Logistic Regression 
               precision    recall  f1-score   support

       False       0.75      0.81      0.78       729
        True       0.82      0.77      0.80       859

    accuracy                           0.79      1588
   macro avg       0.79      0.79      0.79      1588
weighted avg       0.79      0.79      0.79      1588



In [64]:
# Modeling the Random Forest Classifier

rf=RandomForestClassifier(n_estimators=5,max_depth=10)
rf.fit(x_train,y_train)
y_pred_rf=pd.DataFrame(rf.predict(x_test),columns=['Transported'])
print('classification report for Random Forest Classifier \n',classification_report(y_pred_rf,y_test))

classification report for Random Forest Classifier 
               precision    recall  f1-score   support

       False       0.79      0.78      0.78       797
        True       0.78      0.79      0.78       791

    accuracy                           0.78      1588
   macro avg       0.78      0.78      0.78      1588
weighted avg       0.78      0.78      0.78      1588



### Test dataframe 

In [65]:
# Splitting cabin in deck,num,side in testing dataset and formatting them

test[['Deck','Num','Side']]=test['Cabin'].str.split('/',expand=True,)
test=test[['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Deck', 'Num', 'Side', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name']]

test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Deck,Num,Side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,G,3,S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,F,4,S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,C,0,S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,C,1,S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,F,5,S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [66]:
final_test = test
final_test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Deck,Num,Side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,G,3,S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,F,4,S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,C,0,S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,C,1,S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,F,5,S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [67]:
# Dropping redundant columns in testing dataset

test=test.drop(columns=['PassengerId','Cabin','Name'])
test.head()

Unnamed: 0,HomePlanet,CryoSleep,Deck,Num,Side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Earth,True,G,3,S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,F,4,S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0
2,Europa,True,C,0,S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0
3,Europa,False,C,1,S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0
4,Earth,False,F,5,S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0


In [68]:
# Checking for null valuesin testing dataset

test.isnull().sum()

HomePlanet       87
CryoSleep        93
Deck            100
Num             100
Side            100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64

In [69]:
# Imputing null values in testing dataset

for i in test.columns:
    if test[i].dtypes in ['object','bool']:
        test[i].fillna(test[i].mode()[0],inplace=True)
    else :
        test[i].fillna(test[i].mean(),inplace=True)

test['Num']=test['Num'].astype(int)
test.isnull().sum()

HomePlanet      0
CryoSleep       0
Deck            0
Num             0
Side            0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [70]:
# Identifying object columns for encoding in testing dataset

temp_test_df=pd.DataFrame()

for i in test.columns:
    if test[i].dtype in ('object','bool'):
        temp_test_df[i]=test[i]
temp_test_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Deck,Side,Destination,VIP
0,Earth,True,G,S,TRAPPIST-1e,False
1,Earth,False,F,S,TRAPPIST-1e,False
2,Europa,True,C,S,55 Cancri e,False
3,Europa,False,C,S,TRAPPIST-1e,False
4,Earth,False,F,S,TRAPPIST-1e,False


In [71]:
# Encoding the object columns for modeling

enc_test=OneHotEncoder()
enc_df_test=pd.DataFrame(enc_test.fit_transform(temp_test_df).toarray(),columns=enc_test.get_feature_names_out())

temp_test_df1=pd.DataFrame()

for i in test.columns:
    if test[i].dtype not in ('object','bool') and i!='Transported':
        temp_test_df1[i]=test[i]
temp_test_df1.reset_index(inplace=True)

enc_df_test=pd.concat([enc_df_test,temp_test_df1],axis=1)

enc_df_test=enc_df_test[enc_df_test.columns]

enc_df_test.head()

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,...,VIP_False,VIP_True,index,Num,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0,3,27.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1,4,19.0,0.0,9.0,0.0,2823.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,2,0,31.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,3,1,38.0,0.0,6652.0,0.0,181.0,585.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4,5,20.0,10.0,0.0,635.0,0.0,0.0


In [73]:
# Using Logistic Regression model to classify the testing dataset

y_pred_test_lr=pd.DataFrame(lr.predict(enc_df_test),columns=['Transported'])

final_prediction=final_test[['PassengerId']]
final_prediction['Transported']=y_pred_test_lr

final_prediction.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [74]:
# Final prediction submission

final_prediction.to_csv(r'/users/iankimura/Desktop/BSDS200/SpaceshipTitanic/submission.csv',index=False)