In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels as sm
import seaborn as sns

sns.set_theme(style="whitegrid")

In [36]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


## Cabins

In [37]:
df[['Deck', 'Num', 'Side']] = df['Cabin'].str.extract(r'^(\w)/(\d+)/(\w)$')
df['Deck'] = df['Deck'].fillna('0')
df['Num'] = df['Num'].fillna('0').astype(int)
df['Side'] = df['Side'].fillna(df['Side'].mode())
df['RoomMates'] = df.groupby('Cabin')['Cabin'].transform('count').fillna(0).astype(int)
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side,RoomMates
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,2
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,A,98,P,1
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,G,1499,S,1
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,G,1500,S,1
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,E,608,S,2


## Category Encoding

In [38]:
from sklearn.preprocessing import LabelEncoder

planet_encoder = LabelEncoder()
dest_encoder = LabelEncoder()
deck_encoder = LabelEncoder()
side_encoder = LabelEncoder()
df['HomePlanet'] = planet_encoder.fit_transform(df['HomePlanet'].fillna(df['HomePlanet'].mode()[0]))
df['CryoSleep'] = df['CryoSleep'].fillna(df['CryoSleep'].mode()[0]).astype(int)
df['Destination'] = dest_encoder.fit_transform(df['Destination'].fillna(df['Destination'].mode()[0]))
df['VIP'] = df['VIP'].fillna(df['VIP'].mode()[0]).astype(int)
df['Transported'] = df['Transported'].fillna(df['Transported'].mode()[0]).astype(int)
df['Deck'] = deck_encoder.fit_transform(df['Deck'])
df['Side'] = side_encoder.fit_transform(df['Side'])
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side,RoomMates
0,0001_01,1,0,B/0/P,2,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,2,0,0,1
1,0002_01,0,0,F/0/S,2,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,6,0,1,1
2,0003_01,1,0,A/0/S,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,1,0,1,2
3,0003_02,1,0,A/0/S,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,1,0,1,2
4,0004_01,0,0,F/1/S,2,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,6,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,1,0,A/98/P,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,1,98,0,1
8689,9278_01,0,1,G/1499/S,1,18.0,0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,7,1499,1,1
8690,9279_01,0,0,G/1500/S,2,26.0,0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,7,1500,1,1
8691,9280_01,1,0,E/608/S,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,5,608,1,2


## Age

In [39]:
df['Age'] = df['Age'].fillna(df['Age'].mean()).astype(int)
df['AgeRange'] = pd.cut(df['Age'], bins=[0, 20, 50, 100], include_lowest=True, labels=[0, 1, 2])
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side,RoomMates,AgeRange
0,0001_01,1,0,B/0/P,2,39,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,2,0,0,1,1
1,0002_01,0,0,F/0/S,2,24,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,6,0,1,1,1
2,0003_01,1,0,A/0/S,2,58,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,1,0,1,2,2
3,0003_02,1,0,A/0/S,2,33,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,1,0,1,2,1
4,0004_01,0,0,F/1/S,2,16,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,6,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,1,0,A/98/P,0,41,1,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,1,98,0,1,1
8689,9278_01,0,1,G/1499/S,1,18,0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,7,1499,1,1,0
8690,9279_01,0,0,G/1500/S,2,26,0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,7,1500,1,1,1
8691,9280_01,1,0,E/608/S,0,32,0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,5,608,1,2,1


## Family

In [40]:
df['FamilySize'] = df['PassengerId'].str.extract(r'^(\d{4})').groupby(0)[0].transform('count')
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side,RoomMates,AgeRange,FamilySize
0,0001_01,1,0,B/0/P,2,39,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,2,0,0,1,1,1
1,0002_01,0,0,F/0/S,2,24,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,6,0,1,1,1,1
2,0003_01,1,0,A/0/S,2,58,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,1,0,1,2,2,2
3,0003_02,1,0,A/0/S,2,33,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,1,0,1,2,1,2
4,0004_01,0,0,F/1/S,2,16,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,6,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,1,0,A/98/P,0,41,1,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,1,98,0,1,1,1
8689,9278_01,0,1,G/1499/S,1,18,0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,7,1499,1,1,0,1
8690,9279_01,0,0,G/1500/S,2,26,0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,7,1500,1,1,1,1
8691,9280_01,1,0,E/608/S,0,32,0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,5,608,1,2,1,2


## Shopping

In [41]:
df['RoomService'] = np.log10(df['RoomService'].fillna(0) + 1)
df['FoodCourt'] = np.log10(df['FoodCourt'].fillna(0) + 1)
df['ShoppingMall'] = np.log10(df['ShoppingMall'].fillna(0) + 1)
df['Spa'] = np.log10(df['Spa'].fillna(0) + 1)
df['VRDeck'] = np.log10(df['VRDeck'].fillna(0) + 1)
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side,RoomMates,AgeRange,FamilySize
0,0001_01,1,0,B/0/P,2,39,0,0.000000,0.000000,0.000000,0.000000,0.000000,Maham Ofracculy,0,2,0,0,1,1,1
1,0002_01,0,0,F/0/S,2,24,0,2.041393,1.000000,1.414973,2.740363,1.653213,Juanna Vines,1,6,0,1,1,1,1
2,0003_01,1,0,A/0/S,2,58,1,1.643453,3.553519,0.000000,3.827111,1.698970,Altark Susent,0,1,0,1,2,2,2
3,0003_02,1,0,A/0/S,2,33,0,0.000000,3.108565,2.570543,3.522444,2.287802,Solam Susent,0,1,0,1,2,1,2
4,0004_01,0,0,F/1/S,2,16,0,2.482874,1.851258,2.181844,2.752816,0.477121,Willy Santantines,1,6,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,1,0,A/98/P,0,41,1,0.000000,3.833784,0.000000,3.215902,1.875061,Gravior Noxnuther,0,1,98,0,1,1,1
8689,9278_01,0,1,G/1499/S,1,18,0,0.000000,0.000000,0.000000,0.000000,0.000000,Kurta Mondalley,0,7,1499,1,1,0,1
8690,9279_01,0,0,G/1500/S,2,26,0,0.000000,0.000000,3.272538,0.301030,0.000000,Fayey Connon,1,7,1500,1,1,1,1
8691,9280_01,1,0,E/608/S,0,32,0,0.000000,3.021189,0.000000,2.549003,3.510009,Celeon Hontichre,0,5,608,1,2,1,2


## Remove Columns

In [42]:
dff = df.drop(columns=['PassengerId', 'Cabin', 'Age', 'Name'])
dff

Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side,RoomMates,AgeRange,FamilySize
0,1,0,2,0,0.000000,0.000000,0.000000,0.000000,0.000000,0,2,0,0,1,1,1
1,0,0,2,0,2.041393,1.000000,1.414973,2.740363,1.653213,1,6,0,1,1,1,1
2,1,0,2,1,1.643453,3.553519,0.000000,3.827111,1.698970,0,1,0,1,2,2,2
3,1,0,2,0,0.000000,3.108565,2.570543,3.522444,2.287802,0,1,0,1,2,1,2
4,0,0,2,0,2.482874,1.851258,2.181844,2.752816,0.477121,1,6,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,0,1,0.000000,3.833784,0.000000,3.215902,1.875061,0,1,98,0,1,1,1
8689,0,1,1,0,0.000000,0.000000,0.000000,0.000000,0.000000,0,7,1499,1,1,0,1
8690,0,0,2,0,0.000000,0.000000,3.272538,0.301030,0.000000,1,7,1500,1,1,1,1
8691,1,0,0,0,0.000000,3.021189,0.000000,2.549003,3.510009,0,5,608,1,2,1,2


## Test Predictions

In [43]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(dff, test_size=.2)
X_train_df = df_train.drop(columns=['Transported'])
y_train_df = df_train[['Transported']]
X_test_df = df_test.drop(columns=['Transported'])
y_test_df = df_test[['Transported']]

In [44]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(max_depth=100, n_estimators=200, n_jobs=-1)
forest.fit(X_train_df, y_train_df)

  forest.fit(X_train_df, y_train_df)


In [33]:
forest.score(X_test_df, y_test_df)

0.8004600345025877

In [14]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

tree = DecisionTreeClassifier(max_depth=100)
tree.fit(X_train_df, y_train_df)

In [15]:
tree.score(X_test_df, y_test_df)

0.7326049453709028

In [16]:
from sklearn.linear_model import LogisticRegressionCV

lr = LogisticRegressionCV(Cs=100, tol=1e-6, max_iter=10000, solver='liblinear', n_jobs=-1, verbose=True)
lr.fit(X_train_df, y_train_df.to_numpy().ravel())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


iter  1 act 2.226e-04 pre 2.225e-04 delta 7.228e-05 f 3.856e-01 |g| 6.159e+00 CG   1
cg reaches trust region boundary
iter  1 act 1.459e-04 pre 1.459e-04 delta 5.857e-05 f 3.856e-01 |g| 4.982e+00 CG   1
iter  2 act 1.625e-05 pre 1.625e-05 delta 2.891e-04 f 3.854e-01 |g| 2.257e-01 CG   1
cg reaches trust region boundary
iter  2 act 1.304e-05 pre 1.304e-05 delta 2.343e-04 f 3.855e-01 |g| 2.256e-01 CG   1
cg reaches trust region boundary
iter  3 act 6.346e-05 pre 6.346e-05 delta 1.157e-03 f 3.854e-01 |g| 2.443e-01 CG   2
cg reaches trust region boundary
cg reaches trust region boundary
iter  3 act 5.234e-05 pre 5.234e-05 delta 9.372e-04 f 3.854e-01 |g| 2.781e-01 CG   2
iter  4 act 2.610e-04 pre 2.610e-04 delta 4.626e-03 f 3.853e-01 |g| 5.851e-01 CG   2
cg reaches trust region boundary
cg reaches trust region boundary
iter  5 act 1.017e-03 pre 1.017e-03 delta 1.850e-02 f 3.850e-01 |g| 2.422e-01 CG   2
iter  4 act 2.107e-04 pre 2.107e-04 delta 3.749e-03 f 3.854e-01 |g| 3.841e-01 CG   2
cg r

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   10.5s remaining:   15.8s


iter 18 act 3.758e+02 pre 3.748e+02 delta 7.194e-01 f 2.302e+07 |g| 3.904e+04 CG  17
iter 19 act 1.360e+00 pre 1.360e+00 delta 7.194e-01 f 2.302e+07 |g| 5.844e+03 CG   6
iter 20 act 2.133e-01 pre 2.133e-01 delta 7.194e-01 f 2.302e+07 |g| 5.893e+02 CG  16
iter  1 act 1.510e+04 pre 1.510e+04 delta 5.979e-05 f 3.856e+07 |g| 5.053e+08 CG   1
cg reaches trust region boundary
iter  2 act 1.310e+03 pre 1.310e+03 delta 2.392e-04 f 3.854e+07 |g| 2.218e+07 CG   1
cg reaches trust region boundary
iter  3 act 5.254e+03 pre 5.254e+03 delta 9.566e-04 f 3.854e+07 |g| 2.708e+07 CG   2
cg reaches trust region boundary
iter  4 act 2.122e+04 pre 2.122e+04 delta 3.827e-03 f 3.854e+07 |g| 3.856e+07 CG   2
cg reaches trust region boundary
iter  5 act 8.425e+04 pre 8.425e+04 delta 1.531e-02 f 3.852e+07 |g| 2.702e+07 CG   2
cg reaches trust region boundary
iter  6 act 3.308e+05 pre 3.308e+05 delta 6.122e-02 f 3.843e+07 |g| 3.817e+07 CG   2
cg reaches trust region boundary
iter  7 act 1.223e+06 pre 1.223e+06 d

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.8s finished


In [17]:
lr.score(X_test_df, y_test_df)

0.7780333525014376

In [14]:
from sklearn.svm import SVC

svc = SVC(tol=1e-6, max_iter=100000, verbose=True)
svc.fit(X_train_df, y_train_df.to_numpy().ravel())

[LibSVM]....*
optimization finished, #iter = 4253
obj = -6413.951573, rho = -0.181695
nSV = 6519, nBSV = 6512
Total nSV = 6519


In [15]:
svc.score(X_test_df, y_test_df)

0.5612420931569868

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=100), n_estimators=100)
ada.fit(X_train_df, y_train_df.to_numpy().ravel())

In [17]:
ada.score(X_test_df, y_test_df)

0.7527314548591144