In [171]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

In [172]:
train_df = pd.read_csv('train.csv') # 8693 rows
test_df = pd.read_csv('test.csv') 
df = pd.concat([train_df, test_df], axis=0)

In [173]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [174]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12970 entries, 0 to 4276
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
 13  Transported   8693 non-null   object 
dtypes: float64(6), object(8)
memory usage: 1.5+ MB


In [175]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,12700.0,12707.0,12681.0,12664.0,12686.0,12702.0
mean,28.771969,222.897852,451.961675,174.906033,308.476904,306.789482
std,14.387261,647.596664,1584.370747,590.55869,1130.279641,1180.097223
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,77.0,29.0,57.0,42.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [176]:
df['Cabin'].fillna('N/N/N', inplace=True)

deck_num_side = df['Cabin'].apply(lambda x: x.split('/'))
df['CabinDeck'] = list(map(lambda x: x[0], deck_num_side))
df['CabinNum'] = list(map(lambda x: x[1], deck_num_side))
df['CabinSide'] = list(map(lambda x: x[2], deck_num_side))
df.drop('Cabin', axis=1, inplace=True)

In [177]:
colfillmode = ['VIP']
colfillnone = ['Destination', 'HomePlanet', 'Name']
colfillmedian = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for col in colfillmode:
    df[col].fillna(df[col].mode()[0], inplace=True)
for col in colfillnone:
    df[col].fillna('None', inplace=True)
for col in colfillmedian:
    df[col].fillna(df[col].median(), inplace=True)
# can attempt to fill destination & homeplanet w/ group
# if no money spent, more likely sleeping. if money spent, no cryo sleep
df['CryoSleep'] = df.apply(lambda row: False if row['RoomService'] > 0 or row['FoodCourt'] > 0 or row['ShoppingMall'] > 0 or row['Spa'] > 0 or row['VRDeck'] > 0 else True, axis=1)


In [178]:
# do pd.get_dummies to ohe this 
df.HomePlanet.value_counts()

Earth     6865
Europa    3133
Mars      2684
None       288
Name: HomePlanet, dtype: int64

In [179]:
df.CryoSleep.value_counts()

False    7513
True     5457
Name: CryoSleep, dtype: int64

In [180]:
gggg_pp = df['PassengerId'].apply(lambda x: x.split('_')).values
df['GGGG'] = list(map(lambda x: x[0], gggg_pp))
df['PP'] = list(map(lambda x: x[1], gggg_pp))
grouplist = {}
for i in df['GGGG']:
    if i not in grouplist:
        grouplist[i] = 1
    else:
        grouplist[i] += 1
df['GroupCount'] = df['GGGG'].apply(lambda x: grouplist[x])


In [181]:
df.drop(['GGGG', 'PP', 'PassengerId', 'Name', 'CabinNum', 'CabinDeck'], axis=1, inplace=True)

In [189]:
train = df.iloc[:8693, :]
test = df.iloc[8693:, :]
X_train = train.drop(['Transported'], axis=1)
y_train = train['Transported'].astype('int')
X_test = test.drop(['Transported'], axis=1)

In [190]:
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [191]:
clf = RandomForestClassifier()
# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# scores = cross_val_score(clf, X_train, y_train, cv=kf)
# print(scores.mean())
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

In [193]:
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Transported': preds.astype('bool')})
output.to_csv('submission.csv', index=False)