In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
import warnings
import os

# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")

# Read the train and test splits of the Spaceship Titanic dataset.
df_train = pd.read_csv('/share/dutta/eyao/dataset/kaggle/spaceship-titanic/train.csv')
df_test = pd.read_csv('/share/dutta/eyao/dataset/kaggle/spaceship-titanic/test.csv')

# Stack both splits into one frame with a fresh 0..n-1 index so that feature
# engineering is applied to train and test rows together.
data = pd.concat([df_train, df_test], axis = 0).reset_index(drop = True)
data.head()

# **Data Exploration**

In [None]:
# Print a per-column summary of the combined train+test frame.
data.info()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Mean of Transported per home planet, followed by the class counts.
home_planet_rate = data.groupby('HomePlanet')['Transported'].mean()
display(home_planet_rate)
sns.countplot(data = data, x = 'HomePlanet', hue = 'Transported')

In [None]:
# Mean of Transported per CryoSleep value, followed by the class counts.
cryosleep_rate = data.groupby('CryoSleep')['Transported'].mean()
display(cryosleep_rate)
sns.countplot(data = data, x = 'CryoSleep', hue = 'Transported')

* We can see that passengers in CryoSleep have a significantly higher chance of being transported

In [None]:
# Mean of Transported per destination, followed by the class counts.
destination_rate = data.groupby('Destination')['Transported'].mean()
display(destination_rate)
sns.countplot(data = data, x = 'Destination', hue = 'Transported')

In [None]:
# Mean of Transported per VIP value, followed by the class counts.
vip_rate = data.groupby('VIP')['Transported'].mean()
display(vip_rate)
sns.countplot(data = data, x = 'VIP', hue = 'Transported')

In [None]:
# NOTE(review): this cell repeats the HomePlanet breakdown shown earlier in
# this section; it may have been meant for a different column.
home_planet_rate = data.groupby('HomePlanet')['Transported'].mean()
display(home_planet_rate)
sns.countplot(data = data, x = 'HomePlanet', hue = 'Transported')

# **Feature Engineering - Base Model**

In [None]:
# Treat a missing CryoSleep value as "not in cryosleep". The filled Series is
# assigned back instead of calling fillna(..., inplace=True) on the column
# selection: under pandas Copy-on-Write the in-place form only updates a
# temporary object and leaves `data` unchanged, and the surviving NaNs would
# then be encoded as 1 below because NaN is truthy.
data['CryoSleep'] = data['CryoSleep'].fillna(False)

# Encode the boolean columns as 0/1 for scikit-learn.
data['CryoSleep'] = data['CryoSleep'].map(lambda x: 1 if x else 0)
data['Transported'] = data['Transported'].map({True: 1, False: 0})

# Rows with a known label form the training set.
base_train = data[data['Transported'].notnull()]

# **Model Training - Base Model**

In [None]:
# Baseline: a random forest that only sees the CryoSleep flag.
base_features = ['CryoSleep']
base_model = RandomForestClassifier(
    n_estimators = 250,
    min_samples_split = 20,
    oob_score = True,
    random_state = 0,
)
base_model.fit(base_train[base_features], base_train['Transported'])
print(f"oob score: {base_model.oob_score_}")

In [None]:
# Rows without a label are the ones to predict; write them out as a submission.
base_test = data.loc[data['Transported'].isnull()]
base_predictions = base_model.predict(base_test[['CryoSleep']])
base_output = base_predictions.astype(bool)
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Transported': base_output,
})
submission.to_csv('Base_Submission.csv', index = False)

* Submitted and got a score of 72.5%
* If I add a feature and get a score lower than this, the feature is redundant or contains noise

***

# **Data Visualization - Model V1**

In [None]:
# Mark missing ages with the sentinel -20 so they appear as their own bar in
# the histogram. The result is assigned back rather than filled in place on
# the column selection: under pandas Copy-on-Write the in-place form leaves
# `data` unchanged, and the integer cast would then fail on the remaining NaNs.
data['Age'] = data['Age'].fillna(-20).astype(int)

# Age values split by outcome (1 = transported, 0 = not transported).
mask_transported = data.loc[(data['Transported'] == 1), 'Age']
mask_not_transported = data.loc[(data['Transported'] == 0), 'Age']

# Draw both histograms on the explicitly created axes instead of relying on
# matplotlib's implicit "current axes".
fig, ax = plt.subplots(figsize = (13, 5))
sns.histplot(mask_transported, kde = False, label = 'transported', bins = 10, ax = ax)
sns.histplot(mask_not_transported, kde = False, label = 'not_transported', bins = 10, ax = ax)
ax.legend()


* Passengers with a missing Age have a transported rate of approximately 50%
* Passengers under 10 have a higher chance of being transported
* Passengers over 10 have a roughly equal chance of being transported or not

# **Feature Engineering - Model V1**

In [None]:
# Bucket Age into three codes: 1 = missing (sentinel -20), 2 = ages 0-10,
# 0 = ages above 10. Rows matching no rule stay NaN, so the column is float.
age_rules = [
    data['Age'] == -20,
    (data['Age'] >= 0) & (data['Age'] <= 10),
    data['Age'] > 10,
]
data['Age_code'] = np.select(age_rules, [1, 2, 0], default = np.nan)

In [None]:
# Mean of Transported within each Age_code bucket.
display(data.groupby('Age_code')['Transported'].mean())

# **Model Training - Model V1**

In [None]:
# V1: CryoSleep plus the bucketed age feature, trained on the labelled rows.
v1_features = ['CryoSleep', 'Age_code']
V1_train = data.loc[data['Transported'].notnull()]
model_V1 = RandomForestClassifier(
    n_estimators = 250,
    min_samples_split = 20,
    oob_score = True,
    random_state = 0,
)
model_V1.fit(V1_train[v1_features], V1_train['Transported'])
print(f"oob score: {model_V1.oob_score_}")

In [None]:
# Predict the unlabelled rows with model V1 and write the submission file.
v1_features = ['CryoSleep', 'Age_code']
V1_test = data.loc[data['Transported'].isnull()]
V1_output = model_V1.predict(V1_test[v1_features]).astype(bool)
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Transported': V1_output,
})
submission.to_csv('V1_Submission.csv', index = False)

* Accuracy has increased by 2%, which means the Age_code feature is useful

***

# **Feature Engineering - Model V2**

In [None]:
# Impute each spending column with its own median. The filled Series is
# assigned back: fillna(..., inplace=True) on a column selection does not
# update `data` under pandas Copy-on-Write, which would leave NaNs in these
# columns and in Total_Cost.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for column in spend_columns:
    data[column] = data[column].fillna(data[column].median())

# Sum of the five spending columns.
data['Total_Cost'] = data['RoomService'] + data['FoodCourt'] + data['ShoppingMall'] + data['Spa'] + data['VRDeck']

# **Model Training - Model V2**

In [None]:
# V2: adds the per-amenity spending columns and their total.
v2_features = [
    'CryoSleep', 'Age_code', 'Total_Cost',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
]
V2_train = data.loc[data['Transported'].notnull()]
model_V2 = RandomForestClassifier(
    n_estimators = 250,
    min_samples_split = 20,
    oob_score = True,
    random_state = 0,
)
model_V2.fit(V2_train[v2_features], V2_train['Transported'])
print(f"oob score: {model_V2.oob_score_}")

In [None]:
# Predict the unlabelled rows with model V2 and write the submission file.
v2_features = [
    'CryoSleep', 'Age_code', 'Total_Cost',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
]
V2_test = data.loc[data['Transported'].isnull()]
V2_output = model_V2.predict(V2_test[v2_features]).astype(bool)
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Transported': V2_output,
})
submission.to_csv('V2_Submission.csv', index = False)

***

# **Data Visualization - Model V3**

In [None]:
# Impute missing categories with fixed labels. The result is assigned back
# because fillna(..., inplace=True) on a column selection does not update
# `data` under pandas Copy-on-Write.
# NOTE(review): the fill labels are written without a trailing space so the
# imputed rows join the existing category instead of forming a separate
# 'TRAPPIST-1e ' / 'Earth ' group — confirm the spellings against the CSV.
data['Destination'] = data['Destination'].fillna('TRAPPIST-1e')
data['HomePlanet'] = data['HomePlanet'].fillna('Earth')

In [None]:
# Mean of Transported for every (HomePlanet, Destination) pair.
data.groupby(['HomePlanet', 'Destination'])['Transported'].mean()

# **Feature Engineering - Model V3**

In [None]:
# Combine origin and destination into one "route" label joined by an underscore.
data['HomePlanet+Destination'] = data['HomePlanet'].str.cat(data['Destination'], sep = '_')

# Encode each distinct route label as an integer code.
le = LabelEncoder()
le.fit(data['HomePlanet+Destination'])
data['HomePlanet+Destination_code'] = le.transform(data['HomePlanet+Destination'])

# **Model Training - Model V3**

In [None]:
# V3: adds the encoded HomePlanet+Destination route.
v3_features = [
    'CryoSleep', 'Age_code', 'Total_Cost',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'HomePlanet+Destination_code',
]
V3_train = data.loc[data['Transported'].notnull()]
model_V3 = RandomForestClassifier(
    n_estimators = 250,
    min_samples_split = 20,
    oob_score = True,
    random_state = 0,
)
model_V3.fit(V3_train[v3_features], V3_train['Transported'])
print(f"oob score: {model_V3.oob_score_}")

In [None]:
# Predict the unlabelled rows with model V3 and write the submission file.
v3_features = [
    'CryoSleep', 'Age_code', 'Total_Cost',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'HomePlanet+Destination_code',
]
V3_test = data.loc[data['Transported'].isnull()]
V3_output = model_V3.predict(V3_test[v3_features]).astype(bool)
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Transported': V3_output,
})
submission.to_csv('V3_Submission.csv', index = False)

***

# **Feature Engineering - Model V4**

In [None]:
# Split Cabin on '/' into its three parts and store them as Desk, Num and Side.
cabin = data['Cabin'].str.split('/', expand = True)
for position, column in enumerate(['Desk', 'Num', 'Side']):
    data[column] = cabin[position]

In [None]:
# Impute missing cabin parts with fixed labels. The result is assigned back
# because fillna(..., inplace=True) on a column selection does not update
# `data` under pandas Copy-on-Write, which would pass missing values on to
# the encoder below.
data['Desk'] = data['Desk'].fillna('F')
data['Side'] = data['Side'].fillna('S')

# Replace each label with an integer code; the encoder is re-fitted per column.
le = LabelEncoder()
data['Desk'] = le.fit_transform(data['Desk'])
data['Side'] = le.fit_transform(data['Side'])

# **Model Training - Model V4**

In [None]:
# V4: adds the encoded cabin Desk and Side columns.
v4_features = [
    'CryoSleep', 'Age_code', 'Total_Cost',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'HomePlanet+Destination_code', 'Desk', 'Side',
]
V4_train = data.loc[data['Transported'].notnull()]
model_V4 = RandomForestClassifier(
    n_estimators = 250,
    min_samples_split = 20,
    oob_score = True,
    random_state = 0,
)
model_V4.fit(V4_train[v4_features], V4_train['Transported'])
print(f"oob score: {model_V4.oob_score_}")

In [None]:
# Predict the unlabelled rows with model V4 and write the submission file.
v4_features = [
    'CryoSleep', 'Age_code', 'Total_Cost',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'HomePlanet+Destination_code', 'Desk', 'Side',
]
V4_test = data.loc[data['Transported'].isnull()]
V4_output = model_V4.predict(V4_test[v4_features]).astype(bool)
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Transported': V4_output,
})
submission.to_csv('V4_Submission.csv', index = False)

***

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
import warnings
import os

warnings.filterwarnings("ignore")

# Rebuild the combined frame from the raw CSVs and re-apply the feature
# engineering in a single cell.
df_train = pd.read_csv('/share/dutta/eyao/dataset/kaggle/spaceship-titanic/train.csv')
df_test = pd.read_csv('/share/dutta/eyao/dataset/kaggle/spaceship-titanic/test.csv')

data = pd.concat([df_train, df_test], axis = 0)
data.reset_index(drop = True, inplace = True)

# Every fillna below assigns its result back to the column. Calling
# fillna(..., inplace=True) on a column selection does not update `data`
# under pandas Copy-on-Write, so the missing values would silently survive.
data['CryoSleep'] = data['CryoSleep'].fillna(False)

# Encode the boolean columns as 0/1.
data['CryoSleep'] = data['CryoSleep'].map(lambda x: 1 if x else 0)
data['Transported'] = data['Transported'].map({True: 1, False: 0})

# Missing ages get the sentinel -20, then Age is bucketed:
# 0 = missing, 1 = ages 0-10, 2 = ages above 10.
data['Age'] = data['Age'].fillna(-20).astype(int)
data.loc[(data['Age'] == -20), 'Age_code'] = 0
data.loc[(data['Age'] <= 10) & (data['Age'] >= 0), 'Age_code'] = 1
data.loc[(data['Age'] > 10), 'Age_code'] = 2

# Median-impute each spending column.
for column in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    data[column] = data[column].fillna(data[column].median())

# Second space-separated token of Name; rows without a Name stay NaN.
data['Surname'] = data['Name'].dropna().str.split(' ', expand = True)[1]
data['Total_Cost'] = data['RoomService'] + data['FoodCourt'] + data['ShoppingMall'] + data['Spa'] + data['VRDeck']

# NOTE(review): the fill labels are written without a trailing space so the
# imputed rows join the existing category instead of forming a separate
# 'TRAPPIST-1e ' / 'Earth ' group — confirm the spellings against the CSV.
data['Destination'] = data['Destination'].fillna('TRAPPIST-1e')
data['HomePlanet'] = data['HomePlanet'].fillna('Earth')
data['HomePlanet+Destination'] = data['HomePlanet'] + '_' + data['Destination']

# 1 when the passenger's total spending is exactly zero.
data['Zero_Cost'] = np.where(data['Total_Cost'] == 0, 1, 0)

le = LabelEncoder()
data['HomePlanet+Destination_code'] = le.fit_transform(data['HomePlanet+Destination'])

# Split Cabin on '/' into its three parts.
cabin = data['Cabin'].str.split('/', expand = True)
data['Desk'], data['Num'], data['Side'] = cabin[0], cabin[1], cabin[2]

data['Desk'] = data['Desk'].fillna('F')
data['Side'] = data['Side'].fillna('S')
le = LabelEncoder()
data['Desk'] = le.fit_transform(data['Desk'])
data['Side'] = le.fit_transform(data['Side'])

data.head()

In [None]:
# PassengerId is split on '_' into an integer group number and an integer
# within-group identifier.
data[['Group_Num', 'Ident']] = data['PassengerId'].str.split('_', expand = True).astype(int)

# Count the rows in each group once and look the size up per row. Recomputing
# value_counts() inside apply() for every row is quadratic in the row count.
group_sizes = data['Group_Num'].value_counts()
data['Group_Size'] = data['Group_Num'].map(group_sizes)

# 1 when the passenger is the only member of their group.
data['Solo'] = np.where(data['Group_Size'] == 1, 1, 0)

In [None]:
# Convert the cabin number from text to a numeric column, keeping missing
# entries as NaN. This avoids the '9999' sentinel round-trip, which would
# also have turned a genuine cabin number 9999 into NaN and relied on an
# in-place fillna that does not update `data` under pandas Copy-on-Write.
data['Num'] = pd.to_numeric(data['Num'])

In [None]:
# Occurrences of each cabin number, least frequent first.
data['Num'].value_counts().sort_values(ascending = True)

In [None]:
# Histogram of cabin numbers coloured by outcome, with a vertical marker
# every 300 cabins.
fig, ax = plt.subplots(figsize = (10, 12))
ax = sns.histplot(data = data, x = 'Num', hue = 'Transported', binwidth = 20)
for boundary in (300, 600, 900, 1200, 1500, 1800):
    ax.vlines(boundary, ymin = 0, ymax = 50, color = 'black')
ax.set_title('Cabin number')
ax.set_xlim([0, 2000])

In [None]:
# One-hot encode the cabin number into seven bands that are 300 wide, the
# last one open-ended. Comparisons against NaN are False, so rows without a
# cabin number get 0 everywhere until the final line puts them in band 1.
cabin_number = data['Num']
inner_bands = [
    ('Num_code_2', 300, 600),
    ('Num_code_3', 600, 900),
    ('Num_code_4', 900, 1200),
    ('Num_code_5', 1200, 1500),
    ('Num_code_6', 1500, 1800),
]
data['Num_code_1'] = (cabin_number < 300).astype(int)
for band_name, lower, upper in inner_bands:
    in_band = (cabin_number >= lower) & (cabin_number < upper)
    data[band_name] = in_band.astype(int)
data['Num_code_7'] = (cabin_number >= 1800).astype(int)
data.loc[cabin_number.isnull(), 'Num_code_1'] = 1

In [None]:
# V5: adds the one-hot cabin-number bands.
v5_features = [
    'CryoSleep', 'Age_code', 'Total_Cost',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'HomePlanet+Destination_code', 'Desk', 'Side',
    'Num_code_1', 'Num_code_2', 'Num_code_3', 'Num_code_4',
    'Num_code_5', 'Num_code_6', 'Num_code_7',
]
V5_train = data.loc[data['Transported'].notnull()]
model_V5 = RandomForestClassifier(
    n_estimators = 250,
    min_samples_split = 20,
    oob_score = True,
    random_state = 0,
)
model_V5.fit(V5_train[v5_features], V5_train['Transported'])
print(f"oob score: {model_V5.oob_score_}")

In [None]:
# Predict the unlabelled rows with model V5 and write the submission file.
v5_features = [
    'CryoSleep', 'Age_code', 'Total_Cost',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'HomePlanet+Destination_code', 'Desk', 'Side',
    'Num_code_1', 'Num_code_2', 'Num_code_3', 'Num_code_4',
    'Num_code_5', 'Num_code_6', 'Num_code_7',
]
V5_test = data.loc[data['Transported'].isnull()]
V5_output = model_V5.predict(V5_test[v5_features]).astype(bool)
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Transported': V5_output,
})
submission.to_csv('V5_Submission.csv', index = False)