### This notebook builds a RF model for Spaceship Titanic project

In [1]:
project_name = 'ML-projects-gen3'
project_id = 'polished-vault-379315'
app_folder = '/home/jupyter/project_repos/SpaceTitanic/spacetitanic-app'
data_path = '/home/jupyter/projects_data/spacetitanic'
model_bucket = 'gs://mpg3-model-artifacts/spacetitanic'
ml_project_name = 'space_titanic'
model_name = 'XGB'

In [2]:
# 0. Load libraries #

import numpy as np
import pandas as pd
import os, time, warnings, optuna, shap, pickle, joblib
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from google.cloud import bigquery, storage

pd.set_option('display.max_columns', 40)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')

# Load custom pre-processing functions:

def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """
    This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)
    """
    # set df_pred to None if it does not exist
    if not ((cat_fill=='mode') and (num_fill=='median')):
        print ('Imputation method not Implemented yet!')
        return None
    
    df_train[num_features] = df_train[num_features].fillna(value=df_train[num_features].median())
    df_test[num_features] = df_test[num_features].fillna(value=df_train[num_features].median())
    df_train[cat_features] = df_train[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_test[cat_features] = df_test[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    if (df_pred is not None):
        df_pred[num_features] = df_pred[num_features].fillna(value=df_train[num_features].median())
        df_pred[cat_features] = df_pred[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_train[num_features+cat_features].count
    
    all_good = (
    (np.prod(df_train[num_features+cat_features].shape)==df_train[num_features+cat_features].count().sum()) and 
    (np.prod(df_test[num_features+cat_features].shape) == df_test[num_features+cat_features].count().sum()) and 
    (np.prod(df_pred[num_features+cat_features].shape) == df_pred[num_features+cat_features].count().sum()))
    if (all_good):
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """
    This function creates new dummy columns for missing features.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])
    """
    # set df_pred to None if it does not exist
    for feature_name in features:
        misColName = 'mis'+feature_name
        df_train.loc[df_train[feature_name].isnull(), misColName]=1
        df_train.loc[df_train[feature_name].notnull(), misColName]=0
        df_test.loc[df_test[feature_name].isnull(), misColName]=1
        df_test.loc[df_test[feature_name].notnull(), misColName]=0
        if (df_pred is not None):
            df_pred.loc[df_pred[feature_name].isnull(), misColName]=1
            df_pred.loc[df_pred[feature_name].notnull(), misColName]=0
   

    
def add_dummyfeatures(df_train, df_test, df_pred, feature_dict):
    """
    This function adds dummy feature when some feature is equal to value, specified in a dictionary.
    Example: add_dummyfeatures(X_train, X_test, X_pred, {'RoomService':0, 'Spa':0, 'VRDeck':0, 'ShoppingMall':0})
    """
    input_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    for i in range(len(list(feature_dict.items()))):
        feature,value = list(feature_dict.keys())[i], list(feature_dict.values())[i]
        df_train.loc[df_train[feature]==value,(str(feature)+str(value))]=1
        df_train.loc[df_train[feature]!=value,(str(feature)+str(value))]=0
        df_test.loc[df_test[feature]==value,(str(feature)+str(value))]=1
        df_test.loc[df_test[feature]!=value,(str(feature)+str(value))]=0
        df_pred.loc[df_pred[feature]==value,(str(feature)+str(value))]=1
        df_pred.loc[df_pred[feature]!=value,(str(feature)+str(value))]=0
    output_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    print(output_dimensions-input_dimensions, ' variables created') 
    
    
# 1. Load data #

In [3]:
os.chdir(app_folder)

In [4]:
time0 = time.time()

df = pd.read_csv(data_path + '/train.csv') 
df0 = df.copy()

pred=pd.read_csv(data_path + '/test.csv')
pred0 = pred.copy()

print(df.shape, pred.shape)
df.head()

(8693, 14) (4277, 13)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
# 2. pEDA #
df[['Deck', 'Room', 'Side']] = df['Cabin'].str.split('/', 2, expand=True)
df[['Group_Id', 'Passeng_Id']] = df['PassengerId'].str.split('_', 1, expand=True)
pred[['Deck', 'Room', 'Side']] = pred['Cabin'].str.split('/', 2, expand=True)
pred[['Group_Id', 'Passeng_Id']] = pred['PassengerId'].str.split('_', 1, expand=True)
df.drop(columns = ['PassengerId', 'Cabin', 'Name', 'Passeng_Id'], inplace = True)
pred.drop(columns = ['PassengerId', 'Cabin', 'Name', 'Passeng_Id'], inplace = True)
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Room,Side,Group_Id
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P,1
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S,2
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S,3
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S,3
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S,4


In [6]:
# 3. Train-test split #

train_y = df[['Transported']]
train_x = df.drop(columns = ['Transported'])
X_pred = pred.copy()

cat_cols = [col for col in train_x.columns if train_x[col].nunique() in range(2,10)]
num_cols = list(set(train_x.columns)-set(cat_cols)-{'Group_Id', 'Room'})

print(f'categorical features: {cat_cols} numerical features: {num_cols}')

X_train, X_test, y_train, y_test = train_test_split(train_x, 
                                                    train_y, 
                                                    test_size = 0.1, 
                                                    random_state=4)
print(X_train.shape, X_test.shape, y_train.shape, X_pred.shape)

X_train.info()

# add room and group variables:

roomsize = X_train.groupby('Room').size().to_frame()
roomsize.reset_index(inplace=True)
roomsize.columns = ['Room', 'Num_ppl_room']
roomsize.sort_values('Num_ppl_room', inplace = True)
X_train = pd.merge(X_train, roomsize, on='Room', how='left')
X_test = pd.merge(X_test, roomsize, on='Room', how='left')
X_pred = pd.merge(X_pred, roomsize, on='Room', how='left')
groupsize = X_train.groupby('Group_Id').size().to_frame()
groupsize.reset_index(inplace=True)
groupsize.columns = ['Group_Id', 'Num_ppl_group']
groupsize.sort_values('Num_ppl_group', inplace = True)
X_train = pd.merge(X_train, groupsize, on='Group_Id', how='left')
X_test = pd.merge(X_test, groupsize, on='Group_Id', how='left')
X_pred = pd.merge(X_pred, groupsize, on='Group_Id', how='left')

num_cols.extend(['Num_ppl_room', 'Num_ppl_group'])


# 4. Missing values #

#add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age', 'HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'RoomService',
#'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Deck', 'Side'])
fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)
X_train.info()

# 5. Feature engineering #

# add useful variables:
# add_dummyfeatures(X_train, X_test, X_pred, 
#                   {'RoomService':0, 'Spa':0, 'VRDeck':0, 'ShoppingMall':0})
# ZeroFC dummy seems to hurt results...

# add more features 

X_train.drop(columns=['Room', 'Group_Id'], inplace= True)
X_train.loc[(X_train['RoomService']==0)&
            (X_train['FoodCourt']==0)&
            (X_train['ShoppingMall']==0)&
            (X_train['Spa']==0)&
            (X_train['VRDeck']==0),'ZeroService'] = 1
X_train.loc[X_train.ZeroService.isnull(), 'ZeroService'] = 0

X_test.drop(columns=['Room', 'Group_Id'], inplace= True)
X_test.loc[(X_test['RoomService']==0)&
           (X_test['FoodCourt']==0)&
           (X_test['ShoppingMall']==0)&
           (X_test['Spa']==0)&
           (X_test['VRDeck']==0),'ZeroService'] = 1
X_test.loc[X_test.ZeroService.isnull(), 'ZeroService'] = 0

X_pred.drop(columns=['Room', 'Group_Id'], inplace= True)
X_pred.loc[(X_pred['RoomService']==0)&
           (X_pred['FoodCourt']==0)&
           (X_pred['ShoppingMall']==0)&
           (X_pred['Spa']==0)&
           (X_pred['VRDeck']==0),'ZeroService'] = 1
X_pred.loc[X_pred.ZeroService.isnull(), 'ZeroService'] = 0

# cat_cols.extend(['RoomService0', 'Spa0', 'VRDeck0', 'ShoppingMall0', 'ZeroService'])
cat_cols.extend(['ZeroService'])

# X_train.describe()
# X_train[:5]

feature_transformer = ColumnTransformer([
    ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore", drop='if_binary'), cat_cols),
    ], remainder = "passthrough")

X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
X_pred = pd.DataFrame(feature_transformer.transform(X_pred), columns=feature_transformer.get_feature_names_out())

# restrict feature set to make it easy to deploy via a webapp:

feature_set = ['cat__HomePlanet_Earth', 
               'cat__HomePlanet_Europa',
               'cat__CryoSleep_True',
               'cat__Deck_G', 
               'cat__Side_S',
               'cat__ZeroService_1.0', 
               'remainder__RoomService',
               'remainder__FoodCourt', 
               'remainder__ShoppingMall', 
               'remainder__Spa',
               'remainder__VRDeck']

# feature_set = ['cat__HomePlanet_Earth', 
#                'cat__HomePlanet_Europa',
#                'cat__CryoSleep_True',
#                'cat__Deck_G', 
#                'cat__Side_S',
#                'cat__ZeroService_1.0']

X_train = X_train[feature_set]
X_test = X_test[feature_set]
X_pred = X_pred[feature_set]

print(X_train.shape)
display(X_train.head())
print(X_train.columns)

categorical features: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side'] numerical features: ['RoomService', 'Spa', 'VRDeck', 'Age', 'FoodCourt', 'ShoppingMall']
(7823, 14) (870, 14) (7823, 1) (4277, 14)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7823 entries, 1670 to 1146
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    7634 non-null   object 
 1   CryoSleep     7622 non-null   object 
 2   Destination   7661 non-null   object 
 3   Age           7659 non-null   float64
 4   VIP           7643 non-null   object 
 5   RoomService   7656 non-null   float64
 6   FoodCourt     7650 non-null   float64
 7   ShoppingMall  7639 non-null   float64
 8   Spa           7653 non-null   float64
 9   VRDeck        7649 non-null   float64
 10  Deck          7639 non-null   object 
 11  Room          7639 non-null   object 
 12  Side          7639 non-null   object 
 13  Group_Id      7823 no

Unnamed: 0,cat__HomePlanet_Earth,cat__HomePlanet_Europa,cat__CryoSleep_True,cat__Deck_G,cat__Side_S,cat__ZeroService_1.0,remainder__RoomService,remainder__FoodCourt,remainder__ShoppingMall,remainder__Spa,remainder__VRDeck
0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,8.0,3409.0,0.0
2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,294.0,79.0,536.0,0.0,0.0


Index(['cat__HomePlanet_Earth', 'cat__HomePlanet_Europa',
       'cat__CryoSleep_True', 'cat__Deck_G', 'cat__Side_S',
       'cat__ZeroService_1.0', 'remainder__RoomService',
       'remainder__FoodCourt', 'remainder__ShoppingMall', 'remainder__Spa',
       'remainder__VRDeck'],
      dtype='object')


In [7]:
# 6. Fit rf and xgb #

time1 = time.time()
rf = RandomForestClassifier(
                    n_jobs=-1,
                    n_estimators=300,
                    max_depth=7
)

xgb = XGBClassifier(tree_method='gpu_hist',
                   n_estimators=300,
                   eta=0.12, 
                   max_depth=5,
                   subsample=0.6,
                   colsample_bytree=0.6)

rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
print(f'''RF in sample: {accuracy_score(y_train, rf.predict(X_train)):.3f}''')
print(f'''RF out of sample: {accuracy_score(y_test, rf.predict(X_test)):.3f}''')
print(f'''XGB in sample: {accuracy_score(y_train, xgb.predict(X_train)):.3f}''')
print(f'''XGB out of sample: {accuracy_score(y_test, xgb.predict(X_test)):.3f}''')
print(f'Total time: {time.time()-time0:.3f}')

RF in sample: 0.810
RF out of sample: 0.815
XGB in sample: 0.850
XGB out of sample: 0.817
Total time: 4.444


In [8]:
print(os.getcwd())
artifact_filename_rf = 'rf_model.pkl'
joblib.dump(rf, artifact_filename_rf)

/home/jupyter/project_repos/SpaceTitanic/spacetitanic-app


['rf_model.pkl']

In [9]:
storage_path = os.path.join(model_bucket, artifact_filename_rf)
blob = storage.blob.Blob.from_string(storage_path, 
                                     client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_rf)

In [10]:
file = open(artifact_filename_rf, "rb")
trained_model = joblib.load(file)
prediction = trained_model.predict([list(X_test.iloc[0,:])])
print(f'''lm prediction: {prediction}. 
Total time is {time.time()-time0:.3f} sec''')

lm prediction: [False]. 
Total time is 7.075 sec


In [11]:
X_train

Unnamed: 0,cat__HomePlanet_Earth,cat__HomePlanet_Europa,cat__CryoSleep_True,cat__Deck_G,cat__Side_S,cat__ZeroService_1.0,remainder__RoomService,remainder__FoodCourt,remainder__ShoppingMall,remainder__Spa,remainder__VRDeck
0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,8.0,3409.0,0.0
2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,294.0,79.0,536.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
7818,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
7819,0.0,1.0,0.0,0.0,0.0,0.0,0.0,38.0,0.0,1.0,3167.0
7820,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
7821,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
