### This is Prod script, building and deploying simple XGB model for Titanic

Since it uses only RF and XGBoost, it is simpler than dev script for this project


In [1]:
notebook_run_id = 1
# notebook_run_id is a digit, creating and deploying a new model every time this notebook is run. increment it by 1.
project_name = 'My First Project'
project_id = 'quantum-keep-360100'
regionn = 'us-central1'

ml_project_name = 'titanic'
model_name = 'RF'

In [13]:
# 0. Load libraries #

import numpy as np
import pandas as pd
import os, time, warnings, optuna, pickle

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from google.cloud import bigquery, storage

pd.set_option('display.max_columns', 20)
warnings.filterwarnings('ignore')

# Load custom pre-processing functions:

def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)"""
    # set df_pred to None if it does not exist
    if not ((cat_fill=='mode') and (num_fill=='median')):
        print ('Imputation method not Implemented yet!')
        return None
    
    df_train[num_features] = df_train[num_features].fillna(value=df_train[num_features].median())
    df_test[num_features] = df_test[num_features].fillna(value=df_train[num_features].median())
    df_train[cat_features] = df_train[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_test[cat_features] = df_test[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    if (df_pred is not None):
        df_pred[num_features] = df_pred[num_features].fillna(value=df_train[num_features].median())
        df_pred[cat_features] = df_pred[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_train[num_features+cat_features].count
    
    all_good = (
    (np.prod(df_train[num_features+cat_features].shape)==df_train[num_features+cat_features].count().sum()) and 
    (np.prod(df_test[num_features+cat_features].shape) == df_test[num_features+cat_features].count().sum()) and 
    (np.prod(df_pred[num_features+cat_features].shape) == df_pred[num_features+cat_features].count().sum()))
    if (all_good):
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """This function creates new dummy columns for missing features.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])"""
    # set df_pred to None if it does not exist
    for feature_name in features:
        misColName = 'mis'+feature_name
        df_train.loc[df_train[feature_name].isnull(), misColName]=1
        df_train.loc[df_train[feature_name].notnull(), misColName]=0
        df_test.loc[df_test[feature_name].isnull(), misColName]=1
        df_test.loc[df_test[feature_name].notnull(), misColName]=0
        if (df_pred is not None):
            df_pred.loc[df_pred[feature_name].isnull(), misColName]=1
            df_pred.loc[df_pred[feature_name].notnull(), misColName]=0
   


In [3]:
# 1. Load data #

time0 = time.time()

os.chdir('/home/jupyter/projects_data/titanic')
df = pd.read_csv('train.csv') 

df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'],inplace=True)
pred = pd.read_csv('test.csv')
pred.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'],inplace=True)

print(df.shape, pred.shape)

(891, 8) (418, 7)


In [4]:
# 2. EDA, adding features #

df['Age2'] = df['Age']**2
pred['Age2'] = pred['Age']**2

# 3. Train-test split #

train_y = df[['Survived']]
train_x = df.drop(columns = ['Survived'])
X_pred = pred.copy()

cat_cols = ['Sex', 'Embarked']
num_cols = list(set(train_x.columns)-set(cat_cols))

print('categorical features: ', cat_cols, 'numerical features: ', num_cols)

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.2, random_state=4)
print(X_train.shape, X_test.shape, y_train.shape, X_pred.shape)

X_train.info()

categorical features:  ['Sex', 'Embarked'] numerical features:  ['Parch', 'Age2', 'SibSp', 'Age', 'Pclass', 'Fare']
(712, 8) (179, 8) (712, 1) (418, 8)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 42 to 122
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    object 
 2   Age       570 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Embarked  710 non-null    object 
 7   Age2      570 non-null    float64
dtypes: float64(3), int64(3), object(2)
memory usage: 50.1+ KB


In [5]:
# 4. Misisng values #

add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])

fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)

cat_cols.extend(['misAge'])

feature_transformer = ColumnTransformer([
        ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore", drop='if_binary'), cat_cols)],
        remainder = "passthrough"
    )

X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
X_pred = pd.DataFrame(feature_transformer.transform(X_pred), columns=feature_transformer.get_feature_names_out())

Missing values imputed successfully


In [6]:
# 6. Fit models #

time1 = time.time()
rf = RandomForestClassifier()
param_grid = {'n_estimators':[100, 200], 
              'max_depth':[3, 4, 5, 6, 7], 
              'max_features':[4, 5, 6]}
rfm = GridSearchCV(rf, param_grid, cv=2)
rfm.fit(X_train, y_train)
print('RF ', 
      rfm.best_params_, 
      '\n',
      accuracy_score(y_train, rfm.predict(X_train)), 
      roc_auc_score(y_train, rfm.predict(X_train)), time.time()-time1)

time1 = time.time()
xgb = XGBClassifier()
# use 'gpu_hist' for more than 10,000 examples.
param_grid = {'n_estimators':[150, 250], 
              'max_depth':[2, 3, 4], 
              'eta':[0.01, 0.02, 0.03, 0.04, 0.05, 0.06], 
              'subsample':[0.7],
              'colsample_bytree':[0.6]}
xgbm = GridSearchCV(xgb, param_grid, cv=2)
xgbm.fit(X_train, y_train)
print('XGB ', 
      xgbm.best_params_, 
      '\n',
      accuracy_score(y_train, xgbm.predict(X_train)), 
      roc_auc_score(y_train, xgbm.predict(X_train)), 
      time.time()-time1)


# 7. model evaluation #

print('Out of Sample:')
print('RF ', 
      accuracy_score(y_test, rfm.predict(X_test)), 
      roc_auc_score(y_test, rfm.predict(X_test)))
print('XGB ', 
      accuracy_score(y_test, xgbm.predict(X_test)), 
      roc_auc_score(y_test, xgbm.predict(X_test)))
print('Total time ', time.time()-time0)

print('Total time for training part: ', time.time() - time0)

RF  {'max_depth': 6, 'max_features': 5, 'n_estimators': 200} 
 0.8721910112359551 0.8532986970146792 14.417773723602295
XGB  {'colsample_bytree': 0.6, 'eta': 0.01, 'max_depth': 4, 'n_estimators': 250, 'subsample': 0.7} 
 0.8581460674157303 0.8386194952993569 38.61670207977295
Out of Sample:
RF  0.8324022346368715 0.7871848739495798
XGB  0.8212290502793296 0.7829131652661064
Total time  53.256232023239136
Total time for training part:  53.2563054561615


The results are somewhat surprising. I have played for more than 1 hours with hyprparmeters and RF still usually beats XGB. 
If I do hyperparemter tuning rigorously (e.g., Optuna), xgb will probably beat RF eventually. But do not want to waste more time on this, given that thi is Prod script. So I use RF.

#### RF Model deployment

In [11]:
deployment_time_start = time.time()

os.chdir('/home/jupyter/projects_gcp_cpu')

model_path = os.getcwd()+'/titanic/artifacts/model_rf/'

# Save model artifact to local filesystem (doesn't persist)
artifact_filename = 'model.pkl'
with open(model_path+artifact_filename, 'wb') as model_file:
  pickle.dump(rfm, model_file)

In [14]:
# Upload model artifact to Cloud Storage
# Change the model directory to your GCS bucket URI
model_bucket = 'gs://pmykola-projectsgcp-artifacts/titanic-rf'
storage_path = os.path.join(model_bucket, artifact_filename)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
# previously it was 'project_id'
blob.upload_from_filename(model_path+artifact_filename)

In [17]:
from google.cloud import aiplatform

# Use this line so we do not need to explicitly specify the project number and region whenever we use AI Platform (Vertex AI) services
aiplatform.init(project=project_id, location=regionn)

# Importing model artifacts
model = aiplatform.Model.upload(display_name = ml_project_name+model_name+str(notebook_run_id),
    description = ml_project_name+model_name+str(notebook_run_id),
    artifact_uri = model_bucket,
    serving_container_image_uri = 'us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest'
)

Creating Model
Create Model backing LRO: projects/234443118908/locations/us-central1/models/2948797826718498816/operations/4305759346224005120
Model created. Resource name: projects/234443118908/locations/us-central1/models/2948797826718498816@1
To use this Model in another session:
model = aiplatform.Model('projects/234443118908/locations/us-central1/models/2948797826718498816@1')


In [18]:
endpoint = aiplatform.Endpoint.create(display_name = ml_project_name+model_name+str(notebook_run_id), 
                                      project = project_id, 
                                      location = regionn)
endpoint_id = endpoint.resource_name[-19:0]

Creating Endpoint
Create Endpoint backing LRO: projects/234443118908/locations/us-central1/endpoints/15626259253952512/operations/5449673651576111104
Endpoint created. Resource name: projects/234443118908/locations/us-central1/endpoints/15626259253952512
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/234443118908/locations/us-central1/endpoints/15626259253952512')


In [19]:
# even RF is slow to deploy...
# at least 8 min
# have to experiment with more powerful machines, maybe they will work faster
model.deploy(endpoint = endpoint,
             machine_type = 'n1-standard-2')

Deploying model to Endpoint : projects/234443118908/locations/us-central1/endpoints/15626259253952512
Deploy Endpoint model backing LRO: projects/234443118908/locations/us-central1/endpoints/15626259253952512/operations/6933609728794689536
Endpoint model deployed. Resource name: projects/234443118908/locations/us-central1/endpoints/15626259253952512


<google.cloud.aiplatform.models.Endpoint object at 0x7f6f033f2250> 
resource name: projects/234443118908/locations/us-central1/endpoints/15626259253952512

In [24]:
display(endpoint.predict(instances=[[1.0, 1.0, 0.0, 0.0, 1.0, 3.0, 28.5, 0.0, 0.0, 7.8958, 812.25]]))
endpoint_id = endpoint.resource_name[-19:]
display(endpoint_id)

Prediction(predictions=[0.0], deployed_model_id='5752420536151965696', model_version_id='1', model_resource_name='projects/234443118908/locations/us-central1/models/2948797826718498816', explanations=None)

's/15626259253952512'

In [26]:
import json

payload = {'instances': [[1.0, 1.0, 0.0, 0.0, 1.0, 3.0, 28.5, 0.0, 0.0, 7.8958, 812.25], 
                         [1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 34.0, 0.0, 0.0, 7.8958, 812.25]]}

# Parse JSON
with open('request.json', 'w') as outfile:
    json.dump(payload, outfile)

!gcloud ai endpoints predict $endpoint_id \
  --region=$regionn \
  --json-request=request.json

Using endpoint [https://us-central1-prediction-aiplatform.googleapis.com/]
[1;31mERROR:[0m (gcloud.ai.endpoints.predict) HTTPError 404


In [None]:
print('Model deployment time: ', time.time() - deployment_time_start)

In [23]:
list(X_train.iloc[0,:])

[1.0, 1.0, 0.0, 0.0, 1.0, 3.0, 28.5, 0.0, 0.0, 7.8958, 812.25]

In [25]:
X_train

Unnamed: 0,cat__Sex_male,cat__Embarked_C,cat__Embarked_Q,cat__Embarked_S,cat__misAge_1.0,remainder__Pclass,remainder__Age,remainder__SibSp,remainder__Parch,remainder__Fare,remainder__Age2
0,1.0,1.0,0.0,0.0,1.0,3.0,28.5,0.0,0.0,7.8958,812.25
1,1.0,0.0,0.0,1.0,0.0,2.0,60.0,1.0,1.0,39.0000,3600.00
2,1.0,0.0,0.0,1.0,0.0,3.0,36.0,1.0,0.0,15.5500,1296.00
3,0.0,0.0,0.0,1.0,1.0,3.0,28.5,3.0,1.0,25.4667,812.25
4,1.0,0.0,0.0,1.0,1.0,1.0,28.5,0.0,0.0,30.0000,812.25
...,...,...,...,...,...,...,...,...,...,...,...
707,1.0,0.0,0.0,1.0,0.0,3.0,40.0,1.0,4.0,27.9000,1600.00
708,1.0,1.0,0.0,0.0,1.0,3.0,28.5,1.0,1.0,15.2458,812.25
709,1.0,0.0,0.0,1.0,0.0,2.0,31.0,0.0,0.0,10.5000,961.00
710,1.0,1.0,0.0,0.0,0.0,1.0,56.0,0.0,0.0,30.6958,3136.00
