## Model Versioning Design Pattern

In the Model Versioning design pattern, backward compatibility is achieved by deploying a changed model as a microservice with a different REST endpoint. This is a necessary prerequisite for many of the other patterns discussed in this chapter.

In [16]:
import json
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from google.cloud import bigquery

## Download and preprocess data

You'll need to authenticate to your Google Cloud to run the BigQuery query below.

In [2]:
from google.colab import auth
auth.authenticate_user()

In the following cell, replace `your-cloud-project` with the name of your GCP project.

In [37]:
# Note: this query may take a few minutes to run
%%bigquery df --project your-cloud-project
SELECT
  arr_delay,
  carrier,
  origin,
  dest,
  dep_delay,
  taxi_out,
  distance
FROM
  `cloud-training-demos.flights.tzcorr`
WHERE
  extract(year from fl_date) = 2015
ORDER BY fl_date ASC
LIMIT 300000

In [38]:
df = df.dropna()
df = shuffle(df, random_state=2)

In [39]:
df.head()

Unnamed: 0,arr_delay,carrier,origin,dest,dep_delay,taxi_out,distance
77962,11.0,OO,LAS,SEA,-5.0,24.0,867.0
77162,4.0,DL,BOS,SLC,-1.0,21.0,2105.0
208677,0.0,HA,HNL,LIH,-3.0,16.0,102.0
194182,11.0,WN,ABQ,PHX,13.0,6.0,328.0
296710,1.0,EV,IND,EWR,-7.0,13.0,645.0


In [40]:
# Only include origins and destinations that occur frequently in the dataset
df = df[df['origin'].map(df['origin'].value_counts()) > 500]
df = df[df['dest'].map(df['dest'].value_counts()) > 500]

In [41]:
df = pd.get_dummies(df, columns=['carrier', 'origin', 'dest'])

## Model version #1: predict whether or not the flight is > 30 min delayed

In [42]:
# Create a boolean column to indicate whether flight was > 30 mins delayed
df.loc[df['arr_delay'] >= 30, 'arr_delay_bool'] = 1
df.loc[df['arr_delay'] < 30, 'arr_delay_bool'] = 0

In [43]:
df['arr_delay_bool'].value_counts()

0.0    196299
1.0     34031
Name: arr_delay_bool, dtype: int64

In [44]:
classify_model_labels = df['arr_delay_bool']
classify_model_data = df.drop(columns=['arr_delay', 'arr_delay_bool'])

In [45]:
x,y = classify_model_data,classify_model_labels
x_train,x_test,y_train,y_test = train_test_split(x,y)

In [46]:
model = xgb.XGBRegressor(
    objective='reg:logistic'
)

In [47]:
# Given the dataset size, this may take 1-2 minutes to run
model.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:logistic', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [48]:
y_pred = model.predict(x_test)

In [49]:
acc = accuracy_score(y_test, np.round(y_pred))
print(acc)

0.964312383863293


In [50]:
# Save the model
model.save_model('model.bst')

### Deploying classification model to AI Platform

Replace `your-cloud-project` below with the name of your cloud project.

In [19]:
# Set your cloud project
PROJECT = 'your-cloud-project'
!gcloud config set project $PROJECT

Updated property [core/project].


In [20]:
BUCKET = PROJECT + 'flight_model_bucket'

In [None]:
# Create a bucket if you don't have one
# You only need to run this once
!gsutil mb gs://$BUCKET

In [21]:
!gsutil cp 'model.bst' gs://$BUCKET

Copying file://model.bst [Content-Type=application/octet-stream]...
/ [1 files][ 66.7 KiB/ 66.7 KiB]                                                
Operation completed over 1 objects/66.7 KiB.                                     


In [25]:
# Create the model resource
!gcloud ai-platform models create flight_delay_prediction

Using endpoint [https://ml.googleapis.com/]

Learn more about regional endpoints and see a list of available regions: https://cloud.google.com/ai-platform/prediction/docs/regional-endpoints
Created ml engine model [projects/sara-cloud-ml/models/flight_delay_prediction].


In [26]:
# Create the version
!gcloud ai-platform versions create 'v1' \
  --model 'flight_delay_prediction' \
  --origin gs://$BUCKET \
  --runtime-version=1.15 \
  --framework 'XGBOOST' \
  --python-version=3.7

Using endpoint [https://ml.googleapis.com/]


In [33]:
# Get a prediction on the first example from our test set
!rm input.json
num_examples = 10
with open('input.json', 'a') as f:
  for i in range(num_examples):
    f.write(str(x_test.iloc[i].values.tolist()))
    f.write('\n')

rm: cannot remove 'input.json': No such file or directory


In [34]:
!cat input.json

[-4.0, 11.0, 591.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[-6.0, 7.0, 237.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

In [35]:
# Make a prediction to the deployed model
!gcloud ai-platform predict --model 'flight_delay_prediction' --version \
  'v1' --json-instances 'input.json'

Using endpoint [https://ml.googleapis.com/]
[0.0028497891034930944, 0.0024622047785669565, 0.004566137678921223, 0.23578588664531708, 0.019520383328199387, 0.0029183183796703815, 0.014778893440961838, 0.004592736717313528, 0.011670581065118313, 0.04867469519376755]


In [36]:
# Compare this with actual values
print(y_test.iloc[:5])

122790    0.0
279363    0.0
200351    0.0
233004    0.0
196987    0.0
Name: arr_delay_bool, dtype: float64


## Model version #2: predict exact delay amount (regression)

In [53]:
regression_model_labels = df['arr_delay']
regression_model_data = df.drop(columns=['arr_delay', 'arr_delay_bool'])

In [54]:
x,y = regression_model_data,regression_model_labels
x_train,x_test,y_train,y_test = train_test_split(x,y)

In [71]:
model = xgb.XGBRegressor(
    objective='reg:linear'
)

In [None]:
# This will take 1-2 minutes to run
model.fit(x_train, y_train)

In [73]:
y_pred = model.predict(x_test)

In [None]:
for i,val in enumerate(y_pred[:10]):
  print(val)
  print(y_test.iloc[i])
  print()

In [75]:
model.save_model('model.bst')

In [76]:
!gsutil cp model.bst gs://$BUCKET/regression/

Copying file://model.bst [Content-Type=application/octet-stream]...
/ [1 files][ 67.8 KiB/ 67.8 KiB]                                                
Operation completed over 1 objects/67.8 KiB.                                     


In [79]:
# Create the version
!gcloud ai-platform versions create 'v_regression' \
  --model 'flight_delay_prediction' \
  --origin gs://$BUCKET/regression \
  --runtime-version=1.15 \
  --framework 'XGBOOST' \
  --python-version=3.7

Using endpoint [https://ml.googleapis.com/]


In [None]:
!gcloud ai-platform predict --model 'flight_delay_prediction' --version \
  'v_regression' --json-instances 'input.json'

Copyright 2020 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License