# This Prod notebook builds and deploys a simple OLS model to predict baby weight

## Outline:
1. Ingest data using BigQuery API.
2. Clean the data.
3. Build model.
4. Deploy the model.

In [1]:
# --- Run bookkeeping ---
# Integer suffix used in the Vertex AI model/endpoint display names.
# Every run of this notebook creates and deploys a new model,
# so increment this value by 1 before each run.
notebook_run_id = 1

# --- Google Cloud settings ---
project_name = 'My First Project'
project_id = 'quantum-keep-360100'
regionn = 'us-central1'

# --- Naming of the ML project and model ---
ml_project_name = 'natality'
model_name = 'OLS'

In [2]:
import json
import os
import pickle
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from google.cloud import aiplatform, bigquery, storage
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Run from the project root so that paths built from os.getcwd() later on
# resolve under this directory.
project_root = '/home/jupyter/projects_gcp'
os.chdir(project_root)

# Wall-clock reference point for the timing printed after model evaluation.
time0 = time.time()

In [3]:
# Pull the label (weight_pounds) and four predictor columns from the public
# natality sample table, restricted to births after 2000 and capped at 10,000 rows.
query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 10000
"""
# Run the query with a default-configured client and load the result into pandas.
bq_client = bigquery.Client()
df = bq_client.query(query).to_dataframe()
display(df.shape, df.head())

(10000, 5)

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,9.312326,False,28,1,40.0
1,7.749249,True,30,1,40.0
2,7.394304,True,27,1,39.0
3,6.750554,False,40,1,41.0
4,8.377566,True,24,1,38.0


In [4]:
# Clean the raw frame and split it into train/test sets.
#
# Rows with missing values are dropped BEFORE the boolean `is_male` column is
# cast to int: casting a column that still contains nulls would raise.
# .assign() builds the converted column on the new frame returned by dropna(),
# so no column is written into a possibly-derived frame.
df = df.dropna().assign(is_male=lambda frame: frame['is_male'].astype(int))
df = shuffle(df, random_state=2)

# Target is the birth weight; every remaining column is a feature.
labels = df['weight_pounds']
data = df.drop(columns=['weight_pounds'])
x, y = data, labels

# Seed the split so the train/test partition (and the reported RMSE values)
# are reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2)

In [5]:
# Fit an ordinary-least-squares regression on the training split and report
# how long the fit took (seconds).
time1 = time.time()

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(x_train, y_train)

# Second name bound to the same fitted estimator (an alias, not an independent copy).
model_copied = model

print(time.time()-time1)

0.004226207733154297


In [6]:
# Predictions for the held-out split; reused below for the test RMSE.
y_pred = model.predict(x_test)

# Spot-check: show the first two predictions next to the true weights.
for row in range(2):
    predicted, actual = y_pred[row], y_test.iloc[row]
    print('Predicted weight: ', predicted)
    print('Actual weight: ', actual)
    print()

# Root-mean-squared error on both splits.
train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(x_train)))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('train rmse: ', train_rmse)
print('test rmse: ', test_rmse)

# Elapsed time is measured from time0, which is set before the data is queried,
# so this figure covers ingestion and cleaning as well as the fit itself.
print('model training time: ', time.time() - time0)

Predicted weight:  6.249361
Actual weight:  7.43839671988

Predicted weight:  7.660574
Actual weight:  8.062304921339999

train rmse:  1.0566742247561989
test rmse:  1.058265231151202
model training time:  2.410808563232422


### 4. Model Deployment

I based the code below on https://supertype.ai/notes/deploying-machine-learning-models-with-vertex-ai-on-google-cloud-platform/

In [7]:
# Start of the deployment timer; the elapsed time is printed at the end of the notebook.
deployment_time_start = time.time()

# Local directory for the serialized model (kept as a string with a trailing
# slash because later code concatenates the file name onto it).
model_path = os.getcwd()+'/natality/artifacts/model_ols/'

# Make sure the directory exists: open(..., 'wb') does not create missing
# parent directories and would raise FileNotFoundError otherwise.
os.makedirs(model_path, exist_ok=True)

# Pickle the model to the local filesystem; this local file is what gets
# uploaded to Cloud Storage afterwards.
artifact_filename = 'model.pkl'
with open(model_path+artifact_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [8]:
# Upload the pickled model artifact to Cloud Storage.
# Change model_bucket to your own GCS bucket URI.
model_bucket = 'gs://pmykola-projectsgcp-artifacts/natality-ols'

# Build the object URI with an explicit '/': os.path.join is meant for
# filesystem paths and uses the OS-specific separator, which is not
# guaranteed to be '/' as a gs:// URI requires.
storage_path = model_bucket.rstrip('/') + '/' + artifact_filename

# The storage client is created for the configured project_id.
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(model_path+artifact_filename)

In [9]:
# Set project and region once so they do not have to be passed explicitly to
# every AI Platform (Vertex AI) call.
aiplatform.init(project=project_id, location=regionn)

# Display name/description: <ml project><model><run id>, e.g. 'natalityOLS1'.
model_display_name = ml_project_name+model_name+str(notebook_run_id)

# Register the artifact stored under model_bucket as a Vertex AI model served
# by the prebuilt scikit-learn prediction container.
# NOTE: this rebinds `model` from the fitted scikit-learn estimator to the
# Vertex AI Model resource.
model = aiplatform.Model.upload(
    display_name = model_display_name,
    description = model_display_name,
    artifact_uri = model_bucket,
    serving_container_image_uri = 'us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest'
)

Creating Model
Create Model backing LRO: projects/234443118908/locations/us-central1/models/8830498940064366592/operations/8526863650093268992
Model created. Resource name: projects/234443118908/locations/us-central1/models/8830498940064366592@1
To use this Model in another session:
model = aiplatform.Model('projects/234443118908/locations/us-central1/models/8830498940064366592@1')


In [10]:
# Create the endpoint that the model is deployed to in the next step
# (that step passes `endpoint` explicitly, so this cell must run first).
endpoint = aiplatform.Endpoint.create(display_name = ml_project_name+model_name+str(notebook_run_id), 
                                      project = project_id, 
                                      location = regionn)

# The resource name has the form 'projects/<n>/locations/<region>/endpoints/<id>';
# the endpoint ID is its last path segment. (The previous slice [-19:0] always
# evaluated to an empty string.)
endpoint_id = endpoint.resource_name.split('/')[-1]

Creating Endpoint
Create Endpoint backing LRO: projects/234443118908/locations/us-central1/endpoints/3851567241866772480/operations/1904601873021665280
Endpoint created. Resource name: projects/234443118908/locations/us-central1/endpoints/3851567241866772480
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/234443118908/locations/us-central1/endpoints/3851567241866772480')


In [11]:
# Deploy the uploaded model to the endpoint created above.
# If the endpoint parameter is not specified, a new endpoint will be created.
# This step is slow: on a weak machine it runs for 3-6 minutes, and it may take
# even longer if the VM runs more jobs or the model is complex.
model.deploy(
    endpoint=endpoint,
    machine_type='n1-standard-2',
)

Deploying model to Endpoint : projects/234443118908/locations/us-central1/endpoints/3851567241866772480
Deploy Endpoint model backing LRO: projects/234443118908/locations/us-central1/endpoints/3851567241866772480/operations/7092467168775766016
Endpoint model deployed. Resource name: projects/234443118908/locations/us-central1/endpoints/3851567241866772480


<google.cloud.aiplatform.models.Endpoint object at 0x7fdd93e92450> 
resource name: projects/234443118908/locations/us-central1/endpoints/3851567241866772480

In [12]:
# Smoke-test the deployed model with a single instance
# (feature order follows the training columns: is_male, mother_age, plurality, gestation_weeks).
display(endpoint.predict(instances=[[1.0,15.0,1.0,39.0]]))

# Endpoint ID = last path segment of the resource name. Splitting on '/' avoids
# assuming the ID is exactly 19 characters long, as a fixed-width slice does.
endpoint_id = endpoint.resource_name.split('/')[-1]
display(endpoint_id)

Prediction(predictions=[7.283284574747086], deployed_model_id='4796531515242577920', model_version_id='1', model_resource_name='projects/234443118908/locations/us-central1/models/8830498940064366592', explanations=None)

In [14]:
import json

payload = {'instances': [[1.0,15.0,1.0,39.0], [1.0,25.0,1.0,39.0]]}

# Parse JSON
with open('request.json', 'w') as outfile:
    json.dump(payload, outfile)

!gcloud ai endpoints predict $endpoint_id \
  --region=$regionn \
  --json-request=request.json

Using endpoint [https://us-central1-prediction-aiplatform.googleapis.com/]
[7.283284574747086, 7.462946087121964]


In [15]:
# Wall-clock seconds elapsed since deployment_time_start was recorded.
deployment_seconds = time.time() - deployment_time_start
print('Model deployment time: ', deployment_seconds)

Model deployment time:  819.6511216163635
