# checkpoint loading

In [5]:
import tensorflow as tf
import tensorflow_recommenders as tfrs

import os, sys
sys.path.append( os.path.join('..', '..') )

from trainer import model, util

In [34]:
model_dir="gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1"

batch_size=30000

data_mng = util.DataManager(
    train_path="gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/train.csv", 
    test_path="gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/evalset.csv", 
    user_features=["Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years", "Marital_Status"], 
    product_id_col_name=["Product_ID", "Product_Category_1"],
    train_batch_size = batch_size, 
    test_batch_size = batch_size,    
    shuffle_buffer_size = batch_size, 
    cache_test_set = False
)

bf_model = model.create_bf_model(
    user_features=data_mng.user_unique_values,
    product_features=data_mng.product_unique_values,
    embedding_dim=128,
    learning_rate=0.01,
    topk_candidates=data_mng.tf_product_unique_records,
    topk_metric_batch_size=batch_size,
    user_layers=[512, 256, 128],
    product_layers=[512, 256, 128],
    user_input_embedding_dim=16,
    user_input_embedding_l1=0.0,
    user_input_embedding_l2=0.0,
    product_input_embedding_dim=128,
    product_input_embedding_l1=0.0,
    product_input_embedding_l2=0.0,
    temperature=1
)

Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


In [35]:
bf_model.optimizer.iterations.numpy()

0

In [36]:
latest = tf.train.latest_checkpoint(model_dir)
latest

'gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1/model_checkpoints'

In [37]:
bf_model.load_weights(latest)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbb82d2b810>

In [38]:
bf_model.optimizer.iterations.numpy()

1500

In [14]:
tf.train.latest_checkpoint(model_dir+"/model_checkpoints") is None

True

# resuming a previous training

In [41]:
!gsutil -m cp -r gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1 gs://mlteam-ml-specialization-2021-blackfriday/tmp/012_hptuning_deep/

Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1/Scann/variables/variables.data-00000-of-00001...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1/Scann/variables/variables.index...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1/candidate/variables/variables.index...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1/Scann/saved_model.pb...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1/candidate/variables/variables.data-00000-of-00001...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1/candidate/saved_model.pb...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1/checkpoint...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1/model_checkpoints.data-00000-o

In [51]:
import tensorflow as tf

In [68]:
p="gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1/train/"
eventsfile=None
try:
    eventsfile=[os.path.join(p,x) for x in tf.io.gfile.listdir(p) if x.startswith("events")][0]
except:
    pass

In [69]:
eventsfile

'gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/012_hptuning_deep/1/train/events.out.tfevents.1622812818.cmle-training-15391287979773683023.389.566.v2'

In [73]:
from tensorflow.python.summary.summary_iterator import summary_iterator
max_step=0
for i,summary in enumerate(summary_iterator(eventsfile)):
    if summary.step > max_step:
        max_step=summary.step
i,max_step

(4002, 499)

In [47]:
a.step

499

In [20]:
import json

os.environ["USER_FEATURES"] = json.dumps(["Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years", "Marital_Status"])
os.environ["PRODUCT_FEATURES"] = json.dumps(["Product_ID", "Product_Category_1"])

In [None]:
%%bash

current_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
cd ${current_dir}"/../.."

python -m trainer.task \
  --train-path="gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/train.csv" \
  --eval-path="gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/evalset.csv" \
  --job-dir="gs://mlteam-ml-specialization-2021-blackfriday/tmp/012_hptuning_deep/" \
  --batch-size=30000 \
  --scann-num-neighbors=100 \
  --user-features='["Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years", "Marital_Status"]' \
  --num-epochs=10 \
  --learning-rate=0.01 \
  --product-input-embedding-dim=128 \
  --common-layers="[512, 256, 128]" \
  --product-features='["Product_ID", "Product_Category_1"]' \
  --user-input-embedding-dim=16 \
  --embedding-dim=128


In [74]:
%%bash

# JOB_NAME: the name of your job running on AI Platform.
JOB_NAME=bf_012_TEST_$(date +%Y%m%d_%H%M%S)

# REGION: select a region from https://cloud.google.com/ai-platform/training/docs/regions
# or use the default '`us-central1`'. The region is where the model will be deployed.
REGION=europe-west1
PYTHON_VERSION=3.7
RUNTIME_VERSION=2.4

current_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
cd ${current_dir}"/../.."

gcloud ai-platform jobs submit training "${JOB_NAME}" \
  --package-path trainer/ \
  --module-name trainer.task \
  --region ${REGION} \
  --python-version ${PYTHON_VERSION} \
  --runtime-version ${RUNTIME_VERSION} \
  --job-dir "gs://mlteam-ml-specialization-2021-blackfriday/tmp/012_hptuning_deep/" \
  -- \
  --train-path="gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/train.csv" \
  --eval-path="gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/evalset.csv" \ \
  --job-dir="gs://mlteam-ml-specialization-2021-blackfriday/tmp/012_hptuning_deep/" \
  --batch-size=30000 \
  --scann-num-neighbors=100 \
  --user-features="${USER_FEATURES}" \
  --num-epochs=10 \
  --learning-rate=0.01 \
  --product-input-embedding-dim=128 \
  --common-layers="[512, 256, 128]" \
  --product-features="${PRODUCT_FEATURES}" \
  --user-input-embedding-dim=16 \
  --embedding-dim=128

gcloud ai-platform jobs describe ${JOB_NAME}

jobId: bf_012_TEST_20210618_132325
state: QUEUED
createTime: '2021-06-18T13:23:27Z'
etag: L63F4olbRgs=
jobId: bf_012_TEST_20210618_132325
state: PREPARING
trainingInput:
  args:
  - --train-path=gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/train.csv
  - --eval-path=gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/evalset.csv
  - ' '
  - --job-dir=gs://mlteam-ml-specialization-2021-blackfriday/tmp/012_hptuning_deep/
  - --batch-size=30000
  - --scann-num-neighbors=100
  - --user-features=["Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years",
    "Marital_Status"]
  - --num-epochs=10
  - --learning-rate=0.01
  - --product-input-embedding-dim=128
  - --common-layers=[512, 256, 128]
  - --product-features=["Product_ID", "Product_Category_1"]
  - --user-input-embedding-dim=16
  - --embedding-dim=128
  jobDir: gs://mlteam-ml-specialization-2021-blackfriday/tmp/012_hptuning_deep/
  packageUris:
  - gs://mltea

Job [bf_012_TEST_20210618_132325] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe bf_012_TEST_20210618_132325

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs bf_012_TEST_20210618_132325

View job in the Cloud Console at:
https://console.cloud.google.com/mlengine/jobs/bf_012_TEST_20210618_132325?project=mlteam-ml-specialization-2021

View logs at:
https://console.cloud.google.com/logs?resource=ml_job%2Fjob_id%2Fbf_012_TEST_20210618_132325&project=mlteam-ml-specialization-2021
