# Item-to-item Recommendation using Cooccurrence and Matrix Factorization (Part 3)

This tutorial shows how to use the Matrix Factorization algorithm in BigQuery ML to generate embeddings for items based on their cooccurrence statistics. The generated item embeddings can then be used to find similar items.

Part 3 covers serving the embeddings in Cloud AI Platform Prediction as an item-embedding lookup.



## Setup

In [None]:
# Upgrade pip itself before installing the packages below.
!pip install -q -U pip
# Pin TensorFlow to 2.2.0; the deployment cell later uses --runtime-version=2.2.
!pip install -q tensorflow==2.2.0
# Google auth and API client libraries; the "Test the deployed model" cell
# imports googleapiclient and google.api_core.
!pip install -q -U google-auth google-api-python-client google-api-core

### Import libraries

In [None]:
import os
import tensorflow as tf
import numpy as np
print(f'Tensorflow version: {tf.__version__}')

### Configure GCP environment settings

In [None]:
# GCP project used by gcloud and by the prediction request; replace with your
# own project ID.
PROJECT_ID = 'ksalama-cloudml'
# Cloud Storage bucket name (without the gs:// prefix) used to build the paths
# below; replace with your own bucket.
BUCKET = 'ksalama-cloudml'
# Region used for the AI Platform model, its versions, and the API endpoint.
REGION = 'us-central1'
# AI Platform model name and the version name deployed under it.
MODEL_NAME = 'songs_embeddings'
MODEL_VERSION = 'v1'
# Glob pattern of the embedding CSV files read by EmbeddingModel.
EMBEDDING_FILES_PREFIX = f'gs://{BUCKET}/bqml/embeddings-*'
# Directory the SavedModel is exported to and later deployed from.
OUTPUT_MODEL_DIR = f'gs://{BUCKET}/bqml/embedding_model'

# Make the project above the default for subsequent gcloud commands.
!gcloud config set project $PROJECT_ID

### Authenticate your GCP account
This is required if you run the notebook in Colab.

In [None]:
# Authenticate only when running inside Colab. Outside Colab the
# `google.colab` package does not exist, so the import fails and the step is
# skipped; the environment's existing credentials are used instead.
try:
  from google.colab import auth
except ImportError:
  print("Not running in Colab; skipping Colab authentication.")
else:
  # Keep the step best-effort (do not abort the notebook), but report the
  # failure instead of hiding it, since later cells need valid credentials.
  try:
    auth.authenticate_user()
  except Exception as error:
    print(f"Colab authentication failed: {error}")
  else:
    print("Colab user is authenticated.")

## Serve an Embedding Lookup Model on AI Platform Prediction

### Implement EmbeddingModel with Keras

In [None]:
class EmbeddingModel(tf.keras.Model):
  """Embedding lookup model built from item-embedding CSV files.

  Each CSV row is expected to look like ``item,v1,v2,...``. The model keeps the
  embeddings in a matrix, maps item strings to row indices with a static hash
  table, and returns, for every input string, the mean of the embeddings of its
  whitespace-separated tokens. Tokens that are not in the vocabulary map to an
  extra all-zero row appended at the end of the matrix.
  """

  def __init__(self, embedding_files_prefix, **kwargs):
    """Loads the embeddings and builds the lookup structures.

    Side effect: writes the vocabulary to ``vocabulary.txt`` in the current
    working directory and attaches it to the model as a SavedModel asset.

    Args:
      embedding_files_prefix: File pattern passed to ``tf.io.gfile.glob`` to
        find the embedding CSV files.
      **kwargs: Forwarded to ``tf.keras.Model.__init__``.

    Raises:
      ValueError: If no embedding could be loaded, or if the loaded embeddings
        do not all have the same number of values.
    """
    super().__init__(**kwargs)

    vocabulary, embeddings = self._load_embeddings(embedding_files_prefix)

    # Append an all-zero row at index len(vocabulary); the hash table below
    # returns that index for out-of-vocabulary tokens.
    embedding_size = len(embeddings[0])
    oov_embedding = np.zeros((1, embedding_size))
    self.embeddings = np.append(np.array(embeddings), oov_embedding, axis=0)
    print(f'Embeddings: {self.embeddings.shape}')

    # Write vocabulary file.
    print('Writing vocabulary to file ...')
    # Explicit encoding so the file does not depend on the machine's locale.
    with open('vocabulary.txt', 'w', encoding='utf-8') as f:
      f.writelines(f'{item}\n' for item in vocabulary)
    print('Vocabulary file written and will be added as a model asset.')

    self.vocabulary_file = tf.saved_model.Asset('vocabulary.txt')

    # Item string -> row index; unknown items get the index of the OOV row.
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys=vocabulary, values=list(range(len(vocabulary))))
    self.token_to_id = tf.lookup.StaticHashTable(
        initializer, default_value=len(vocabulary))

  @staticmethod
  def _load_embeddings(embedding_files_prefix):
    """Reads (vocabulary, embeddings) from the CSV files matching the pattern."""
    vocabulary = []
    embeddings = []
    skipped_rows = 0

    # Read embeddings from csv files.
    print('Loading embeddings from files ...')
    for embedding_file in tf.io.gfile.glob(embedding_files_prefix):
      print(f'Loading embeddings in {embedding_file} ...')
      with tf.io.gfile.GFile(embedding_file, 'r') as lines:
        for line in lines:
          item, *values = line.split(',')
          try:
            embedding = np.array([float(v) for v in values])
          except ValueError:
            # Non-numeric row (for example a CSV header): skip it.
            skipped_rows += 1
            continue
          if not embedding.size:
            # Blank line or a row with an item but no values: skip it rather
            # than registering an item with an empty embedding.
            skipped_rows += 1
            continue
          vocabulary.append(item)
          embeddings.append(embedding)

    print('Embeddings loaded.')
    if skipped_rows:
      print(f'Skipped {skipped_rows} row(s) that could not be parsed.')

    if not embeddings:
      raise ValueError(
          f'No embeddings were loaded from {embedding_files_prefix!r}.')
    embedding_size = len(embeddings[0])
    if any(len(embedding) != embedding_size for embedding in embeddings):
      raise ValueError(
          'All embeddings must have the same number of values '
          f'({embedding_size} expected).')

    return vocabulary, embeddings

  @tf.function(input_signature=[tf.TensorSpec([None], tf.string)])
  def __call__(self, inputs):
    """Returns one embedding per input string.

    Args:
      inputs: 1-D string tensor; each element holds one or more item tokens
        separated by whitespace.

    Returns:
      The result of ``tf.nn.embedding_lookup_sparse`` with the "mean"
      combiner, i.e. the average embedding of the tokens of each input.
    """
    # NOTE(review): an input with no tokens (e.g. an empty string) contributes
    # no ids; confirm how the sparse lookup shapes its output in that case.
    tokens = tf.strings.split(inputs, sep=None).to_sparse()
    ids = self.token_to_id.lookup(tokens)
    embeddings = tf.nn.embedding_lookup_sparse(
        params=self.embeddings,
        sp_ids=ids,
        sp_weights=None,
        combiner="mean"
    )
    return embeddings

### Save the model with Serving Signatures

In [None]:
# Build the lookup model: reads every file matching EMBEDDING_FILES_PREFIX and
# writes vocabulary.txt to the current working directory.
embedding_model = EmbeddingModel(EMBEDDING_FILES_PREFIX)

In [None]:
# Clear any previous export so the directory only holds the new SavedModel.
if tf.io.gfile.exists(OUTPUT_MODEL_DIR):
  print("Removing {} contents...".format(OUTPUT_MODEL_DIR))
  tf.io.gfile.rmtree(OUTPUT_MODEL_DIR)

# Expose the traced __call__ as the default serving signature.
signatures = dict(
    serving_default=embedding_model.__call__.get_concrete_function())

tf.saved_model.save(
    embedding_model,
    export_dir=OUTPUT_MODEL_DIR,
    signatures=signatures,
)

In [None]:
# Show the inputs and outputs of the exported serving_default signature.
!saved_model_cli show --dir {OUTPUT_MODEL_DIR} --tag_set serve --signature_def serving_default

In [None]:
# Reload the exported SavedModel so the next cell tests the saved artifact
# rather than the in-memory Keras object.
loaded_model = tf.saved_model.load(OUTPUT_MODEL_DIR)

In [None]:
# Each input string is split on whitespace into item tokens; the model returns
# one embedding per input string (the mean over its tokens).
input_items = ['2114406', '2114402 2120788', 'abc123']
output = loaded_model(input_items)
print(output.shape)
# 'abc123' is presumably not in the vocabulary, in which case this row is the
# all-zero out-of-vocabulary embedding.
print(output[2])

### Deploy the model to AI Platform Prediction

In [None]:
# Create the AI Platform model resource that the version below is deployed under.
!gcloud ai-platform models create {MODEL_NAME} --region={REGION}

In [None]:
# Deploy the SavedModel in OUTPUT_MODEL_DIR as version MODEL_VERSION of
# MODEL_NAME. The runtime version (2.2) matches the TensorFlow version
# installed in the Setup section.
!gcloud ai-platform versions create {MODEL_VERSION} \
  --region={REGION} \
  --model={MODEL_NAME} \
  --origin={OUTPUT_MODEL_DIR} \
  --runtime-version=2.2 \
  --framework=TensorFlow \
  --python-version=3.7 \
  --machine-type=n1-standard-2

# NOTE(review): this message is printed unconditionally; check the gcloud
# output above to confirm the deployment actually succeeded.
print("The model version is deployed to AI Platform Prediciton.")

In [None]:
# List the versions of the model to confirm the new version is present.
!gcloud ai-platform versions list --model={MODEL_NAME} --region={REGION}

### Test the deployed model

In [None]:
import googleapiclient.discovery
from google.api_core.client_options import ClientOptions

# Regional AI Platform endpoint for REGION, the region the model was created in.
api_endpoint = f'https://{REGION}-ml.googleapis.com'
client_options = ClientOptions(api_endpoint=api_endpoint)
# Client for the 'ml' v1 API, pointed at the regional endpoint above.
service = googleapiclient.discovery.build(
    serviceName='ml', version='v1', client_options=client_options)
# Resource name of the deployed model version, used in predict requests.
name = f'projects/{PROJECT_ID}/models/{MODEL_NAME}/versions/{MODEL_VERSION}'
print(f'Service name: {name}')

def caip_embedding_lookup(input_items):
  """Requests embeddings for the given items from the deployed model version.

  Uses the module-level `service` client and the `name` of the model version.

  Args:
    input_items: List of strings sent as the 'instances' of the request.

  Returns:
    The 'predictions' entry of the service response.

  Raises:
    RuntimeError: If the response contains an 'error' entry.
  """
  predict_request = service.projects().predict(
      name=name, body={'instances': input_items})
  result = predict_request.execute()

  if 'error' not in result:
    return result['predictions']

  raise RuntimeError(result['error'])

In [None]:
# Query the deployed model with the same inputs used for the local test and
# print the first five values of each returned embedding.
input_items = ['2114406', '2114402 2120788', 'abc123']

embeddings = caip_embedding_lookup(input_items)
print(f'Embeddings retrieved: {len(embeddings)}')
for position, vector in enumerate(embeddings):
  item = input_items[position]
  print(f'{item}: {vector[:5]}')

## License

Copyright 2020 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

See the License for the specific language governing permissions and limitations under the License.

**This is not an official Google product but sample code provided for an educational purpose**