# Item-to-item Recommendation using Cooccurrence and Matrix Factorization (Part 2)

This tutorial shows how to use the Matrix Factorization algorithm in BigQuery ML to generate embeddings for items based on their cooccurrence statistics. The generated item embeddings can then be used to find similar items.

Part 2 covers exporting the trained embeddings from the Matrix Factorization BigQuery ML Model to Cloud Storage.


## Setup

In [None]:
!pip install -U -q apache-beam[gcp]

### Import libraries

In [None]:
import os
import numpy as np
import tensorflow.io as tf_io
import apache_beam as beam
from datetime import datetime

### Configure GCP environment settings

In [None]:
# GCP environment settings. These are sample values: replace them with your own.
# NOTE: the %%bigquery cells below hard-code the `item_recommendations` dataset
# instead of reading BQ_DATASET_NAME, so keep the two in sync if you change it.
PROJECT_ID = 'ksalama-cloudml'  # Project used by gcloud, the BigQuery magics and the pipeline args.
BUCKET = 'ksalama-cloudml'  # Cloud Storage bucket name (no gs:// prefix); OUTPUT_DIR is built from it.
REGION = 'europe-west2'  # Passed to the pipeline as its 'region' option.
BQ_DATASET_NAME = 'item_recommendations'  # Dataset the Beam pipeline query reads from.
BQ_TABLE_NAME = 'playlists'  # Not referenced by any other cell visible in this notebook.

!gcloud config set project $PROJECT_ID

### Authenticate your GCP account
This is required if you run the notebook in Colab.

In [None]:
# Authenticate only when the notebook runs inside Colab. Outside Colab the
# google.colab package cannot be imported and this cell does nothing.
try:
  from google.colab import auth
except ImportError:
  # Only a missing google.colab package is ignored; a bare `except` would
  # also hide KeyboardInterrupt and unrelated errors.
  pass
else:
  try:
    auth.authenticate_user()
    print("Colab user is authenticated.")
  except Exception as error:
    # Keep the cell best-effort, but report the failure so that later
    # permission errors are not a surprise.
    print(f"Colab authentication failed: {error}")

## Export Trained Embeddings from BigQuery ML to Cloud Storage

In [None]:
%%bigquery --project $PROJECT_ID

-- Stored procedure that copies the weights of the trained model
-- `item_recommendations.item_embedding_cooc` into the table
-- `item_recommendations.item_embeddings` (replacing it if it already exists).
-- NOTE: "Exract" is a misspelling of "Extract"; the CALL cell uses the same
-- spelling, so rename both together if you correct it.
CREATE OR REPLACE PROCEDURE item_recommendations.sp_ExractEmbeddings() 
BEGIN
CREATE OR REPLACE TABLE  item_recommendations.item_embeddings AS
SELECT 
  -- `feature` is exposed as the item id and `processed_input` as `axis`
  -- (presumably the model input column the row belongs to; confirm against
  -- the ML.WEIGHTS reference).
  feature AS item_Id,
  processed_input AS axis,
  factor_weights,
  intercept
FROM
  ML.WEIGHTS(MODEL `item_recommendations.item_embedding_cooc`)
-- Skip the row that holds the model's global intercept; it is not an item.
WHERE feature != 'global__INTERCEPT__';
END

In [None]:
%%bigquery --project $PROJECT_ID

-- Run the stored procedure to (re)create item_recommendations.item_embeddings.
CALL item_recommendations.sp_ExractEmbeddings() 

### Implement Beam pipeline

In [None]:
def get_query(dataset_name, table_name):
  """Builds the SQL text that selects the embedding columns of a table.

  Args:
    dataset_name: BigQuery dataset that holds the table.
    table_name: Name of the table to read.

  Returns:
    A query string selecting item_Id, axis and factor_weights from
    `dataset_name.table_name`.
  """
  table_ref = f'`{dataset_name}.{table_name}`'
  # Spelled out line by line so the indentation and trailing blanks of the
  # generated statement are explicit.
  sql_lines = (
      '',
      '    SELECT ',
      '      item_Id,',
      '      axis,',
      '      factor_weights',
      '    FROM ',
      f'      {table_ref}',
      '  ',
  )
  return '\n'.join(sql_lines)


def parse_embeddings(bq_record):
  """Converts one BigQuery row into an (item_Id, embedding) pair.

  Only 'item_Id' and 'factor_weights' are read. The query produced by
  get_query does not select 'intercept', so looking that key up here raised
  KeyError for every row; the value was never used anyway.

  Args:
    bq_record: Mapping with an 'item_Id' value and a 'factor_weights'
      sequence. Each entry of the sequence is a mapping with a 1-based
      'factor' index and a 'weight'.

  Returns:
    A tuple (item_Id, embedding), where embedding is a list of floats with
    the weight of factor k stored at position k - 1.

  Raises:
    KeyError: If 'item_Id' or 'factor_weights' is missing from the record.
  """
  item_Id = bq_record['item_Id']
  factor_weights = bq_record['factor_weights']

  # Place each weight by its factor index rather than by arrival order, so
  # the result does not depend on how the entries are sorted.
  embedding = [0.0] * len(factor_weights)
  for entry in factor_weights:
    embedding[int(entry['factor']) - 1] = float(entry['weight'])

  return (item_Id, embedding)


def average_embedding(entry):
  """Collapses the embeddings grouped under one item into a single vector.

  Args:
    entry: A (item_id, embeddings) pair, where embeddings is an iterable of
      equally sized lists of numbers.

  Returns:
    A tuple (item_id, embedding). When exactly two embeddings are grouped,
    the result is their element-wise mean; otherwise the first embedding is
    returned as is.
  """
  item_id, grouped = entry
  vectors = list(grouped)

  # Anything other than a pair is passed through using its first vector.
  if len(vectors) != 2:
    return item_id, vectors[0]

  first, second = vectors
  mean_vector = [
      (first[position] + second[position]) / 2.0
      for position in range(len(first))
  ]
  return item_id, mean_vector


def to_csv(entry):
  """Formats an (item_Id, embedding) pair as one CSV line.

  Args:
    entry: A tuple (item_Id, embedding), where embedding is an iterable of
      values.

  Returns:
    A string with the item id in the first column followed by one column
    per embedding value, separated by commas.
  """
  item_Id, embedding = entry
  # Write the id once. Emitting it twice shifted every value by one column,
  # so a reader taking "first column = id, rest = vector" would get the id
  # as the first embedding component.
  return f'{item_Id},' + ','.join(str(value) for value in embedding)

def run_pipeline(args):
    """Runs the Beam pipeline that exports item embeddings as CSV files.

    The pipeline reads the rows returned by get_query from BigQuery, turns
    each row into an (item_Id, embedding) pair, groups the pairs by item,
    averages each group, and writes one CSV line per item to text files
    named '<output_dir>/embeddings*.csv'.

    Args:
      args: Dict of settings. The keys 'bq_dataset_name',
        'embeddings_table_name', 'output_dir' and 'project' are read here.
        The whole dict is also forwarded as keyword arguments to
        PipelineOptions, so it can carry further pipeline options.

    Raises:
      KeyError: If one of the four keys read here is missing from args.
    """
    bq_dataset_name = args['bq_dataset_name']
    embeddings_table_name = args['embeddings_table_name']
    output_dir = args['output_dir']
    project = args['project']

    query = get_query(bq_dataset_name, embeddings_table_name)
    output_prefix = os.path.join(output_dir, 'embeddings')

    pipeline_options = beam.options.pipeline_options.PipelineOptions(**args)
    with beam.Pipeline(options=pipeline_options) as pipeline:

      _ = (
        pipeline
        # Query the project given in args rather than a notebook-level
        # global, so the function honours the settings it is called with.
        # flatten_results=False keeps factor_weights as a nested list.
        | 'ReadFromBigQuery' >> beam.io.Read(beam.io.BigQuerySource(
            project=project, query=query, use_standard_sql=True, flatten_results=False))
        | 'ParseEmbeddings' >> beam.Map(parse_embeddings)
        | 'GroupByItem' >> beam.GroupByKey()
        | 'AverageItemEmbeddings' >> beam.Map(average_embedding)
        | 'ConvertToCsv' >> beam.Map(to_csv)
        | 'WriteToCloudStorage' >> beam.io.WriteToText(
            file_path_prefix = output_prefix,
            file_name_suffix = ".csv")
      )

### Run pipeline

In [None]:
# Settings for one run of the export pipeline.
runner = 'DataflowRunner'
# UTC timestamp (yymmddHHMMSS) embedded in the job name below.
timestamp = datetime.utcnow().strftime('%y%m%d%H%M%S')
# Table created by the sp_ExractEmbeddings stored procedure.
embeddings_table_name = 'item_embeddings'
# The cell that runs the pipeline deletes and recreates this location first.
OUTPUT_DIR = f'gs://{BUCKET}/bqml/'

job_name = f'ks-bqml-export-embeddings-{timestamp}'

# run_pipeline reads 'bq_dataset_name', 'embeddings_table_name', 'output_dir'
# and 'project' itself, and forwards the whole dict to PipelineOptions as
# keyword arguments.
args = {
    'job_name': job_name,
    'runner': runner,
    'bq_dataset_name': BQ_DATASET_NAME,
    'embeddings_table_name': embeddings_table_name,
    'output_dir': OUTPUT_DIR,
    'project': PROJECT_ID,
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'region': REGION,
}

print("Pipeline args are set.")

In [None]:
args

In [None]:
# Start from an empty output location.
# WARNING: rmtree removes everything under OUTPUT_DIR, not only files
# written by a previous run of this notebook.
if tf_io.gfile.exists(OUTPUT_DIR):
  print("Removing {} contents...".format(OUTPUT_DIR))
  tf_io.gfile.rmtree(OUTPUT_DIR)

print("Creating output: {}".format(OUTPUT_DIR))
tf_io.gfile.makedirs(OUTPUT_DIR)

print("Running pipeline...")
%time run_pipeline(args)
print("Pipeline is done.")

## License

Copyright 2020 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

See the License for the specific language governing permissions and limitations under the License.

**This is not an official Google product but sample code provided for an educational purpose**