# Item-to-item Recommendation using Cooccurrence and Matrix Factorization (prerequisite)

This tutorial shows how to use the Matrix Factorization algorithm in BigQuery ML to generate embeddings for items based on their cooccurrence statistics. The generated item embeddings can then be used to find similar items.

The prerequisites cover:

1. Copy the public `bigquery-samples.playlists.playlist` table to your own BigQuery dataset.
2. Export the songs information to Datastore.

## Setup

In [None]:
# Install (or upgrade) Apache Beam together with its GCP extras.
!pip install -q -U apache-beam[gcp]

### Import libraries

In [None]:
import os
from datetime import datetime
import apache_beam as beam
from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore

### Configure GCP environment settings

In [None]:
# Project, staging bucket (used for gs://{BUCKET}/dataflow_tmp) and the region
# passed to the Dataflow jobs.
# NOTE(review): these look like the author's own values — replace with yours.
PROJECT_ID = 'ksalama-cloudml'
BUCKET = 'ksalama-cloudml'
DF_REGION = 'us-central1'

# BigQuery destination dataset/table, dataset location, and Datastore kind.
# NOTE: the view-creation cell hard-codes `recommendations.playlist`, so keep
# BQ_DATASET_NAME / BQ_TABLE_NAME in sync with it if you change them.
BQ_DATASET_NAME = 'recommendations'
BQ_TABLE_NAME = 'playlist'
BQ_REGION = 'EU'
DS_KIND = 'song'

# Point the gcloud CLI at the same project.
!gcloud config set project $PROJECT_ID

### Authenticate your GCP account
This is required only if you run the notebook in Colab.

In [None]:
# Authenticate only when running inside Colab. Outside Colab the google.colab
# package is not importable, so the step is skipped (presumably relying on
# credentials already configured in the environment — confirm for your setup).
try:
  from google.colab import auth
except ImportError:
  # Not running in Colab: nothing to authenticate here.
  pass
else:
  # Let real authentication failures surface instead of silently ignoring them.
  auth.authenticate_user()
  print("Colab user is authenticated.")

## Copy the BigQuery Playlists data

### Create BQ Dataset

In [None]:
# Create the destination dataset PROJECT_ID:BQ_DATASET_NAME in BQ_REGION.
!bq mk --dataset --location={BQ_REGION} {PROJECT_ID}:{BQ_DATASET_NAME}

### Implement data copying Beam pipeline

In [None]:
def run_copy_bq_data_pipeline(args):
  """Copy the public playlist data into this project's BigQuery table.

  Reads distinct (playlist, track) rows from
  `bigquery-samples.playlists.playlist`, keeping only rows that have a track
  title and a positive track id, and writes them to
  PROJECT_ID:BQ_DATASET_NAME.BQ_TABLE_NAME. The table is created if needed and
  any existing content is replaced.

  Args:
    args: dict of Beam pipeline options; it is expanded as keyword arguments
      into PipelineOptions (e.g. runner, project, region, temp_location).
  """

  schema = 'list_Id:INT64, track_Id:INT64, track_title:STRING, track_artist:STRING'

  # Grouping by every selected column de-duplicates the rows.
  query = '''
    SELECT 
      id list_Id, 
      tracks_data_id track_Id, 
      tracks_data_title track_title,
      tracks_data_artist_name track_artist
    FROM `bigquery-samples.playlists.playlist`
    WHERE tracks_data_title IS NOT NULL AND tracks_data_id > 0
    GROUP BY list_Id, track_Id, track_title, track_artist;
  '''

  pipeline_options = beam.options.pipeline_options.PipelineOptions(**args)
  # The pipeline is executed when the `with` block exits.
  with beam.Pipeline(options=pipeline_options) as pipeline:

    _ = (
        pipeline
        # ReadFromBigQuery is the supported replacement for
        # Read(BigQuerySource(...)), which is deprecated since Beam 2.25.0.
        | 'ReadFromBigQuery' >> beam.io.ReadFromBigQuery(
            project=PROJECT_ID, query=query, use_standard_sql=True)
        | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            table=BQ_TABLE_NAME, dataset=BQ_DATASET_NAME, project=PROJECT_ID,
            schema=schema,
            create_disposition='CREATE_IF_NEEDED',
            write_disposition='WRITE_TRUNCATE'
        )
    )


### Run the Beam pipeline

In [None]:
from datetime import datetime, timezone

# NOTE(review): DATASET appears unused in this notebook; kept for compatibility.
DATASET = 'playlist'
# Same runner name as the Datastore export cell uses.
RUNNER = 'DataflowRunner'

# A UTC timestamp suffix keeps job names unique between runs.
# datetime.now(timezone.utc) replaces datetime.utcnow(), which is deprecated
# since Python 3.12; the formatted string is the same.
job_name = f'copy-bigquery-{datetime.now(timezone.utc).strftime("%y%m%d%H%M%S")}'

# Pipeline options consumed by run_copy_bq_data_pipeline.
args = {
    'job_name': job_name,
    'runner': RUNNER,
    'project': PROJECT_ID,
    'temp_location': f'gs://{BUCKET}/dataflow_tmp',
    'region': DF_REGION
}

print("Pipeline args are set.")

In [None]:
print("Running pipeline...")
# %time reports how long the pipeline call takes.
%time run_copy_bq_data_pipeline(args)
print("Pipeline is done.")

### Create a view to abstract the source table

In [None]:
%%bigquery  --project $PROJECT_ID

-- Expose the playlist table under generic group/item column names:
-- a playlist id becomes group_Id and a track id becomes item_Id.
-- NOTE: the dataset and table names are hard-coded here; they must match
-- BQ_DATASET_NAME and BQ_TABLE_NAME from the configuration cell.
CREATE OR REPLACE VIEW `recommendations.vw_item_groups`
AS
SELECT
  list_Id AS group_Id,
  track_Id AS item_Id
FROM  
  `recommendations.playlist` 

## Load the Tracks information to Datastore

### Implement Beam Pipeline

In [None]:
def create_entity(song_info, kind):
  """Build a Datastore entity for one song row.

  Args:
    song_info: dict describing a single track. It must contain a "track_Id"
      item, which becomes the id in the entity key; every other item becomes
      an entity property. The dict is not modified.
    kind: Datastore kind used in the entity key.

  Returns:
    An Entity keyed by [kind, track_Id] holding the remaining items as
    properties.

  Raises:
    KeyError: if song_info has no "track_Id" item.
  """

  from apache_beam.io.gcp.datastore.v1new.types import Entity
  from apache_beam.io.gcp.datastore.v1new.types import Key

  # Work on a shallow copy: Beam user functions must not mutate their input
  # elements, and pop() below would otherwise alter the incoming row.
  properties = dict(song_info)
  track_id = properties.pop("track_Id")

  song_entity = Entity(Key([kind, track_id]))
  song_entity.set_properties(properties)
  return song_entity

def run_export_to_datatore_pipeline(args):
    """Export one record per track from BigQuery to Datastore.

    Reads BQ_DATASET_NAME.BQ_TABLE_NAME, collapses it to a single row per
    track_Id (MAX picks one title and one artist when several exist), converts
    each row to an entity of kind DS_KIND via create_entity, and writes the
    entities to Datastore in PROJECT_ID.

    Args:
        args: dict of Beam pipeline options; it is expanded as keyword
            arguments into PipelineOptions (e.g. runner, project, region,
            temp_location).
    """

    # The copied table names its artist column `track_artist` (see the schema
    # in run_copy_bq_data_pipeline); it is exposed here as `artist`.
    query = f'''
      SELECT  
        track_Id, 
        MAX(track_title) track_title, 
        MAX(track_artist) artist
      FROM 
        `{BQ_DATASET_NAME}.{BQ_TABLE_NAME}`
      GROUP BY track_Id
    '''

    pipeline_options = beam.options.pipeline_options.PipelineOptions(**args)
    # The pipeline is executed when the `with` block exits.
    with beam.Pipeline(options=pipeline_options) as pipeline:

      _ = (
        pipeline
        # ReadFromBigQuery is the supported replacement for
        # Read(BigQuerySource(...)), which is deprecated since Beam 2.25.0.
        | 'ReadFromBigQuery' >> beam.io.ReadFromBigQuery(
            project=PROJECT_ID, query=query, use_standard_sql=True)
        | 'ConvertToDatastoreEntity' >> beam.Map(create_entity, DS_KIND)
        | 'WriteToDatastore' >> WriteToDatastore(project=PROJECT_ID)
      )


### Run pipeline

In [None]:
import os
from datetime import datetime, timezone

# NOTE(review): DATASET appears unused in this notebook; kept for compatibility.
DATASET = 'playlist'
RUNNER = 'DataflowRunner'

# A UTC timestamp suffix keeps job names unique between runs.
# datetime.now(timezone.utc) replaces datetime.utcnow(), which is deprecated
# since Python 3.12; the formatted string is the same.
job_name = f'load-datastore-{datetime.now(timezone.utc).strftime("%y%m%d%H%M%S")}'

# Pipeline options consumed by run_export_to_datatore_pipeline.
args = {
    'job_name': job_name,
    'runner': RUNNER,
    'project': PROJECT_ID,
    'temp_location': f'gs://{BUCKET}/dataflow_tmp',
    'region': DF_REGION
}

print("Pipeline args are set.")

In [None]:
print("Running pipeline...")
# %time reports how long the pipeline call takes.
%time run_export_to_datatore_pipeline(args)
print("Pipeline is done.")

## License

Copyright 2020 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

See the License for the specific language governing permissions and limitations under the License.

**This is not an official Google product but sample code provided for an educational purpose**