# Item-to-item Recommendation using Cooccurrence and Matrix Factorization (Part 1)

This tutorial shows how to use the Matrix Factorization algorithm in BigQuery ML to generate embeddings for items based on their cooccurrence statistics. The generated item embeddings can then be used to find similar items.

Part 1 covers the following steps:

1. Explore the BigQuery data based on the `bigquery-samples.playlists` dataset.
2. Compute pairwise item cooccurrences. 
3. Train a Matrix Factorization model using BigQuery ML.
4. Explore trained embeddings.

## Setup

### Import libraries

In [None]:
from google.cloud import bigquery
from datetime import datetime
import matplotlib.pyplot as plt, seaborn as sns

### Configure GCP environment settings

In [None]:
PROJECT_ID = 'ksalama-cloudml'

!gcloud config set project $PROJECT_ID

### Authenticate your GCP account
This is required only if you run the notebook in Colab.

In [None]:
# Authenticate the user when running inside Colab. Outside Colab the import
# fails and authentication is simply skipped.
try:
  from google.colab import auth
except ImportError:
  # Not running in Colab: nothing to do.
  pass
else:
  try:
    auth.authenticate_user()
    print("Colab user is authenticated.")
  except Exception as error:
    # Keep the cell best-effort, but report the failure instead of hiding it.
    print(f"Colab authentication failed: {error}")

## Exploratory Data Analysis

In [None]:
%%bigquery --project $PROJECT_ID

# Materialize the items that appear in at least 50 groups.
CREATE OR REPLACE TABLE recommendations.valid_items AS
SELECT
  item_Id,
  COUNT(group_Id) AS item_frequency
FROM
  recommendations.vw_item_groups
GROUP BY
  item_Id
HAVING
  COUNT(group_Id) >= 50;

# Report how many items passed the frequency filter.
SELECT
  COUNT(*) AS item_count
FROM
  recommendations.valid_items;

In [None]:
%%bigquery --project $PROJECT_ID

# Materialize the groups that contain between 2 and 100 valid items
# (only items present in recommendations.valid_items are counted).
CREATE OR REPLACE TABLE recommendations.valid_groups
AS
SELECT
  group_Id,
  COUNT(item_Id) AS group_size
FROM recommendations.vw_item_groups
WHERE item_Id IN (SELECT item_Id FROM recommendations.valid_items)
GROUP BY group_Id
HAVING group_size BETWEEN 2 AND 100;

# Report how many groups passed the size filter.
# The result column is named group_count because it counts groups, not items.
SELECT COUNT(*) AS group_count FROM recommendations.valid_groups;

In [None]:
%%bigquery --project $PROJECT_ID

# Count the item/group records that survive both validity filters.
SELECT
  COUNT(*) AS record_count
FROM
  `recommendations.vw_item_groups` AS item_groups
WHERE
  item_groups.group_Id IN (SELECT group_Id FROM recommendations.valid_groups)
  AND item_groups.item_Id IN (SELECT item_Id FROM recommendations.valid_items);

In [None]:
%%bigquery --project $PROJECT_ID

# Smallest and largest group size among the valid item/group records.
WITH filtered_group_sizes AS (
  SELECT
    COUNT(item_Id) AS group_size
  FROM
    `recommendations.vw_item_groups`
  WHERE
    group_Id IN (SELECT group_Id FROM recommendations.valid_groups)
    AND item_Id IN (SELECT item_Id FROM recommendations.valid_items)
  GROUP BY
    group_Id
)

SELECT
  MIN(group_size) AS min_group_size,
  MAX(group_size) AS max_group_size
FROM
  filtered_group_sizes


In [None]:
%%bigquery size_distribution --project $PROJECT_ID

# Histogram of group sizes over the valid item/group records.
# The result (group_size label, bucket_Id, group_count) is stored in the
# `size_distribution` DataFrame.

WITH group_sizes
AS
(
  # One row per group; the item count is replaced by its 'bin_<n>' bucket name.
  # The column is called size_bin (not group_size) so that the output alias
  # below cannot be confused with it in GROUP BY.
  SELECT
    group_Id,
    ML.BUCKETIZE(
      COUNT(item_Id), [10, 20, 30, 40, 50, 100, 200])
     AS size_bin
  FROM `recommendations.vw_item_groups`
  WHERE item_Id IN (SELECT item_Id FROM recommendations.valid_items)
  AND group_Id IN (SELECT group_Id FROM recommendations.valid_groups)
  GROUP BY group_Id
)

SELECT
  # Human-readable label for the bucket; any bin after bin_6 falls into ELSE.
  CASE
    WHEN size_bin = 'bin_1' THEN '[1 - 10]'
    WHEN size_bin = 'bin_2' THEN '[10 - 20]'
    WHEN size_bin = 'bin_3' THEN '[20 - 30]'
    WHEN size_bin = 'bin_4' THEN '[30 - 40]'
    WHEN size_bin = 'bin_5' THEN '[40 - 50]'
    WHEN size_bin = 'bin_6' THEN '[50 - 100]'
    ELSE '[100 - 200]'
  END AS group_size,
  # Numeric bucket position, used only to order the rows.
  CASE
    WHEN size_bin = 'bin_1' THEN 1
    WHEN size_bin = 'bin_2' THEN 2
    WHEN size_bin = 'bin_3' THEN 3
    WHEN size_bin = 'bin_4' THEN 4
    WHEN size_bin = 'bin_5' THEN 5
    WHEN size_bin = 'bin_6' THEN 6
    ELSE 7
  END AS bucket_Id,
  COUNT(group_Id) AS group_count
FROM group_sizes
GROUP BY group_size, bucket_Id
ORDER BY bucket_Id

In [None]:
# Bar chart of the number of groups in each group-size bucket.
figure, axes = plt.subplots(figsize=(20, 5))
q = sns.barplot(data=size_distribution, x='group_size', y='group_count', ax=axes)

In [None]:
%%bigquery --project $PROJECT_ID

# Smallest and largest number of groups any single item appears in,
# computed over the valid item/group records.
WITH filtered_item_counts AS (
  SELECT
    COUNT(group_Id) AS group_count
  FROM
    `recommendations.vw_item_groups`
  WHERE
    group_Id IN (SELECT group_Id FROM recommendations.valid_groups)
    AND item_Id IN (SELECT item_Id FROM recommendations.valid_items)
  GROUP BY
    item_Id
)

SELECT
  MIN(group_count) AS min_group_count,
  MAX(group_count) AS max_group_count
FROM
  filtered_item_counts

In [None]:
%%bigquery occurrence_distribution --project $PROJECT_ID

# Histogram of how many groups each item occurs in, over the valid item/group
# records. The result (group_count label, bucket_Id, item_count) is stored in
# the `occurrence_distribution` DataFrame.

WITH item_frequency
AS
(
  # One row per item; the group count is replaced by its 'bin_<n>' bucket name.
  # The column is called frequency_bin (not group_count) so that the output
  # alias below cannot be confused with it in GROUP BY.
  SELECT
    item_Id,
    ML.BUCKETIZE(
      COUNT(group_Id)
      , [100, 200, 300, 400]) AS frequency_bin
  FROM `recommendations.vw_item_groups`
  WHERE item_Id IN (SELECT item_Id FROM recommendations.valid_items)
  AND group_Id IN (SELECT group_Id FROM recommendations.valid_groups)
  GROUP BY item_Id
)


SELECT
  # Human-readable label for the bucket; any bin after bin_4 falls into ELSE.
  CASE
    WHEN frequency_bin = 'bin_1' THEN '[1 - 100]'
    WHEN frequency_bin = 'bin_2' THEN '[100 - 200]'
    WHEN frequency_bin = 'bin_3' THEN '[200 - 300]'
    WHEN frequency_bin = 'bin_4' THEN '[300 - 400]'
    ELSE '[400+]'
  END AS group_count,
  # Numeric bucket position, used only to order the rows.
  CASE
    WHEN frequency_bin = 'bin_1' THEN 1
    WHEN frequency_bin = 'bin_2' THEN 2
    WHEN frequency_bin = 'bin_3' THEN 3
    WHEN frequency_bin = 'bin_4' THEN 4
    ELSE 5
  END AS bucket_Id,
  COUNT(item_Id) AS item_count
FROM item_frequency
GROUP BY group_count, bucket_Id
ORDER BY bucket_Id

In [None]:
# Bar chart of the number of items in each occurrence-count bucket.
figure, axes = plt.subplots(figsize=(20, 5))
q = sns.barplot(data=occurrence_distribution, x='group_count', y='item_count', ax=axes)

In [None]:
%%bigquery --project $PROJECT_ID

# Clean up: remove the exploration table of valid items.
DROP TABLE IF EXISTS `recommendations.valid_items`;

In [None]:
%%bigquery --project $PROJECT_ID

# Clean up: remove the exploration table of valid groups.
DROP TABLE IF EXISTS `recommendations.valid_groups`;

## Compute Item Cooccurrence

### Create the stored procedure that computes PMI

In [None]:
%%bigquery --project $PROJECT_ID

# sp_ComputePMI
#
# Builds the pairwise item cooccurrence table, including a pointwise mutual
# information (PMI) score, from recommendations.vw_item_groups.
#
# Parameters:
#   min_item_frequency   - an item is kept only if it appears in at least this
#                          many groups.
#   max_group_size       - a group is kept only if it has between 2 and this
#                          many kept items.
#   negative_sample_size - when > 0, the N most frequent items are paired with
#                          each other and every such pair is given cooc = 1
#                          unless it already has a larger observed cooc.
#
# Tables (re)created in the recommendations dataset:
#   valid_item_groups (item_Id, group_Id)
#   item_frequency    (item_Id, frequency)
#   item_cooc         (item1_Id, item2_Id, cooc, pmi)
CREATE OR REPLACE PROCEDURE recommendations.sp_ComputePMI(
  IN min_item_frequency INT64,
  IN max_group_size INT64,
  IN negative_sample_size INT64
)

BEGIN

  DECLARE total INT64;

  # Keep only the (item, group) records whose item is frequent enough and
  # whose group has an acceptable size.
  CREATE OR REPLACE TABLE recommendations.valid_item_groups
  AS

  # Create valid item set
  WITH 
  valid_items AS (
    SELECT item_Id, COUNT(group_Id) AS item_frequency
    FROM recommendations.vw_item_groups
    GROUP BY item_Id
    HAVING item_frequency >= min_item_frequency
  ),

  # Create valid group set (sizes are counted over valid items only)
  valid_groups AS (
    SELECT group_Id, COUNT(item_Id) AS group_size
    FROM recommendations.vw_item_groups
    WHERE item_Id IN (SELECT item_Id FROM valid_items)
    GROUP BY group_Id
    HAVING group_size BETWEEN 2 AND max_group_size
  )

  SELECT item_Id, group_Id
  FROM recommendations.vw_item_groups
  WHERE item_Id IN (SELECT item_Id FROM valid_items)
  AND group_Id IN (SELECT group_Id FROM valid_groups);

  ###################################

  # Compute pairwise cooc. The a.item_Id < b.item_Id condition emits each
  # unordered pair once and excludes self-pairs.
  CREATE OR REPLACE TABLE recommendations.item_cooc
  AS
  SELECT item1_Id, item2_Id, SUM(cooc) AS cooc
  FROM
  (
    SELECT
      a.item_Id AS item1_Id,
      b.item_Id AS item2_Id,
      1 AS cooc
    FROM recommendations.valid_item_groups a
    JOIN recommendations.valid_item_groups b
    ON a.group_Id = b.group_Id
    AND a.item_Id < b.item_Id
  )
  GROUP BY item1_Id, item2_Id;

  ###################################
  
  # Compute item frequencies
  CREATE OR REPLACE TABLE recommendations.item_frequency
  AS
  SELECT item_Id, COUNT(group_Id) AS frequency
  FROM recommendations.valid_item_groups
  GROUP BY item_Id;

  ###################################
  
  # Compute total frequency |D|
  SET total = (
    SELECT SUM(frequency) AS total
    FROM recommendations.item_frequency
  );
  
  ###################################
  
  # Add same item frequency as cooc (the diagonal: item paired with itself)
  CREATE OR REPLACE TABLE recommendations.item_cooc
  AS
  SELECT item1_Id, item2_Id, cooc 
  FROM recommendations.item_cooc
  UNION ALL
  SELECT item_Id AS item1_Id, item_Id AS item2_Id, frequency AS cooc
  FROM recommendations.item_frequency;

  ###################################

  # Create negative samples
  IF negative_sample_size > 0 THEN
    CREATE OR REPLACE TABLE recommendations.item_cooc
    AS

    WITH 
    # Rank items from most to least frequent.
    ordered_items AS (
      SELECT ROW_NUMBER() OVER (ORDER BY frequency DESC) AS number, item_Id
      FROM recommendations.item_frequency
    ),

    top_items AS (
      SELECT item_Id
      FROM ordered_items
      WHERE number <= negative_sample_size
    ),

    # Every unordered pair of top items, with a cooc of 1.
    # top_items exposes only item_Id, so that is the column selected here.
    negative_samples AS (
      SELECT
        a.item_Id AS item1_Id,
        b.item_Id AS item2_Id,
        1 AS cooc
      FROM top_items a
      JOIN top_items b
      ON a.item_Id < b.item_Id
    ),

    merged AS (
      SELECT item1_Id, item2_Id, cooc
      FROM recommendations.item_cooc
      UNION ALL
      SELECT item1_Id, item2_Id, cooc
      FROM negative_samples
    )

    # MAX keeps the observed cooc for pairs that also appear as negative samples.
    SELECT item1_Id, item2_Id, MAX(cooc) AS cooc
    FROM merged
    GROUP BY item1_Id, item2_Id;
  END IF;

  ###################################
  
  # Compute PMI = log2(cooc * total / (frequency1 * frequency2))
  CREATE OR REPLACE TABLE recommendations.item_cooc
  AS
  SELECT
    a.item1_Id,
    a.item2_Id,
    a.cooc,
    LOG(a.cooc, 2) - LOG(b.frequency, 2) - LOG(c.frequency, 2) + LOG(total, 2) AS pmi
  FROM recommendations.item_cooc a
  JOIN recommendations.item_frequency b
  ON a.item1_Id = b.item_Id
  JOIN recommendations.item_frequency c
  ON a.item2_Id = c.item_Id; 
END

### Execute the stored procedure

In [None]:
%%bigquery --project $PROJECT_ID

# Arguments for the PMI computation: item frequency threshold, group size
# limit, and negative sample size (0 skips the negative-sampling branch).
DECLARE item_frequency_floor INT64 DEFAULT 50;
DECLARE group_size_cap INT64 DEFAULT 100;
DECLARE negative_sample_count INT64 DEFAULT 0;

CALL recommendations.sp_ComputePMI(
  item_frequency_floor,
  group_size_cap,
  negative_sample_count
);

### View the cooccurrence data

In [None]:
%%bigquery --project $PROJECT_ID

# Top 10 distinct-item pairs by cooc * pmi, shown with each item's frequency.
SELECT
  pairs.item1_Id,
  pairs.item2_Id,
  first_item.frequency AS freq1,
  second_item.frequency AS freq2,
  pairs.cooc,
  pairs.pmi,
  pairs.cooc * pairs.pmi AS score
FROM
  recommendations.item_cooc AS pairs
JOIN
  recommendations.item_frequency AS first_item
  ON pairs.item1_Id = first_item.item_Id
JOIN
  recommendations.item_frequency AS second_item
  ON pairs.item2_Id = second_item.item_Id
WHERE
  pairs.item1_Id <> pairs.item2_Id
ORDER BY
  score DESC
LIMIT 10;

In [None]:
%%bigquery --project $PROJECT_ID

# Number of rows in the cooccurrence table.
SELECT
  COUNT(*) AS records_count
FROM
  recommendations.item_cooc

## Train the BigQuery ML Matrix Factorization Model

In [None]:
%%bigquery --project $PROJECT_ID

# sp_TrainEmbeddingModel
#
# (Re)creates recommendations.item_embedding_model, an implicit-feedback
# matrix factorization model trained on recommendations.item_cooc with the
# cooc value as the rating.
#
# Parameters:
#   dimensions - value passed to NUM_FACTORS.
CREATE OR REPLACE PROCEDURE recommendations.sp_TrainEmbeddingModel(IN dimensions INT64)
BEGIN

  CREATE OR REPLACE MODEL recommendations.item_embedding_model
  OPTIONS (
    MODEL_TYPE = 'matrix_factorization',
    FEEDBACK_TYPE = 'implicit',
    NUM_FACTORS = (dimensions),
    WALS_ALPHA = 1,
    USER_COL = 'item1_Id',
    ITEM_COL = 'item2_Id',
    RATING_COL = 'target',
    DATA_SPLIT_METHOD = 'no_split'
  ) AS
  SELECT
    item1_Id,
    item2_Id,
    cooc AS target
  FROM
    recommendations.item_cooc;

END

In [None]:
%%bigquery --project $PROJECT_ID

# Train the embedding model with 50 factors.
DECLARE dimensions INT64;
SET dimensions = 50;

CALL recommendations.sp_TrainEmbeddingModel(dimensions)

## Explore the trained embeddings

In [None]:
# Item id -> "Artist: Title" for the songs whose embeddings are explored below.
# The same ids appear in the `feature IN (...)` filter of the ML.WEIGHTS query.
songs = dict([
    ("2114406", "Metallica: Nothing Else Matters"),
    ("2114402", "Metallica: The Unforgiven"),
    ("2120788", "Limp Bizkit: My Way"),
    ("2120786", "Limp Bizkit: My Generation"),
    ("1086322", "Jacques Brel: Ne Me Quitte Pas"),
    ("3129954", "Édith Piaf: Non, Je Ne Regrette Rien"),
    ("53448", "France Gall: Ella, Elle l'a"),
    ("887688", "Enrique Iglesias: Tired Of Being Sorry"),
    ("562487", "Shakira: Hips Don't Lie"),
    ("833391", "Ricky Martin: Livin' la Vida Loca"),
    ("1098069", "Snoop Dogg: Drop It Like It's Hot"),
    ("910683", "2Pac: California Love"),
    ("1579481", "Dr. Dre: The Next Episode"),
    ("2675403", "Eminem: Lose Yourself"),
    ("2954929", "Black Sabbath: Iron Man"),
    ("625169", "Black Sabbath: Paranoid"),
])

# tuple(songs.keys()) yields the id list in the form used by a SQL IN (...) clause.


In [None]:
%%bigquery song_embeddings --project $PROJECT_ID

# Fetch the learned weights of the selected songs from the trained model.
# The result is stored in the `song_embeddings` DataFrame.
SELECT
  feature,
  processed_input,
  factor_weights,
  intercept
FROM
  ML.WEIGHTS(MODEL recommendations.item_embedding_model)
WHERE
  feature IN (
    '2114406', '2114402', '2120788', '2120786',
    '1086322', '3129954', '53448', '887688',
    '562487', '833391', '1098069', '910683',
    '1579481', '2675403', '2954929', '625169'
  )

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def process_results(results, embedding_size=100):
  """Builds one averaged embedding per feature from ML.WEIGHTS-style rows.

  Args:
    results: DataFrame with a 'feature' column and a 'factor_weights' column.
      Each 'factor_weights' entry is an iterable of records that expose
      'factor' (1-based position) and 'weight'.
    embedding_size: length of every returned embedding. Positions with no
      weight stay 0.0. Defaults to 100, the size originally hard-coded here.

  Returns:
    A dict mapping each feature to a list of `embedding_size` floats: the
    element-wise mean of the weights over all rows of that feature.

  Raises:
    IndexError: if a 'factor' value is outside 1..embedding_size.
  """
  item_embeddings = dict()
  # sort=False keeps features in order of first appearance.
  for item, item_rows in results.groupby('feature', sort=False):
    embedding = [0.0] * embedding_size

    # Sum the weights of all rows of this feature, position by position.
    for factor_weights in item_rows['factor_weights']:
      for element in factor_weights:
        factor = element['factor']
        # Reject out-of-range factors explicitly; 0 or a negative value would
        # otherwise wrap around and silently corrupt the end of the list.
        if not 1 <= factor <= embedding_size:
          raise IndexError(
              f"factor {factor} is outside 1..{embedding_size} "
              f"for feature {item!r}")
        embedding[factor - 1] += element['weight']

    # Turn the sums into means.
    row_count = len(item_rows)
    item_embeddings[item] = [value / row_count for value in embedding]

  return item_embeddings

In [None]:
# Collapse the queried weight rows into one embedding vector per song id.
item_embeddings = process_results(results=song_embeddings)

In [None]:
# For every song, print all the other songs ranked by the cosine similarity
# of their embeddings (most similar first).
item_ids = list(item_embeddings.keys())
for item1_Id in item_ids:  # every song is reported, including the last one
  print(songs[item1_Id])
  print("==================")
  embedding1 = np.array(item_embeddings[item1_Id])
  similar_items = []
  for item2_Id in item_ids:
    # Skip the song itself explicitly rather than assuming it sorts first.
    if item2_Id == item1_Id:
      continue
    embedding2 = np.array(item_embeddings[item2_Id])
    similarity = round(cosine_similarity([embedding1], [embedding2])[0][0], 5)
    similar_items.append((songs[item2_Id], similarity))

  similar_items.sort(key=lambda item: item[1], reverse=True)
  for title, similarity in similar_items:
    print(f"- {title} = {similarity}")
  print()

## License

Copyright 2020 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

See the License for the specific language governing permissions and limitations under the License.

**This is not an official Google product, but sample code provided for educational purposes.**