# Collaborative filtering on Google Analytics data

This notebook demonstrates how to implement a WALS matrix refactorization approach to do collaborative filtering.

In [9]:
import os
PROJECT = 'cloud-training-demos' # REPLACE WITH YOUR PROJECT ID
BUCKET = 'cloud-training-demos-ml' # REPLACE WITH YOUR BUCKET NAME
REGION = 'us-central1' # REPLACE WITH YOUR BUCKET REGION e.g. us-central1

# do not change these
os.environ['PROJECT'] = PROJECT
os.environ['BUCKET'] = BUCKET
os.environ['REGION'] = REGION

In [None]:
%bash
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

## Create raw dataset
<p>
For collaborative filtering, we don't need to know anything about either the users or the content. Essentially, all we need to know is userId, itemId, and rating that the particular user gave the particular item.
<p>
In this case, we are working with newspaper articles. The company doesn't ask their users to rate the articles. However, we can use the time-spent on the page as a proxy for rating.
<p>
Normally, we would also add a time filter to this ("latest 7 days"), but our dataset is itself limited to a few days.

In [None]:
import google.datalab.bigquery as bq

sql="""
#standardSQL
WITH visitor_page_content AS (

   SELECT  
     fullVisitorID,
     (SELECT MAX(IF(index=10, value, NULL)) FROM UNNEST(hits.customDimensions)) AS latestContentId,  
     (LEAD(hits.time, 1) OVER (PARTITION BY fullVisitorId ORDER BY hits.time ASC) - hits.time) AS session_duration 
   FROM `GA360_test.ga_sessions_sample`,   
     UNNEST(hits) AS hits
   WHERE 
     # only include hits on pages
      hits.type = "PAGE"

   GROUP BY   
     fullVisitorId, latestContentId, hits.time
     )

# aggregate web stats
SELECT   
  fullVisitorID as visitorId,
  latestContentId as contentId,
  SUM(session_duration) AS session_duration 
 
FROM visitor_page_content
  WHERE latestContentId IS NOT NULL 
  GROUP BY fullVisitorID, latestContentId
  HAVING session_duration > 0
  ORDER BY latestContentId 
"""

df = bq.Query(sql).execute().result().to_dataframe()
df.head()

In [None]:
stats = df.describe()
stats

In [None]:
# the rating is the session_duration scaled to be in the range 0-1.  This will help with training.
df['rating'] = 0.3 * (1 + (df['session_duration'] - stats.loc['50%', 'session_duration'])/stats.loc['50%', 'session_duration'])
df.loc[df['rating'] > 1, 'rating'] = 1
df.describe()

In [None]:
del df['session_duration']

In [None]:
%bash
rm -rf data
mkdir data

In [None]:
df.to_csv('data/collab_raw.csv', index=False, header=False)

In [None]:
!head data/collab_raw.csv

## Create dataset for WALS
<p>
The raw dataset (above) won't work for WALS:
<ol>
<li> The userId and itemId have to be 0,1,2 ... so we need to create a mapping from visitorId (in the raw data) to userId and contentId (in the raw data) to itemId.
<li> We will need to save the above mapping to a file because at prediction time, we'll need to know how to map the contentId in the table above to the itemId.
<li> We'll need two files: a "rows" dataset where all the items for a particular user are listed; and a "columns" dataset where all the users for a particular item are listed.
</ol>

<p>

### Mapping

In [None]:
import pandas as pd
import numpy as np
def create_mapping(values, filename):
  with open(filename, 'w') as ofp:
    value_to_id = {value:idx for idx, value in enumerate(values.unique())}
    for value, idx in value_to_id.items():
      ofp.write('{},{}\n'.format(value, idx))
  return value_to_id

df = pd.read_csv('data/collab_raw.csv',
                 header=None,
                 names=['visitorId', 'contentId', 'rating'],
                dtype={'visitorId': str, 'contentId': str, 'rating': np.float})
df.to_csv('data/collab_raw.csv', index=False, header=False)
user_mapping = create_mapping(df['visitorId'], 'data/users.csv')
item_mapping = create_mapping(df['contentId'], 'data/items.csv')

In [None]:
!head -3 data/*.csv

In [None]:
df['userId'] = df['visitorId'].map(user_mapping.get)
df['itemId'] = df['contentId'].map(item_mapping.get)

In [None]:
mapped_df = df[['userId', 'itemId', 'rating']]
mapped_df.to_csv('data/collab_mapped.csv', index=False, header=False)
mapped_df.head()

### Creating rows and columns datasets

In [3]:
import pandas as pd
import numpy as np
mapped_df = pd.read_csv('data/collab_mapped.csv', header=None, names=['userId', 'itemId', 'rating'])
mapped_df.head()

Unnamed: 0,userId,itemId,rating
0,0,0,0.231208
1,1,1,1.0
2,2,2,0.16626
3,3,2,0.247218
4,4,3,0.054431


In [4]:
NITEMS = np.max(mapped_df['itemId']) + 1
NUSERS = np.max(mapped_df['userId']) + 1
mapped_df['rating'] = np.round(mapped_df['rating'].values, 2)
print '{} items, {} users, {} interactions'.format( NITEMS, NUSERS, len(mapped_df) )

5668 items, 82802 users, 278914 interactions


In [5]:
grouped_by_items = mapped_df.groupby('itemId')
iter = 0
for item, grouped in grouped_by_items:
  print item, grouped['userId'].values, grouped['rating'].values
  iter = iter + 1
  if iter > 5:
    break

0 [0] [0.23]
1 [1] [1.]
2 [2 3] [0.17 0.25]
3 [4] [0.05]
4 [5] [0.23]
5 [6] [0.95]


In [6]:
import tensorflow as tf
grouped_by_items = mapped_df.groupby('itemId')
with tf.python_io.TFRecordWriter('data/users_for_item') as ofp:
  for item, grouped in grouped_by_items:
    example = tf.train.Example(features=tf.train.Features(feature={
          'key': tf.train.Feature(int64_list=tf.train.Int64List(value=[item])),
          'indices': tf.train.Feature(int64_list=tf.train.Int64List(value=grouped['userId'].values)),
          'values': tf.train.Feature(float_list=tf.train.FloatList(value=grouped['rating'].values))
        }))
    ofp.write(example.SerializeToString())      

In [6]:
grouped_by_users = mapped_df.groupby('userId')
with tf.python_io.TFRecordWriter('data/items_for_user') as ofp:
  for user, grouped in grouped_by_users:
    example = tf.train.Example(features=tf.train.Features(feature={
          'key': tf.train.Feature(int64_list=tf.train.Int64List(value=[user])),
          'indices': tf.train.Feature(int64_list=tf.train.Int64List(value=grouped['itemId'].values)),
          'values': tf.train.Feature(float_list=tf.train.FloatList(value=grouped['rating'].values))
        }))
    ofp.write(example.SerializeToString())      

In [7]:
grouped_by_users = mapped_df.groupby('userId')
N = 0
with tf.python_io.TFRecordWriter('data/items_for_user_subset') as ofp:
  for user, grouped in grouped_by_users:
    example = tf.train.Example(features=tf.train.Features(feature={
          'key': tf.train.Feature(int64_list=tf.train.Int64List(value=[user])),
          'indices': tf.train.Feature(int64_list=tf.train.Int64List(value=grouped['itemId'].values)),
          'values': tf.train.Feature(float_list=tf.train.FloatList(value=grouped['rating'].values))
        }))
    ofp.write(example.SerializeToString())    
    N = N + 1
    if N > 20:
      break

In [8]:
!ls -lrt data

total 31828
-rw-r--r-- 1 root root 13114258 Feb 12 20:03 collab_raw.csv
-rw-r--r-- 1 root root  2131923 Feb 12 20:03 users.csv
-rw-r--r-- 1 root root    82193 Feb 12 20:03 items.csv
-rw-r--r-- 1 root root  7787009 Feb 12 20:03 collab_mapped.csv
-rw-r--r-- 1 root root  7207031 Feb 13 06:34 items_for_user
-rw-r--r-- 1 root root       26 Feb 13 17:02 input.json
-rw-r--r-- 1 root root  2244886 Feb 13 19:53 users_for_item
-rw-r--r-- 1 root root     1747 Feb 13 19:54 items_for_user_subset


To summarize, we created the following data files from collab_raw.csv:
<ol>
<li> ```collab_mapped.csv``` is essentially the same data as in ```collab_raw.csv``` except that ```visitorId``` and ```contentId``` which are business-specific have been mapped to ```userId``` and ```itemId``` which are enumerated in 0,1,2,....  The mappings themselves are stored in ```items.csv``` and ```users.csv``` so that they can be used during inference.
<li> ```users_for_item``` contains all the users/ratings for each item in TFExample format
<li> ```items_for_user``` contains all the items/ratings for each user in TFExample format
</ol>

## Train with WALS

Once you have the dataset, do matrix factorization with WALS using the [WALSMatrixFactorization](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/factorization/WALSMatrixFactorization) in the contrib directory.
This is an estimator model, so it should be relatively familiar.
<p>
As usual, we write an input_fn to provide the data to the model, and then create the Estimator to do train_and_evaluate.
Because it is in contrib and hasn't moved over to tf.estimator yet, we use tf.contrib.learn.Experiment to handle the training loop.

In [10]:
import os
import tensorflow as tf
from tensorflow.python.lib.io import file_io
from tensorflow.contrib.factorization import WALSMatrixFactorization
  
def read_dataset(mode, args):
  def decode_example(protos, vocab_size):
    features = {'key': tf.FixedLenFeature([1], tf.int64),
                'indices': tf.VarLenFeature(dtype=tf.int64),
                'values': tf.VarLenFeature(dtype=tf.float32)}
    parsed_features = tf.parse_single_example(protos, features)
    keys = parsed_features['key']
    values = tf.sparse_merge(parsed_features['indices'], parsed_features['values'], vocab_size=vocab_size)
    return values

    
  def parse_tfrecords(filename, vocab_size):
    if mode == tf.estimator.ModeKeys.TRAIN:
        num_epochs = None # indefinitely
    else:
        num_epochs = 1 # end-of-input after this
    
    files = tf.gfile.Glob(os.path.join(args['input_path'], filename))
    
    # Create dataset from file list
    dataset = tf.data.TFRecordDataset(files)
    dataset = dataset.map(lambda x: decode_example(x, vocab_size))
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(args['batch_size'])
    return dataset.make_one_shot_iterator().get_next()
  
  def _input_fn():
    features = {
      WALSMatrixFactorization.INPUT_ROWS: parse_tfrecords('items_for_user', args['nitems']),
      WALSMatrixFactorization.INPUT_COLS: parse_tfrecords('users_for_item', args['nusers']),
      WALSMatrixFactorization.PROJECT_ROW: tf.constant(True)
    }
    return features, None
  
  def input_cols():
    return parse_tfrecords('users_for_item', args['nusers'])
  
  return _input_fn

In [15]:
def find_top_k(user, item_factors, k):
  all_items = tf.matmul(tf.expand_dims(user, 0), tf.transpose(item_factors))
  topk = tf.nn.top_k(all_items, k=k)
  return tf.cast(topk.indices, dtype=tf.int64)
    
def batch_predict(args):
  import numpy as np
  with tf.Session() as sess:
    estimator = tf.contrib.factorization.WALSMatrixFactorization(
                         num_rows=args['nusers'], num_cols=args['nitems'],
                         embedding_dimension=args['n_embeds'],
                         model_dir=args['output_dir'])
    # this is how you would get the row factors for out-of-vocab user data
    #row_factors = list(estimator.get_projections(input_fn=read_dataset(tf.estimator.ModeKeys.EVAL, args)))
    #user_factors = tf.convert_to_tensor(np.array(row_factors))
    
    # but for in-vocab data, the row factors are already in the checkpoint
    user_factors = tf.convert_to_tensor(estimator.get_row_factors()[0]) # (nusers, nembeds)
    # in either case, we have to assume catalog doesn't change, so col_factors are read in
    item_factors = tf.convert_to_tensor(estimator.get_col_factors()[0])# (nitems, nembeds)
    
    # for each user, find the top K items
    topk = tf.squeeze(tf.map_fn(lambda user: find_top_k(user, item_factors, args['topk']), user_factors, dtype=tf.int64))
    with file_io.FileIO(os.path.join(args['output_dir'], 'batch_pred.txt'), mode='w') as f:
      for best_items_for_user in topk.eval():
        f.write(','.join(str(x) for x in best_items_for_user) + '\n')

def train_and_evaluate(args):
    train_steps = int(0.5 + (1.0 * args['num_epochs'] * args['nusers']) / args['batch_size'])
    steps_in_epoch = int(0.5 + args['nusers'] / args['batch_size'])
    print('Will train for {} steps, evaluating once every {} steps'.format(train_steps, steps_in_epoch))
    def experiment_fn(output_dir):
        return tf.contrib.learn.Experiment(
            tf.contrib.factorization.WALSMatrixFactorization(
                         num_rows=args['nusers'], num_cols=args['nitems'],
                         embedding_dimension=args['n_embeds'],
                         model_dir=args['output_dir']),
            train_input_fn=read_dataset(tf.estimator.ModeKeys.TRAIN, args),
            eval_input_fn=read_dataset(tf.estimator.ModeKeys.EVAL, args),
            train_steps=train_steps,
            eval_steps=1,
            min_eval_frequency=steps_in_epoch
        )

    from tensorflow.contrib.learn.python.learn import learn_runner
    learn_runner.run(experiment_fn, args['output_dir'])
    
    batch_predict(args)

In [16]:
import shutil
shutil.rmtree('wals_trained', ignore_errors=True)
train_and_evaluate({
    'output_dir': 'wals_trained',
    'input_path': 'data/',
    'num_epochs': 0.05,
    'nitems': 5668,
    'nusers': 82802,

    'batch_size': 512,
    'n_embeds': 10,
    'topk': 3
  })

Will train for 8 steps, evaluating once every 161 steps
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc90cd43b10>, '_model_dir': 'wals_trained', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_tf_random_seed': None, '_save_summary_steps': 100, '_environment': 'local', '_num_worker_replicas': 0, '_task_id': 0, '_log_step_count_steps': 100, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_evaluation_master': '', '_master': ''}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:SweepHook running init op.
INFO:tensorflow:SweepHook running prep ops for the row sweep.
INFO:tensorflow:Next fit step starting.
INFO:tensorflow:Saving checkpoints for 1 into wals_trained/model.ckpt.
INFO:tens

In [17]:
!ls wals_trained

batch_pred.txt			  model.ckpt-1.meta
checkpoint			  model.ckpt-8.data-00000-of-00001
graph.pbtxt			  model.ckpt-8.index
model.ckpt-1.data-00000-of-00001  model.ckpt-8.meta
model.ckpt-1.index


In [18]:
!head wals_trained/batch_pred.txt

701,2470,1595
5315,3753,4830
4882,1694,2492
3889,3161,1674
5495,3108,2990
1595,1631,1074
701,3889,2050
701,1595,1023
3625,4052,592
1646,5350,592


## Run as a Python module

Let's run it as Python module for just a few steps.

In [1]:
%bash
rm -rf wals.tar.gz wals_trained
export PYTHONPATH=${PYTHONPATH}:${PWD}/wals
python -m trainer.task \
   --output_dir=${PWD}/wals_trained \
   --input_path=${PWD}/data \
   --num_epochs=0.01 --nitems=5668 --nusers=82802 \
   --job-dir=./tmp

Will train for 2 steps, evaluating once every 162 steps


  from ._conv import register_converters as _register_converters
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd177fb62d0>, '_model_dir': '/content/10_recommend/wals_trained', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_tf_random_seed': None, '_save_summary_steps': 100, '_environment': 'local', '_num_worker_replicas': 0, '_task_id': 0, '_log_step_count_steps': 100, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_evaluation_master': '', '_master': ''}
Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.
INFO:tensorflow:Create CheckpointSaverHook.
2018-02-14 08:07:41.578560: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instruc

## Run on Cloud

In [None]:
%bash
gsutil -m cp data/* gs://${BUCKET}/wals/data

In [None]:
%bash
OUTDIR=gs://${BUCKET}/wals/model_trained
JOBNAME=wals_$(date -u +%y%m%d_%H%M%S)
echo $OUTDIR $REGION $JOBNAME
gsutil -m rm -rf $OUTDIR
gcloud ml-engine jobs submit training $JOBNAME \
   --region=$REGION \
   --module-name=trainer.task \
   --package-path=${PWD}/wals/trainer \
   --job-dir=$OUTDIR \
   --staging-bucket=gs://$BUCKET \
   --scale-tier=BASIC_GPU \
   --runtime-version=1.4 \
   -- \
   --output_dir=$OUTDIR \
   --input_path=gs://${BUCKET}/wals/data \
   --num_epochs=10 --nitems=5668 --nusers=82802 

This took <b>10 minutes</b> and finished with a loss of 23418.4. (FIXME: what does the loss represent?)

## Deploy and predict

Because this is a SavedModel, deploying and predicting should be familiar.

In [34]:
%writefile data/input.json
{"userId": 4}

Overwriting data/input.json


In [35]:
%bash
MODEL_DIR=$(ls wals_trained/export/Servo | tail -1)
gcloud ml-engine local predict --model-dir=wals_trained/export/Servo/$MODEL_DIR --json-instances=data/input.json 

PROJECTION
[0.0005913605564273894, -0.0002621609310153872, -0.00025849349913187325, 5.232850526226684e-07, 3.793570431298576e-05, -1.6488798792124726e-05, -0.0003268149448558688, -9.341105760540813e-05, -0.001341522205621004, -0.00027549342485144734]


  from ._conv import register_converters as _register_converters
2018-02-14 06:17:58.120315: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA



In [20]:
%bash
MODEL_DIR=$(ls wals_trained/export/Servo | tail -1)
gcloud ml-engine local predict --model-dir=wals_trained/export/Servo/$MODEL_DIR --json-instances=data/input.json 

PROJECTION
[-0.001112021622247994, -0.015211464837193489, -0.005952110514044762, 0.0032768254168331623, 0.0006867538904771209, -0.0005796286859549582, 0.003464647801592946, 0.006780658382922411, 0.0038679693825542927, 0.009157896041870117]


  from ._conv import register_converters as _register_converters
2018-02-13 17:02:10.671249: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA



<pre>
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
</pre>