In [1]:
import datetime
import google.cloud.storage as storage
import numpy as np
import os
import pandas as pd
from scipy.sparse import coo_matrix

In [2]:
# Relative path of the CSV file with the raw recommendation events.
input_file = "../data/recommendation_events.csv"

In [3]:
# Read the raw events; the first row of the CSV holds the column names.
views_df = pd.read_csv(input_file, sep=',', header=0)

# One row per distinct contentId, then the same ids in ascending order.
# pds_items is the sorted id column; after reset_index() an id's label in it
# equals its rank in the sort order.
df_items = pd.DataFrame({'contentId': views_df['contentId'].unique()})
df_sorted_items = df_items.sort_values(by='contentId').reset_index()
pds_items = df_sorted_items['contentId']

In [6]:
# Total the time on page for every (clientId, contentId) pair. groupby sorts
# its keys by default, so the rows come out ordered by clientId, then
# contentId.
df_user_items = (
    views_df
    .groupby(['clientId', 'contentId'])
    .agg({'timeOnPage': 'sum'})
)


In [7]:
# create a list of (userId, itemId, timeOnPage) ratings, where userId and
# itemId are 0-indexed positions into user_ux and pds_items respectively
current_u = -1
ux = -1
pv_ratings = []
user_ux = []

# plain ndarray of the sorted item ids, extracted once so the per-row lookup
# below does not go through the pandas Series machinery
sorted_item_ids = pds_items.values

for timeonpg in df_user_items.itertuples():
  # timeonpg[0] is the (clientId, contentId) index tuple, timeonpg[1] the
  # summed timeOnPage for that pair
  user = timeonpg[0][0]
  item = timeonpg[0][1]

  # as we go, build a (sorted) list of user ids; rows arrive grouped by
  # clientId, so a change of user means a new, not yet seen, user
  if user != current_u:
    user_ux.append(user)
    ux += 1
    current_u = user

  # binary search for the item's position: O(log i) per rating, so the whole
  # pass is O(r log i), r = # ratings, i = # items.
  # np.searchsorted returns a scalar for a scalar needle; Series.searchsorted
  # returned an array in older pandas and a scalar in newer releases, so
  # indexing its result with [0] is not portable.
  ix = np.searchsorted(sorted_item_ids, item)
  pv_ratings.append((ux, ix, timeonpg[1]))


In [8]:
# Turn the list of (user index, item index, timeOnPage) tuples into a numpy
# array with one row per rating.
pv_ratings = np.asarray(pv_ratings)

In [9]:
# Inspect the ratings array: columns are user index, item index, timeOnPage.
pv_ratings

array([[     0,   5376,  47956],
       [     0,   5433,  97658],
       [     1,   5302, 108556],
       ...,
       [ 80458,   4492, 164935],
       [ 80458,   5378,  71132],
       [ 80458,   5382,  54683]])

In [11]:
# Inspect the client ids collected while building the ratings; position k in
# this list is the client id that was assigned user index k.
user_ux

[34348347166136L,
 124168996164483L,
 133046654287433L,
 258846262598794L,
 557312173238488L,
 599432913945238L,
 613391514250741L,
 616595597911299L,
 884133409140635L,
 903130055024958L,
 1198452002072130L,
 1235234099772916L,
 1468717114196936L,
 1529237502543689L,
 1534988451444454L,
 1614045808408162L,
 1740167641387150L,
 1858317877658516L,
 1927462524281119L,
 2055396766101911L,
 2083219530383132L,
 2099128070657556L,
 2272868139362521L,
 2304887119298103L,
 2349713654591421L,
 2397533860293383L,
 2691021847237112L,
 2879283139955641L,
 2894032080133914L,
 3058567955744598L,
 3073789284859932L,
 3293622949927381L,
 3336336378708780L,
 3365662429747528L,
 3497414855826778L,
 3527732927318520L,
 3569011960640996L,
 3711750903778581L,
 3773482447192530L,
 3822801578098033L,
 4170608024296451L,
 4305615901141331L,
 4343682317954975L,
 4374911034161644L,
 4506616206281809L,
 4580197564600473L,
 4715368796735984L,
 4768944201064376L,
 4823202512058232L,
 5165902252923723L,
 5627529619

In [12]:
# Convert the list of client ids to an unsigned 64-bit array; position k
# still holds the client id of user index k.
user_ux = np.asarray(user_ux, dtype=np.uint64)

In [22]:
# One in every TEST_SET_RATIO ratings is held out for the test set.
TEST_SET_RATIO = 10
def create_sparse_train_and_test(ratings, n_users, n_items):
  """Given ratings, create sparse matrices for train and test sets.

  len(ratings) // TEST_SET_RATIO ratings are drawn at random, without
  replacement, for the test set; all remaining ratings form the training
  set. The draw uses numpy's global random state, so call
  np.random.seed(...) beforehand if a reproducible split is needed.

  Args:
    ratings:  2-D array-like of ratings, one (u, i, r) row per rating
    n_users:  number of users (rows of the returned matrices)
    n_items:  number of items (columns of the returned matrices)

  Returns:
     train, test sparse matrices in scipy coo_matrix format, both of shape
     (n_users, n_items) and dtype uint64.
  """
  # the fancy indexing below needs an ndarray, not a list of tuples
  ratings = np.asarray(ratings)
  n_ratings = len(ratings)

  # floor division: `/` yields a float on Python 3, which is not a valid
  # sample size
  test_set_size = n_ratings // TEST_SET_RATIO

  # pick a random test set of entries, sorted ascending. An int population
  # means "sample from range(n_ratings)", so no xrange is needed
  test_set_idx = np.sort(
      np.random.choice(n_ratings, size=test_set_size, replace=False))

  # sift ratings into train and test sets
  ts_ratings = ratings[test_set_idx]
  tr_ratings = np.delete(ratings, test_set_idx, axis=0)

  # create training and test matrices as coo_matrix's. Column slicing, unlike
  # zip(*rows), also works when a set has no rows (fewer than TEST_SET_RATIO
  # ratings leave the test set empty)
  tr_sparse = coo_matrix(
      (tr_ratings[:, 2], (tr_ratings[:, 0], tr_ratings[:, 1])),
      shape=(n_users, n_items), dtype=np.uint64)

  test_sparse = coo_matrix(
      (ts_ratings[:, 2], (ts_ratings[:, 0], ts_ratings[:, 1])),
      shape=(n_users, n_items), dtype=np.uint64)

  return tr_sparse, test_sparse


In [23]:
# User indices run from 0 to ux, hence ux + 1 users. df_items has a single
# column (contentId), so its .size equals the number of distinct items.
tr_sparse, test_sparse = create_sparse_train_and_test(
    pv_ratings, ux + 1, df_items.size)

In [24]:
# Summary (shape, dtype, stored-element count) of the training matrix.
tr_sparse

<80459x5478 sparse matrix of type '<type 'numpy.uint64'>'
	with 231061 stored elements in COOrdinate format>

In [25]:
# Summary (shape, dtype, stored-element count) of the test matrix.
test_sparse

<80459x5478 sparse matrix of type '<type 'numpy.uint64'>'
	with 25673 stored elements in COOrdinate format>

In [27]:
# Stored values (timeOnPage totals) of the training matrix.
tr_sparse.data

array([ 47956,  97658, 108556, ...,   3304, 164935,  71132], dtype=uint64)

In [28]:
# Stored values (timeOnPage totals) of the test matrix.
test_sparse.data

array([ 67104,   1117,  70738, ..., 243830,   1050,  54683], dtype=uint64)