In [3]:
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gzip

In [4]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [5]:
review_data = getDF('data/reviews_Video_Games.json.gz')

# cast as 32 float to save space
review_data['overall'] = review_data['overall'].astype('float32')

In [8]:
# remove all unnessacary columns since this is CF method 
del review_data['reviewerName']
del review_data['helpful']
del review_data['reviewText']
del review_data['summary']
del review_data['unixReviewTime']
del review_data['reviewTime']

review_data.head()

Unnamed: 0,reviewerID,asin,overall
0,AB9S9279OZ3QO,0078764343,5.0
1,A24SSUT5CSW8BH,0078764343,5.0
2,AK3V0HEBJMQ7J,0078764343,4.0
3,A10BECPH7W8HM7,043933702X,5.0
4,A2PRV9OULX1TWP,043933702X,5.0


In [9]:
review_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1324753 entries, 0 to 1324752
Data columns (total 3 columns):
reviewerID    1324753 non-null object
asin          1324753 non-null object
overall       1324753 non-null float32
dtypes: float32(1), object(2)
memory usage: 35.4+ MB


In [10]:
review_data.describe()

Unnamed: 0,overall
count,1324753.0
mean,3.978754
std,1.374263
min,1.0
25%,3.0
50%,5.0
75%,5.0
max,5.0


In [17]:
max_user  = len(review_data.reviewerID.unique().tolist())
#len(gapminder['country'].unique().tolist())
max_item = len(review_data.asin.unique().tolist())
max_user, max_item, max_user * max_item

(826767, 50210, 41511971070)

In [21]:
print("%.4f%%" % (100 * review_data.shape[0] / (max_user * max_item)))

0.0032%


In [28]:
# reset reviewer ID to int col so that conversion to matrix can be done
review_data['ID'] = review_data.reviewerID.astype('category').cat.rename_categories(range(1, review_data.reviewerID.nunique()+1))

In [30]:
review_data['itemID'] = review_data.asin.astype('category').cat.rename_categories(range(1, review_data.asin.nunique()+1))

Unnamed: 0,reviewerID,asin,overall,ID,itemID
0,AB9S9279OZ3QO,0078764343,5.0,676630,1
1,A24SSUT5CSW8BH,0078764343,5.0,248237,1
2,AK3V0HEBJMQ7J,0078764343,4.0,730297,1
3,A10BECPH7W8HM7,043933702X,5.0,3165,2
4,A2PRV9OULX1TWP,043933702X,5.0,375823,2


In [31]:
del review_data['reviewerID']
del review_data['asin']

review_data.head()

Unnamed: 0,overall,ID,itemID
0,5.0,676630,1
1,5.0,248237,1
2,4.0,730297,1
3,5.0,3165,2
4,5.0,375823,2


In [40]:
review_data['ID'] = review_data['ID'].astype('int64')
review_data['itemID'] = review_data['itemID'].astype('int64')

review_data.dtypes
max_user  = int(review_data["ID"].max() + 1)
max_item = int(review_data["itemID"].max() + 1)
max_user, max_item, max_user * max_item

(826768, 50211, 41512848048)

In [41]:
import sklearn.model_selection

x_train, x_test = sklearn.model_selection.train_test_split(review_data, test_size=0.25, random_state=12345678)

In [42]:
import scipy.sparse

def df2mat(df):
    m = scipy.sparse.coo_matrix(
        (df["overall"], (df["ID"], df["itemID"])),
        shape=(max_user, max_item),
        dtype=np.float32).tocsc()
    return m, m > 0

x_mat_train, x_mask_train = df2mat(x_train)
x_mat_test,  x_mask_test  = df2mat(x_test)

In [43]:
x_mat_train

<826768x50211 sparse matrix of type '<class 'numpy.float32'>'
	with 993564 stored elements in Compressed Sparse Column format>

In [45]:
user_ids = list(x_train.iloc[:10].ID)
item_ids = list(x_train.iloc[:10].itemID)
[x_mat_train[u,i] for u,i in zip(user_ids, item_ids)]

[4.0, 4.0, 2.0, 4.0, 4.0, 3.0, 1.0, 2.0, 4.0, 5.0]

In [47]:
class SVDModel(object):
    def __init__(self, num_items, num_users, mean,
                 num_factors = 100, init_variance = 0.1):
        self.mu = mean
        self.num_items = num_items
        self.num_users = num_users
        self.num_factors = num_factors
        #dtype = np.float32
        # Deviations, per-item:
        self.b_i = np.zeros((num_items,), dtype=np.float32)
        # Deviations; per-user:
        self.b_u = np.zeros((num_users,), dtype=np.float32)
        # Factor matrices:
        self.q = (np.random.randn(num_factors, num_items) * init_variance)#.astype(dtype=np.float32)
        self.p = (np.random.randn(num_factors, num_users) * init_variance)#.astype(dtype=np.float32)
        # N.B. row I of q is item I's "concepts", so to speak;
        # column U of p is how much user U belongs to each "concept"
    
    def predict(self, items, users):
        """Returns rating prediction for specific items and users.

        Parameters:
        items -- 1D array of item IDs
        users -- 1D array of user IDs (same length as :items:)
        
        Returns:
        ratings -- 1D array of predicted ratings (same length as :items:)
        """
        # Note that we don't multiply p & q like matrices here,
        # but rather, we just do row-by-row dot products.
        # Matrix multiply would give us every combination of item and user,
        # which isn't what we want.
        return self.mu + \
               self.b_i[items] + \
               self.b_u[users] + \
               (self.q[:, items] * self.p[:, users]).sum(axis=0)
    
    def error(self, items, users, ratings, batch_size=256):
        """Predicts over the given items and users, compares with the correct
        ratings, and returns RMSE and MAE.
        
        Parameters:
        items -- 1D array of item IDs
        users -- 1D array of user IDs (same length as :items:)
        ratings -- 1D array of 'correct' item ratings (same length as :items:)
        
        Returns:
        rmse, mae -- Scalars for RMS error and mean absolute error
        """
        sqerr = 0
        abserr = 0
        for i0 in range(0, len(items), batch_size):
            i1 = min(i0 + batch_size, len(items))
            p = self.predict(items[i0:i1], users[i0:i1])
            d = p - ratings[i0:i1]
            sqerr += np.square(d).sum()
            abserr += np.abs(d).sum()
        rmse = np.sqrt(sqerr / items.size)
        mae = abserr / items.size
        return rmse, mae
    
    def update_by_gradient(self, i, u, r_ui, lambda_, gamma):
        """Perform a single gradient-descent update."""
        e_ui = r_ui - self.predict(i, u)
        dbi = gamma * (e_ui - lambda_ * self.b_u[u])
        dbu = gamma * (e_ui - lambda_ * self.b_i[i])
        dpu = gamma * (e_ui * self.q[:,i] - lambda_ * self.p[:, u])
        dqi = gamma * (e_ui * self.p[:,u] - lambda_ * self.q[:, i])
        self.b_i[i] += dbi
        self.b_u[u] += dbu
        self.p[:,u] += dpu
        self.q[:,i] += dqi
        
    def train(self, items, users, ratings, gamma = 0.005, lambda_ = 0.02,
              num_epochs=20, epoch_callback=None):
        """Train with stochastic gradient-descent"""
        for epoch in range(num_epochs):
            t0 = time.time()
            total = 0
            for idx in np.random.permutation(len(items)):
                d = 2000000
                if (idx > 0 and idx % d == 0):
                    total += d
                    dt = time.time() - t0
                    rate = total / dt
                    sys.stdout.write("{:.0f}/s ".format(rate))
                i, u, r_ui = items[idx], users[idx], ratings[idx]
                self.update_by_gradient(i, u, r_ui, lambda_, gamma)
            if epoch_callback: epoch_callback(self, epoch, num_epochs)

In [51]:
review_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1324753 entries, 0 to 1324752
Data columns (total 3 columns):
overall    1324753 non-null float32
ID         1324753 non-null int64
itemID     1324753 non-null int64
dtypes: float32(1), int64(2)
memory usage: 35.4 MB


In [46]:
# Hyperparameters
gamma = 0.002
lambda_ = 0.02
num_epochs = 20
num_factors = 40

In [48]:
item_train = x_train["itemID"].values
users_train = x_train["ID"].values
ratings_train = x_train["overall"].values
item_test = x_test["itemID"].values
users_test = x_test["ID"].values
ratings_test = x_test["overall"].values

def at_epoch(self, epoch, num_epochs):
    train_rmse, train_mae = self.error(item_train, users_train, ratings_train)
    test_rmse, test_mae = self.error(item_test, users_test, ratings_test)
    np.savez_compressed("svd{}".format(num_factors),
                        (self.b_i, self.b_u, self.p, self.q))
    print()
    print("Epoch {:02d}/{}; Training: MAE={:.3f} RMSE={:.3f}, Testing: MAE={:.3f} RMSE={:.3f}".format(epoch + 1, num_epochs, train_mae, train_rmse, test_mae, test_rmse))

In [50]:
svd = SVDModel(max_item, max_user, review_data["overall"].mean(), num_factors=num_factors)
svd.train(item_train, users_train, ratings_train, epoch_callback=at_epoch)


Epoch 01/20; Training: MAE=1.015 RMSE=1.285, Testing: MAE=1.027 RMSE=1.300

Epoch 02/20; Training: MAE=0.991 RMSE=1.256, Testing: MAE=1.014 RMSE=1.285

Epoch 03/20; Training: MAE=0.973 RMSE=1.234, Testing: MAE=1.006 RMSE=1.277

Epoch 04/20; Training: MAE=0.957 RMSE=1.216, Testing: MAE=1.000 RMSE=1.271

Epoch 05/20; Training: MAE=0.943 RMSE=1.200, Testing: MAE=0.996 RMSE=1.267

Epoch 06/20; Training: MAE=0.930 RMSE=1.184, Testing: MAE=0.993 RMSE=1.264

Epoch 07/20; Training: MAE=0.918 RMSE=1.169, Testing: MAE=0.991 RMSE=1.262

Epoch 08/20; Training: MAE=0.906 RMSE=1.154, Testing: MAE=0.989 RMSE=1.260

Epoch 09/20; Training: MAE=0.893 RMSE=1.138, Testing: MAE=0.987 RMSE=1.258

Epoch 10/20; Training: MAE=0.880 RMSE=1.122, Testing: MAE=0.985 RMSE=1.257

Epoch 11/20; Training: MAE=0.866 RMSE=1.104, Testing: MAE=0.984 RMSE=1.256

Epoch 12/20; Training: MAE=0.853 RMSE=1.087, Testing: MAE=0.984 RMSE=1.256

Epoch 13/20; Training: MAE=0.837 RMSE=1.068, Testing: MAE=0.983 RMSE=1.256

Epoch 14/20