# Testing AutoRec

Original paper

In [1]:
# from Library.Autorec.data_preprocessor import *
# from Library.Autorec.AutoRec import AutoRec

import tensorflow as tf
import time
import argparse
import numpy as np
import pandas as pd

from tqdm import tqdm
from ipywidgets import FloatProgress, IntProgress
from IPython.display import display

current_time = time.time()

## Data preprocessor

In [2]:
from scipy.sparse import csr_matrix

def mask_if_not_zero(matrix):
    
    nonzero_idx = matrix.nonzero()
    keep = np.arange(len(nonzero_idx[0]))
    n_keep = len(keep)

    mask_csr = csr_matrix((np.ones(n_keep), (nonzero_idx[0][keep], nonzero_idx[1][keep])), shape=matrix.shape)
    return mask_csr

In [3]:
from scipy.sparse import lil_matrix, csr_matrix

def train_test_split(data, train_ratio, random_state=55):
    np.random.seed(random_state)

    # Initialize
    train = data.copy()
    test = lil_matrix(data.shape)
    test_users_idx = set()
    test_items_idx = set()
    
    # Usable rating for train / test
    nonzero = data.nonzero()
    n_nonzero = len(nonzero[0])
    
    n_train = int(train_ratio * n_nonzero)
    n_test = n_nonzero - n_train
    
    sampled_idx = np.random.choice(np.arange(n_nonzero), size=n_test, replace=False)
    
    # Create train set
    train_users_idx = set(np.arange(data.shape[0]))
    train_items_idx = set(np.arange(data.shape[1]))


    for idx in tqdm(sampled_idx):

        row = nonzero[0][idx]
        col = nonzero[1][idx]
        
        # Modify matrix
        test[row, col] = train[row, col]        
        train[row, col] = 0 
        
        # Add into test set
        test_users_idx.add(row)
        test_items_idx.add(col)

    return train, test, n_train, n_test, \
           train_users_idx, train_items_idx, \
           test_users_idx, test_items_idx

In [4]:
import numpy as np
import pandas as pd
import pickle
from scipy import sparse

def read_rating(path, train_ratio):
    
    filename = path + "/ratings.csr"
    R = pickle.load(open(filename, 'rb'))
    
    train_R, test_R, n_train_R, n_test_R, train_users_idx, train_items_idx, test_users_idx, test_items_idx = train_test_split(R, train_ratio=train_ratio, random_state=55)
    
    mask_R = mask_if_not_zero(R)
    train_mask_R = mask_if_not_zero(train_R)
    test_mask_R = mask_if_not_zero(test_R)

    return  R, mask_R, train_R, train_mask_R, test_R, test_mask_R, n_train_R, n_test_R, train_users_idx, train_items_idx, test_users_idx, test_items_idx

## Argument

In [5]:
class Args:
    
    def __init__(self):
        self.hidden_neuron = 1000
        self.lambda_value = 1.0
        
        self.train_epoch = 200
        self.batch_size = 2048
        
        self.optimizer_method = 'Adam' # 'Adam','RMSProp'
        self.grad_clip = False
        self.base_lr = 0.003
        self.decay_epoch_step = 50
        
        self.random_seed = 1000
        self.display_step = 1
        self.save_step = 5
        
    def __str__(self):
        return "hidden_neuron : {}\nlambda_value : {}\ntrain_epoch : {}\nbatch_size : {}\noptimizer_method : {}\ngrad_clip : {}\nbase_lr : {}\ndecay_epoch_step : {}\nrandom_seed : {}\ndisplay_step : {}\n".format(
            self.hidden_neuron,
            self.lambda_value,
            self.train_epoch,
            self.batch_size,
            self.optimizer_method,
            self.grad_clip,
            self.base_lr,
            self.decay_epoch_step,
            self.random_seed,
            self.display_step,
            self.save_step)

args = Args()

## Logger

import pickle
import os

class Logger:

    def set_default_filename(self, filename):
        self.default_filename = filename

    def create_session_folder(self, path):
        try:  
            os.makedirs(path)
        except OSError:  
            print ("Creation of the directory %s failed" % path)
        else:  
            print ("     ===> Successfully created the directory %s \n" % path)

    def log(self, text):
        with open(self.default_filename, 'a') as f:
            f.writelines(text)
            f.write("\n")

    def save_model(self, model, filename):
        pickle.dump(model, open(filename, 'wb'))

## Model

import tensorflow as tf
import time
import numpy as np
import os
import math
from datetime import datetime
from scipy.sparse import lil_matrix, csr_matrix, vstack
from tqdm import tqdm

class AutoRec():
    def __init__(self, sess, args,
                 num_users, num_items,
                 R, mask_R, train_R, train_mask_R, test_R, test_mask_R, num_train_ratings, num_test_ratings,
                 user_train_set, item_train_set, user_test_set, item_test_set):

        self.sess = sess
        self.args = args

        self.num_users = num_users
        self.num_items = num_items

        self.R = R
        self.mask_R = mask_R
        self.train_R = train_R
        self.train_mask_R = train_mask_R
        self.test_r = test_R
        self.test_mask_R = test_mask_R
        self.num_train_ratings = num_train_ratings
        self.num_test_ratings = num_test_ratings

        self.user_train_set = user_train_set
        self.item_train_set = item_train_set
        self.user_test_set = user_test_set
        self.item_test_set = item_test_set

        self.hidden_neuron = args.hidden_neuron
        self.train_epoch = args.train_epoch
        self.batch_size = args.batch_size
        self.num_batch = int(
            math.ceil(self.num_users / float(self.batch_size)))

        self.base_lr = args.base_lr
        self.optimizer_method = args.optimizer_method
        self.display_step = args.display_step
        self.random_seed = args.random_seed

        self.global_step = tf.Variable(0, trainable=False)
        self.decay_epoch_step = args.decay_epoch_step
        self.decay_step = self.decay_epoch_step * self.num_batch
        self.lr = tf.train.exponential_decay(self.base_lr, self.global_step,
                                             self.decay_step, 0.96, staircase=True)
        self.lambda_value = args.lambda_value

        self.train_cost_list = []
        self.test_cost_list = []
        self.test_rmse_list = []

        self.grad_clip = args.grad_clip
        
        self.timestamp = str(datetime.timestamp(datetime.now()))
        self.logger = Logger()
        self.session_log_path = "../log/{}/".format(self.timestamp)
        self.logger.create_session_folder(self.session_log_path)
        self.logger.set_default_filename(self.session_log_path + "log.txt")

    def run(self):
        
        # Log parameters
        self.logger.log(str(self.args))
        self.prepare_model()
        
        init = tf.global_variables_initializer()
        self.sess.run(init)
        
        for epoch_itr in (range(self.train_epoch)):
            
            self.train_model(epoch_itr)
            self.test_model(epoch_itr)
            
            # Save the variables to disk.
            if epoch_iter % self.save_step == 0:
                self.saver.save(self.sess, self.session_log_path + "models/epoch_{}".format(epoch_itr))
            
        self.make_records()


    def prepare_model(self):
        self.input_R = tf.placeholder(dtype=tf.float32, shape=[
                                      None, self.num_items], name="input_R")
        self.input_mask_R = tf.placeholder(
            dtype=tf.float32, shape=[None, self.num_items], name="input_mask_R")

        V = tf.get_variable(name="V", initializer=tf.truncated_normal(shape=[self.num_items, self.hidden_neuron],
                                                                      mean=0, stddev=0.03), dtype=tf.float32)
        W = tf.get_variable(name="W", initializer=tf.truncated_normal(shape=[self.hidden_neuron, self.num_items],
                                                                      mean=0, stddev=0.03), dtype=tf.float32)
        mu = tf.get_variable(name="mu", initializer=tf.zeros(
            shape=self.hidden_neuron), dtype=tf.float32)
        b = tf.get_variable(name="b", initializer=tf.zeros(
            shape=self.num_items), dtype=tf.float32)

        pre_Encoder = tf.matmul(self.input_R, V) + mu
        self.Encoder = tf.nn.sigmoid(pre_Encoder)
        
        pre_Decoder = tf.matmul(self.Encoder, W) + b
        self.decoder = tf.identity(pre_Decoder)

        pre_rec_cost = tf.multiply(
            (self.input_R - self.decoder), self.input_mask_R)
        rec_cost = tf.square(self.l2_norm(pre_rec_cost))
        pre_reg_cost = tf.square(self.l2_norm(W)) + tf.square(self.l2_norm(V))
        reg_cost = self.lambda_value * 0.5 * pre_reg_cost

        self.cost = rec_cost + reg_cost

        if self.optimizer_method == "Adam":
            optimizer = tf.train.AdamOptimizer(self.lr)
        elif self.optimizer_method == "RMSProp":
            optimizer = tf.train.RMSPropOptimizer(self.lr)
        else:
            raise ValueError("Optimizer Key ERROR")

        if self.grad_clip:
            gvs = optimizer.compute_gradients(self.cost)
            capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var)
                          for grad, var in gvs]
            self.optimizer = optimizer.apply_gradients(
                capped_gvs, global_step=self.global_step)
        else:
            self.optimizer = optimizer.minimize(
                self.cost, global_step=self.global_step)
            
        self.saver = tf.train.Saver(max_to_keep=None)

    def train_model(self, itr):
        start_time = time.time()
        random_perm_doc_idx = np.random.permutation(self.num_users)

        batch_cost = 0
        for i in tqdm(range(self.num_batch)):

            if i >= self.num_batch - 1:
                batch_set_idx = random_perm_doc_idx[i * self.batch_size:]
            else:
                batch_set_idx = random_perm_doc_idx[i * self.batch_size: (i + 1) * self.batch_size]

            _, cost = self.sess.run(
                [self.optimizer, self.cost],
                feed_dict={self.input_R: self.train_R[batch_set_idx, :].todense(),
                           self.input_mask_R: self.train_mask_R[batch_set_idx, :].todense()})

            batch_cost = batch_cost + cost

        self.train_cost_list.append(batch_cost)

        if (itr + 1) % self.display_step == 0:
            self.logger.log(
                "Training Epoch {}\tTotal cost = {:.2f}\tElapsed time : {} sec".format(
                    itr, batch_cost, (time.time() - start_time)))
            
            print(
                "===== Training =====\n"
                "Epoch {} \t Total cost = {:.2f}\n"
                "Elapsed time : {} sec\n".format(
                    itr, batch_cost, (time.time() - start_time)))

    def test_model(self, itr):
        start_time = time.time()

        batch_cost = 0
        numerator = 0
        
        for i in tqdm(range(self.num_batch)):

            # Batching idx
            batch_start_idx = i * self.batch_size
            if i >= self.num_batch - 1:
                batch_stop_idx = batch_start_idx + \
                                 (self.num_users - 1) % self.batch_size + 1
            else:
                batch_stop_idx = (i + 1) * self.batch_size

            cost, decoder = self.sess.run(
                [self.cost, self.decoder],
                feed_dict={self.input_R: self.test_r[batch_start_idx:batch_stop_idx].todense(),
                           self.input_mask_R: self.test_mask_R[batch_start_idx:batch_stop_idx].todense()})
            
            batch_cost += cost

            # Make prediction if need to show
            if (itr + 1) % self.display_step == 0:

                batch_predict_r = csr_matrix(decoder.clip(min=0.2, max=1))

                # Some statistic
                predicted_rating_delta = batch_predict_r - self.test_r[batch_start_idx:batch_stop_idx]
                pre_numerator = self.test_mask_R[batch_start_idx:batch_stop_idx].multiply(predicted_rating_delta)
                numerator += np.sum(pre_numerator.data ** 2)

        self.test_cost_list.append(batch_cost)

        # Make prediction if need to show
        if (itr + 1) % self.display_step == 0:

            denominator = self.num_test_ratings
            RMSE = np.sqrt(numerator / float(denominator))
            self.test_rmse_list.append(RMSE)

            self.logger.log(
                "Testing Epoch {}\tTotal cost = {:.2f}\tRMSE = {:.5f}\tElapsed time : {} sec".format(
                    itr, batch_cost, RMSE, (time.time() - start_time)))

            print(
                "===== Testing =====\n"
                "Epoch {} \t Total cost = {:.2f}\n"
                "RMSE = {:.5f} \t Elapsed time : {} sec\n".format(
                    itr, batch_cost, RMSE, (time.time() - start_time)))

    def make_records(self):

        basic_info = self.session_log_path + "basic_info.txt"
        train_record = self.session_log_path + "train_record.txt"
        test_record = self.session_log_path + "test_record.txt"

        with open(train_record, 'w') as f:
            f.write(str("cost:"))
            f.write('\t')
            for itr in range(len(self.train_cost_list)):
                f.write(str(self.train_cost_list[itr]))
                f.write('\t')
            f.write('\n')

        with open(test_record, 'w') as g:
            g.write(str("cost:"))
            g.write('\t')
            for itr in range(len(self.test_cost_list)):
                g.write(str(self.test_cost_list[itr]))
                g.write('\t')
            g.write('\n')

            g.write(str("RMSE:"))
            for itr in range(len(self.test_rmse_list)):
                g.write(str(self.test_rmse_list[itr]))
                g.write('\t')
            g.write('\n')

        with open(basic_info, 'w') as h:
            h.write(str(self.args))

    def l2_norm(self, tensor):
        return tf.sqrt(tf.reduce_sum(tf.square(tensor)))
   

## Model config

In [8]:
##### tf.set_random_seed(args.random_seed)
np.random.seed(args.random_seed)

path = "data/intersect-20m"

num_users = 138493;  
num_items = 15440; 
# num_total_ratings = 14094614;
train_ratio = 0.9

# Limit GPU usage
config = tf.ConfigProto()
config.gpu_options.allow_growth=True

## Preprocess data

In [9]:
R, mask_R, train_R, train_mask_R, eval_R, eval_mask_R, n_train_R, n_eval_R, train_users_idx, train_items_idx, eval_users_idx, eval_items_idx = read_rating(path, train_ratio)

100%|██████████| 1409462/1409462 [02:47<00:00, 8419.26it/s]


In [23]:
n_train_R = len(R.nonzero()[0]) - n_eval_R

In [13]:
separator = int(R.shape[0] * 0.9)

test_R = R[separator:]
test_mask_R = mask_R[separator:]
test_train_R = train_R[separator:]
test_train_mask_R = train_mask_R[separator:]
test_eval_R = eval_R[separator:]
test_eval_mask_R = eval_mask_R[separator:]

R = R[:separator]
mask_R = mask_R[:separator]
train_R = train_R[:separator]
train_mask_R = train_mask_R[:separator]
eval_R = eval_R[:separator]
eval_mask_R = eval_mask_R[:separator]

In [20]:
n_test_eval_R = len(test_eval_R.nonzero()[0]) # non zero
n_test_train_R = len(test_R.nonzero()[0]) - n_test_eval_R # non zero from R - non zero eval R

In [24]:
filename = "{}/preprocessed_autorec_dataset".format(path)
pickle.dump((R, mask_R, train_R, train_mask_R, eval_R, eval_mask_R, n_train_R, n_eval_R, train_users_idx, train_items_idx, eval_users_idx, eval_items_idx), open(filename, 'wb'))

In [25]:
filename = "{}/preprocessed_autorec_dataset_test".format(path)
pickle.dump((test_R, test_mask_R, test_train_R, test_train_mask_R, test_eval_R, test_eval_mask_R, n_test_train_R, n_test_eval_R, train_users_idx, train_items_idx, eval_users_idx, eval_items_idx), open(filename, 'wb'))

## Load preprocessed dataset

In [26]:
filename = "{}/preprocessed_autorec_dataset".format(path)
R, mask_R, train_R, train_mask_R, eval_R, eval_mask_R, n_train_R, n_eval_R, train_users_idx, train_items_idx, eval_users_idx, eval_items_idx = pickle.load(open(filename, 'rb'))

In [27]:
R

<124643x15440 sparse matrix of type '<class 'numpy.float64'>'
	with 12702930 stored elements in Compressed Sparse Row format>

## Run model

with tf.Session(config=config) as sess:
    AutoRec = AutoRec(sess, args,
                      num_users, num_items,
                      R, mask_R, train_R, train_mask_R, eval_R, eval_mask_R,
                      n_train_R, n_eval_R,
                      train_users_idx, train_items_idx,
                      eval_users_idx, eval_items_idx)
    AutoRec.run()