# Check training result

In [1]:
TEST_CODE = "A1560731765.232393"
CHOSEN_EPOCH = 150

MODEL_PATH = "../log/{}/models/epoch_{}".format(TEST_CODE, CHOSEN_EPOCH)
LOG_PATH = "../log/{}/log.txt".format(TEST_CODE)

log = open(LOG_PATH).readlines()

Parameter

log[0]

train_log = log[1::2]
test_log = log[2::2]

train_cost = [float(line.split('\t')[1].replace("Total cost = ","")) for line in train_log][:-1]
test_cost = [float(line.split('\t')[1].replace("Total cost = ","")) for line in test_log]
test_RMSE = [float(line.split('\t')[2].replace("RMSE = ","")) for line in test_log]

n_train = int(138493 * 0.9)
n_test = int(138493 * 0.1)

train_cost = [x/n_train for x in train_cost]
test_cost = [x/n_test for x in test_cost]

## Plot

import matplotlib.pyplot as plt

plt.figure(figsize=(16,10))
plt.plot(range(0, len(train_cost)), train_cost, label="train")
plt.plot(range(0, len(test_cost)), test_cost, label="test")
plt.legend()

plt.figure(figsize=(16,10))
plt.plot(range(0, len(test_RMSE)), test_RMSE, label="RMSE")
plt.legend()

# Model

In [2]:
class Args:
    
    def __init__(self):
        self.hidden_neuron = 500
        self.lambda_value = 1.0
        
        self.train_epoch = 100
        self.batch_size = 100
        
        self.optimizer_method = 'Adam' # 'Adam','RMSProp'
        self.grad_clip = False
        self.base_lr = 0.001
        self.decay_epoch_step = 50
        
        self.random_seed = 1000
        self.display_step = 1
        self.save_step = 10
        
    def __str__(self):
        return "hidden_neuron : {}\nlambda_value : {}\ntrain_epoch : {}\nbatch_size : {}\noptimizer_method : {}\ngrad_clip : {}\nbase_lr : {}\ndecay_epoch_step : {}\nrandom_seed : {}\ndisplay_step : {}\n".format(
            self.hidden_neuron,
            self.lambda_value,
            self.train_epoch,
            self.batch_size,
            self.optimizer_method,
            self.grad_clip,
            self.base_lr,
            self.decay_epoch_step,
            self.random_seed,
            self.display_step,
            self.save_step)

args = Args()

In [3]:
import pickle
import os

class Logger:

    def set_default_filename(self, filename):
        self.default_filename = filename

    def create_session_folder(self, path):
        try:  
            os.makedirs(path)
        except OSError:  
            print ("Creation of the directory %s failed" % path)
        else:  
            print ("     ===> Successfully created the directory %s \n" % path)

    def log(self, text):
        with open(self.default_filename, 'a') as f:
            f.writelines(text)
            f.write("\n")

    def save_model(self, model, filename):
        pickle.dump(model, open(filename, 'wb'))
    
    

In [4]:
import tensorflow as tf
import time
import numpy as np
import os
import math
from datetime import datetime

class AutoRec():
    def __init__(self,sess,args,
                      num_users,num_items,
                      R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R,num_train_ratings,num_test_ratings,
                      user_train_set, item_train_set, user_test_set, item_test_set,
                      result_path):

        self.sess = sess
        self.args = args

        self.num_users = num_users
        self.num_items = num_items

        self.R = R
        self.mask_R = mask_R
        self.C = C
        self.train_R = train_R
        self.train_mask_R = train_mask_R
        self.test_R = test_R
        self.test_mask_R = test_mask_R
        self.num_train_ratings = num_train_ratings
        self.num_test_ratings = num_test_ratings

        self.user_train_set = user_train_set
        self.item_train_set = item_train_set
        self.user_test_set = user_test_set
        self.item_test_set = item_test_set

        self.hidden_neuron = args.hidden_neuron
        self.train_epoch = args.train_epoch
        self.batch_size = args.batch_size
        self.num_batch = int(math.ceil(self.num_users / float(self.batch_size)))

        self.base_lr = args.base_lr
        self.optimizer_method = args.optimizer_method
        self.display_step = args.display_step
        self.random_seed = args.random_seed
        self.save_step = args.save_step
        
        self.global_step = tf.Variable(0, trainable=False)
        self.decay_epoch_step = args.decay_epoch_step
        self.decay_step = self.decay_epoch_step * self.num_batch
        self.lr = tf.train.exponential_decay(self.base_lr, self.global_step,
                                                   self.decay_step, 0.96, staircase=True)
        self.lambda_value = args.lambda_value

        self.train_cost_list = []
        self.test_cost_list = []
        self.test_rmse_list = []

        self.result_path = result_path
        self.grad_clip = args.grad_clip

    def run(self):
        self.prepare_model()
        init = tf.global_variables_initializer()
        self.sess.run(init)
        for epoch_itr in range(self.train_epoch):
            self.train_model(epoch_itr)
            self.test_model(epoch_itr)
            
        self.make_records()

    def prepare_model(self):
        self.input_R = tf.placeholder(dtype=tf.float32, shape=[None, self.num_items], name="input_R")
        self.input_mask_R = tf.placeholder(dtype=tf.float32, shape=[None, self.num_items], name="input_mask_R")

        V = tf.get_variable(name="V", initializer=tf.truncated_normal(shape=[self.num_items, self.hidden_neuron],
                                         mean=0, stddev=0.03),dtype=tf.float32)
        W = tf.get_variable(name="W", initializer=tf.truncated_normal(shape=[self.hidden_neuron, self.num_items],
                                         mean=0, stddev=0.03),dtype=tf.float32)
        mu = tf.get_variable(name="mu", initializer=tf.zeros(shape=self.hidden_neuron),dtype=tf.float32)
        b = tf.get_variable(name="b", initializer=tf.zeros(shape=self.num_items), dtype=tf.float32)

        pre_Encoder = tf.matmul(self.input_R,V) + mu
        self.Encoder = tf.nn.sigmoid(pre_Encoder)
        pre_Decoder = tf.matmul(self.Encoder,W) + b
        self.Decoder = tf.identity(pre_Decoder)

        pre_rec_cost = tf.multiply((self.input_R - self.Decoder) , self.input_mask_R)
        rec_cost = tf.square(self.l2_norm(pre_rec_cost))
        pre_reg_cost = tf.square(self.l2_norm(W)) + tf.square(self.l2_norm(V))
        reg_cost = self.lambda_value * 0.5 * pre_reg_cost

        self.cost = rec_cost + reg_cost

        if self.optimizer_method == "Adam":
            optimizer = tf.train.AdamOptimizer(self.lr)
        elif self.optimizer_method == "RMSProp":
            optimizer = tf.train.RMSPropOptimizer(self.lr)
        else:
            raise ValueError("Optimizer Key ERROR")

        if self.grad_clip:
            gvs = optimizer.compute_gradients(self.cost)
            capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gvs]
            self.optimizer = optimizer.apply_gradients(capped_gvs, global_step=self.global_step)
        else:
            self.optimizer = optimizer.minimize(self.cost, global_step=self.global_step)
            
        self.saver = tf.train.Saver(max_to_keep=None)

    def train_model(self,itr):
        start_time = time.time()
        random_perm_doc_idx = np.random.permutation(self.num_users)

        batch_cost = 0
        for i in range(self.num_batch):
            if i == self.num_batch - 1:
                batch_set_idx = random_perm_doc_idx[i * self.batch_size:]
            elif i < self.num_batch - 1:
                batch_set_idx = random_perm_doc_idx[i * self.batch_size : (i+1) * self.batch_size]

            _, Cost = self.sess.run(
                [self.optimizer, self.cost],
                feed_dict={self.input_R: self.train_R[batch_set_idx, :],
                           self.input_mask_R: self.train_mask_R[batch_set_idx, :]})

            batch_cost = batch_cost + Cost
        self.train_cost_list.append(batch_cost)

        if (itr+1) % self.display_step == 0:
            print ("Training //", "Epoch %d //" % (itr), " Total cost = {:.2f}".format(batch_cost),
               "Elapsed time : %d sec" % (time.time() - start_time))

    def test_model(self,itr):
        start_time = time.time()
        Cost,Decoder = self.sess.run(
            [self.cost,self.Decoder],
            feed_dict={self.input_R: self.test_R,
                       self.input_mask_R: self.test_mask_R})

        self.test_cost_list.append(Cost)

        if (itr+1) % self.display_step == 0:
            Estimated_R = Decoder.clip(min=1, max=5)
            unseen_user_test_list = list(self.user_test_set - self.user_train_set)
            unseen_item_test_list = list(self.item_test_set - self.item_train_set)

            for user in unseen_user_test_list:
                for item in unseen_item_test_list:
                    if self.test_mask_R[user,item] == 1: # exist in test set
                        Estimated_R[user,item] = 3

            pre_numerator = np.multiply((Estimated_R - self.test_R), self.test_mask_R)
            numerator = np.sum(np.square(pre_numerator))
            denominator = self.num_test_ratings
            RMSE = np.sqrt(numerator / float(denominator))

            self.test_rmse_list.append(RMSE)

            print ("Testing //", "Epoch %d //" % (itr), " Total cost = {:.2f}".format(Cost), " RMSE = {:.5f}".format(RMSE),
                   "Elapsed time : %d sec" % (time.time() - start_time))
            print ("=" * 100)

    def make_records(self):
        if not os.path.exists(self.result_path):
            os.makedirs(self.result_path)

        basic_info = self.result_path + "basic_info.txt"
        train_record = self.result_path + "train_record.txt"
        test_record = self.result_path + "test_record.txt"

        with open (train_record,'w') as f:
            f.write(str("Cost:"))
            f.write('\t')
            for itr in range(len(self.train_cost_list)):
                f.write(str(self.train_cost_list[itr]))
                f.write('\t')
            f.write('\n')

        with open (test_record,'w') as g:
            g.write(str("Cost:"))
            g.write('\t')
            for itr in range(len(self.test_cost_list)):
                g.write(str(self.test_cost_list[itr]))
                g.write('\t')
            g.write('\n')

            g.write(str("RMSE:"))
            for itr in range(len(self.test_rmse_list)):
                g.write(str(self.test_rmse_list[itr]))
                g.write('\t')
            g.write('\n')

        with open(basic_info,'w') as h:
            h.write(str(self.args))

    def l2_norm(self,tensor):
        return tf.sqrt(tf.reduce_sum(tf.square(tensor)))

        
    def custom_run(self):
        
        # Log parameters
        self.prepare_model()
    
    def predict(self, rating, mask_rating):
        start_time = time.time()
        Cost,Decoder = self.sess.run(
            [self.cost,self.Decoder],
            feed_dict={self.input_R: rating,
                       self.input_mask_R: mask_rating})

        Estimated_R = Decoder.clip(min=1, max=5)
        return Estimated_R, Cost


In [5]:
import numpy as np

def read_rating(path, num_users, num_items,num_total_ratings, a, b, train_ratio):
    fp = open(path + "ratings.dat")

    user_train_set = set()
    user_test_set = set()
    item_train_set = set()
    item_test_set = set()

    R = np.zeros((num_users,num_items))
    mask_R = np.zeros((num_users, num_items))
    C = np.ones((num_users, num_items)) * b

    train_R = np.zeros((num_users, num_items))
    test_R = np.zeros((num_users, num_items))

    train_mask_R = np.zeros((num_users, num_items))
    test_mask_R = np.zeros((num_users, num_items))

    random_perm_idx = np.random.permutation(num_total_ratings)
    train_idx = random_perm_idx[0:int(num_total_ratings*train_ratio)]
    test_idx = random_perm_idx[int(num_total_ratings*train_ratio):]

    num_train_ratings = len(train_idx)
    num_test_ratings = len(test_idx)

    lines = fp.readlines()
    for line in lines:
        user,item,rating,_ = line.split("::")
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        R[user_idx,item_idx] = int(rating)
        mask_R[user_idx,item_idx] = 1
        C[user_idx,item_idx] = a

    ''' Train '''
    for itr in train_idx:
        line = lines[itr]
        user,item,rating,_ = line.split("::")
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        train_R[user_idx,item_idx] = int(rating)
        train_mask_R[user_idx,item_idx] = 1

        user_train_set.add(user_idx)
        item_train_set.add(item_idx)

    ''' Test '''
    for itr in test_idx:
        line = lines[itr]
        user, item, rating, _ = line.split("::")
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        test_R[user_idx, item_idx] = int(rating)
        test_mask_R[user_idx, item_idx] = 1

        user_test_set.add(user_idx)
        item_test_set.add(item_idx)

    return R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R,num_train_ratings,num_test_ratings,\
user_train_set,item_train_set,user_test_set,item_test_set

## Check Precision at K

In [6]:
data_name = 'ml-1m'; num_users = 6040; num_items = 3952; num_total_ratings = 1000209; train_ratio = 0.9
path = "../data/%s" % data_name + "/"

In [7]:
R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R,num_train_ratings,num_test_ratings,\
user_train_set,item_train_set,user_test_set,item_test_set \
    = read_rating(path, num_users, num_items,num_total_ratings, 1, 0, train_ratio)

In [8]:
# Try to reduce size 

rating = test_R[::5]
mask_rating = test_mask_R[::5]

## ======

In [9]:
# Random seeding
tf.set_random_seed(args.random_seed)
np.random.seed(args.random_seed)

# # Detail about dataset
# path = "data/intersect-20m"
# num_users = 138493
# num_items = 15085
# train_ratio = 0.9

config = tf.ConfigProto()
config.gpu_options.allow_growth = True

In [10]:
sess = tf.Session()

model = AutoRec(sess, args,
                  num_users, num_items,
                  None, None, [0], None, None, None, None,
                  None, None, None, None, None, None, None)

model.custom_run()

Instructions for updating:
Colocations handled automatically by placer.


In [11]:
# Add ops to save and restore all the variables.
# saver = tf.train.Saver()
model.saver = tf.train.import_meta_graph(MODEL_PATH + ".meta")
model.saver.restore(sess, MODEL_PATH)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ../log/A1560731765.232393/models/epoch_150


In [12]:
prediction, cost = model.predict(rating, mask_rating)

In [13]:
import pandas as pd

prediction_df = pd.DataFrame(prediction)
prediction_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3942,3943,3944,3945,3946,3947,3948,3949,3950,3951
0,2.204063,2.281824,2.508855,2.636656,2.050621,2.901225,3.230408,2.889808,2.309319,2.794814,...,2.556404,2.196158,1.372741,2.698746,3.136119,2.222463,3.280812,3.128203,3.574883,3.828004
1,2.007906,2.595879,2.619907,2.574022,2.427499,2.542960,3.255998,2.781411,2.458611,2.793350,...,2.257682,2.074157,1.533782,2.526112,3.113491,2.417491,3.318329,3.022029,3.543466,3.578187
2,2.114829,2.359192,2.643916,2.813403,2.595495,2.723856,3.016824,3.007590,2.297531,3.218421,...,2.237384,2.257059,1.480815,2.558953,3.193767,2.758927,3.174821,3.139549,3.540039,3.519706
3,1.853091,2.455603,2.530600,2.464247,2.292834,2.792181,3.082422,2.793947,2.355839,2.781766,...,2.360770,2.087047,1.532710,2.577392,3.141732,2.226562,3.208303,2.968581,3.497815,3.287692
4,1.987429,2.584705,2.883335,2.449319,2.244421,2.645645,3.239743,2.879542,2.380621,2.765424,...,2.521222,2.111284,1.558431,2.639662,3.083656,2.339619,3.165364,3.087430,3.522880,3.509151
5,1.700457,2.690558,2.037353,2.140140,1.805658,2.545072,3.193014,2.716723,2.092128,2.622052,...,2.265531,1.953248,1.654615,2.287782,3.175575,2.671960,3.250857,2.813591,3.371726,3.238304
6,2.081212,2.517549,2.527244,2.548798,2.043949,2.456412,3.038711,2.703892,2.274828,3.033396,...,2.526461,2.005273,1.477685,2.373322,3.193669,2.247551,3.251569,2.970160,3.493636,3.497566
7,2.559250,2.406337,3.107833,2.323143,1.877866,2.559731,3.120678,2.767956,2.300587,3.159791,...,2.345009,2.075627,1.579844,2.417919,2.855128,2.326571,3.287862,2.872420,3.347455,2.663277
8,1.793790,2.551028,2.482888,2.426158,2.206209,2.672806,3.102398,2.775792,2.339990,2.770964,...,2.429587,2.110584,1.516062,2.565366,3.115144,2.308108,3.222681,2.984219,3.520897,3.363988
9,2.051970,2.641245,2.647405,2.486811,2.405931,2.916700,3.254618,3.029687,2.443018,2.727676,...,2.442813,2.204472,1.513645,2.593791,3.107909,2.858235,2.960285,3.087422,3.492496,3.061271


### get list of non zeros from test data

In [14]:
nonzero = rating.nonzero()
nonzero

(array([   0,    0,    0, ..., 1207, 1207, 1207]),
 array([ 587,  593,  607, ..., 3545, 3547, 3573]))

### find which row has rated item <sample>

In [15]:
avail_rating = []
for i in range(0, len(nonzero[0])):
    if nonzero[1][i] == 1544:
        avail_rating.appe nd(nonzero[0][i])
        
avail_rating

SyntaxError: invalid syntax (<ipython-input-15-0c8b6c59cfb5>, line 4)

In [None]:
import numpy as np
from sklearn.preprocessing import normalize

In [None]:
sample_column = prediction_df.loc[:, 1544]
sample_column = np.array(sample_column)

### Try to normalize

In [None]:
# sample_column = sample_column - np.mean(sample_column)
# sample_column = sample_column * 100000

sample_column = sample_column.reshape(1, -1)
sample_column.shape

### Manually rated

In [None]:
for x in avail_rating:
    print(sample_column[x])

In [None]:
rating[0][1544]

In [None]:
rating[59][1544]

### Plot

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(16,10))
plt.plot(range(0, len(sample_column)), sample_column)
plt.show()

THE DECODER SHOULDN'T BE CLIPPED AT 0.2 ??

THOSE RATED > 0.3 MOSLTY IS TRAIN DATA (RATED) NOT GENERATED..

if not clipped stuck at weird position (?) 

# Sort rating

In [None]:
prediction_df.iloc[100].sort_values(ascending=False)[:10]

In [None]:
prediction_df.iloc[105].sort_values(ascending=False)[:10]

# Sort rating

In [39]:
def getIndices(user,qua):
    return set(prediction_df.iloc[user].sort_values(ascending=False)[:qua].index)

In [44]:
isect = getIndices(np.random.randint(1,200),17)
print(sorted(isect))
for i in range(15):
    st = getIndices(np.random.randint(1,200),17)
    print(sorted(st))
    isect = isect.intersection(st)
print(sorted(isect),len(isect))

[438, 667, 1116, 1583, 1829, 2027, 2072, 2196, 2359, 2904, 3089, 3171, 3231, 3241, 3381, 3409, 3678]
[438, 667, 1116, 1302, 1829, 1899, 2072, 2196, 2395, 2904, 3231, 3241, 3381, 3409, 3414, 3469, 3678]
[438, 667, 1116, 1212, 1829, 1899, 2072, 2196, 2761, 2803, 2904, 3231, 3241, 3381, 3409, 3469, 3678]
[438, 667, 1066, 1116, 1829, 1899, 2072, 2196, 2608, 2904, 3231, 3241, 3381, 3409, 3414, 3469, 3678]
[438, 667, 1116, 1829, 1899, 2072, 2196, 2479, 2608, 2904, 3089, 3231, 3241, 3381, 3409, 3577, 3678]
[438, 667, 1066, 1116, 1829, 2072, 2196, 2359, 2904, 3089, 3133, 3231, 3241, 3381, 3409, 3469, 3678]
[438, 667, 1066, 1116, 1283, 1829, 2072, 2196, 2904, 3089, 3231, 3241, 3381, 3409, 3414, 3469, 3678]
[438, 667, 1066, 1116, 1829, 1899, 2072, 2196, 2608, 2904, 3231, 3241, 3381, 3409, 3414, 3469, 3678]
[38, 438, 588, 627, 667, 954, 1116, 1746, 1829, 2196, 2904, 2967, 3231, 3241, 3381, 3409, 3678]
[438, 667, 1066, 1116, 1829, 1899, 2072, 2196, 2608, 2904, 3089, 3231, 3241, 3381, 3409, 3414, 3

In [34]:
np.random.randint

<module 'numpy.random' from '/home/jessinra/.local/lib/python3.5/site-packages/numpy/random/__init__.py'>