In [1]:
# Mount Google Drive at /content/drive/ so files stored there can be reached
# from this Colab runtime.
import os
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [8]:
cd "/content/drive/My Drive/Code/gcmc/"

/content/drive/My Drive/Code/gcmc


In [None]:
!python setup.py install
!pip3 install pickle-mixin

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import scipy.sparse as sp
import random
import argparse
import datetime
import time
import os.path

from sklearn.preprocessing import LabelEncoder
from __future__ import division
from __future__ import print_function

import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()
import sys
import json

from model import RecommenderGAE, RecommenderSideInfoGAE
from utils import construct_feed_dict

In [4]:
# Set random seed
# seed = 123 # use only for unit testing
# The seed comes from the wall clock, so every run shuffles and initialises differently.
seed = int(time.time())
np.random.seed(seed)
tf.set_random_seed(seed)

NUMCLASSES = 5  # number of rating levels; one support matrix is built per level
FEATURES = False  # not read anywhere in this notebook -- presumably the side-feature switch, TODO confirm
SELFCONNECTIONS = False  # when True an identity block is appended to the supports
# Accumulation mode as a string (not the builtin `sum` function), matching the
# accum="sum" value handed to RecommenderGAE.
ACCUM = 'sum'
HIDDEN = [500, 75]  # passed to the model as `hidden`
DO = 0.7 #dropout
verbose = True

# No user/item side features are available at this point.
u_features = None
v_features = None

In [9]:
# Print the path of every file found under the Netflix data directory.
netflix_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('./data/netflix')
    for name in names
]
for path in netflix_files:
    print(path)

./data/netflix/combined_data_1.txt
./data/netflix/combined_data_2.txt
./data/netflix/combined_data_3.txt
./data/netflix/.DS_Store


In [10]:
%time

# DataFrame to store all imported data
if not os.path.isfile('./data/netflix/data.csv'):
    data = open('./data/netflix/data.csv', mode='w')

    files = ['./data/netflix/combined_data_1.txt',
            './data/netflix/combined_data_2.txt',
            './data/netflix/combined_data_3.txt'
            #'./data/netflix/combined_data_4.txt'
            ]

    # Remove the line with movie_id: and add a new column of movie_id
    # Combine all data files into a csv file
    for file in files:
      print("Opening file: {}".format(file))
      with open(file) as f:
        for line in f:
            line = line.strip()
            if line.endswith(':'):
                movie_id = line.replace(':', '')
            else:
                data.write(movie_id + ',' + line)
                data.write('\n')
    data.close()

CPU times: user 0 ns, sys: 2 µs, total: 2 µs
Wall time: 5.72 µs
Opening file: ./data/netflix/combined_data_1.txt
Opening file: ./data/netflix/combined_data_2.txt
Opening file: ./data/netflix/combined_data_3.txt


In [11]:
# Load the merged ratings file; it has no header row, so column names are supplied here.
column_names = ['movie_id', 'user_id', 'ratings', 'timestamp']
df = pd.read_csv('./data/netflix/data.csv', header=None, names=column_names)
print(df.nunique())
df

movie_id      13367
user_id      479453
ratings           5
timestamp      2182
dtype: int64


Unnamed: 0,movie_id,user_id,ratings,timestamp
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03
...,...,...,...,...
73632979,13367,2339129,4,2002-10-07
73632980,13367,59005,4,2005-01-21
73632981,13367,1789683,5,2005-05-25
73632982,13367,1878798,1,2003-02-17


In [12]:
# Reduce the data to its densest part: the most active users and the most
# frequently rated movies.

# 4000 top users (those who gave the most ratings); less active users are removed
user_rating_counts = df.groupby('user_id')['ratings'].count()
top_users = user_rating_counts.sort_values(ascending=False)[:4000]

# 2000 top movies (those with the most ratings); less popular movies are removed
movie_rating_counts = df.groupby('movie_id')['ratings'].count()
top_movies = movie_rating_counts.sort_values(ascending=False)[:2000]

# The inner joins keep only rows whose user AND movie are in the top lists.
# Each count series gets its own column label so the result has no duplicated
# column names. The user-count column keeps the label `ratings_r` because
# later code drops it by that name.
lite_rating_df = df.join(top_users.rename('ratings_r'), how='inner', on='user_id')
lite_rating_df = lite_rating_df.join(top_movies.rename('movie_ratings_r'), how='inner', on='movie_id')

# Re-label users and movies with consecutive integers starting at 0
user_enc = LabelEncoder()
lite_rating_df['u_nodes'] = user_enc.fit_transform(lite_rating_df['user_id'].values)
movie_enc = LabelEncoder()
lite_rating_df['v_nodes'] = movie_enc.fit_transform(lite_rating_df['movie_id'].values)

n_movies = lite_rating_df['v_nodes'].nunique()
n_users = lite_rating_df['u_nodes'].nunique()

lite_rating_df

Unnamed: 0,movie_id,user_id,ratings,timestamp,ratings_r,ratings_r.1,u_nodes,v_nodes
5149,8,1488844,4,2005-05-12,1634,14910,2252,0
5883,8,1227322,5,2005-05-31,3552,14910,1870,0
6183,8,525356,4,2005-08-26,3669,14910,800,0
6310,8,401047,2,2005-06-15,1569,14910,597,0
7140,8,883478,5,2005-10-10,2391,14910,1330,0
...,...,...,...,...,...,...,...,...
14084763,2705,1686713,3,2002-10-25,1347,6874,2546,387
14086177,2705,753596,2,2004-05-14,1333,6874,1122,387
14083158,2705,1170335,3,2004-12-12,1442,6874,1774,387
14083299,2705,666500,4,2002-12-18,1132,6874,1000,387


In [13]:
# Keep only the three columns needed downstream, in (user, item, rating) order.
# Selecting them directly, rather than first dropping the other columns by
# name, gives the same frame and lets this cell be re-run without a KeyError.
lite_rating_df = lite_rating_df[["u_nodes", "v_nodes", "ratings"]]
lite_rating_df.to_csv('./data/netflix/data_clean.csv', index=False)

In [14]:
# Target dtype of each column of the cleaned ratings file.
dtypes = {
    'u_nodes': np.int64, 'v_nodes': np.int64,
    'ratings': np.float32}

# `dtype=` parses each column straight into its type with the default parser.
# `converters=` is meant for per-value callables and, combined with
# engine='python', converts every value through a Python call.
data = pd.read_csv('./data/netflix/data_clean.csv', dtype=dtypes)
data

Unnamed: 0,u_nodes,v_nodes,ratings
0,2252,0,4.0
1,1870,0,5.0
2,800,0,4.0
3,597,0,2.0
4,1330,0,5.0
...,...,...,...
3830628,2546,387,3.0
3830629,1122,387,2.0
3830630,1774,387,3.0
3830631,1000,387,4.0


In [15]:
# Shuffle the rating rows. The rows are converted to a plain list first,
# otherwise random.shuffle acts weird on it without a warning.
shuffled_rows = data.values.tolist()
random.seed(seed)
random.shuffle(shuffled_rows)
data_array = np.array(shuffled_rows)

# Columns are (user, item, rating); cast each one back to its declared dtype.
u_nodes_ratings, v_nodes_ratings, ratings = (
    data_array[:, col].astype(dtypes[name])
    for col, name in enumerate(('u_nodes', 'v_nodes', 'ratings'))
)

# Map data to proper indices in case they are not in a contiguous [0, N) range
def map_data(data):
    """
    Re-index the values of `data` onto the consecutive integers 0..n-1.

    The distinct values are ranked in ascending order and every element is
    replaced by the rank of its value.

    Args:
        data: 1-D array or sequence of sortable ids.

    Returns:
        A tuple (data, id_dict, n) where
        data is an integer array holding the new index of every element,
        id_dict maps each original value to its new index, and
        n is the number of distinct values.
    """
    # np.unique returns the sorted distinct values together with, for every
    # element, the position of its value in that sorted array.
    uniq, inverse = np.unique(np.asarray(data), return_inverse=True)

    id_dict = {old: new for new, old in enumerate(uniq.tolist())}

    return inverse.reshape(-1), id_dict, len(uniq)

# Re-index users and items onto dense 0-based ranges.
u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

# Pin the dtypes used from here on.
ratings = ratings.astype(np.float32)
v_nodes_ratings = v_nodes_ratings.astype(np.int64)
u_nodes_ratings = u_nodes_ratings.astype(np.int64)

if verbose:
    num_links = ratings.shape[0]
    # share of all (user, item) cells that hold a rating
    link_fraction = float(num_links) / (num_users * num_items)
    print('Number of users = %d' % num_users)
    print('Number of items = %d' % num_items)
    print('Number of links = %d' % num_links)
    print('Fraction of positive links = %.4f' % (link_fraction,))


Number of users = 4000
Number of items = 2000
Number of links = 3830633
Fraction of positive links = 0.4788


In [16]:
import pickle as pkl

# Persist the re-indexed rating triples.
datasplit_path = 'data/' + 'netflix' + '/split_seed' + str('netflix') + '.pickle'

# 'wb' is enough: the file is only written here, never read back.
with open(datasplit_path, 'wb') as f:
    pkl.dump([num_users, num_items, u_nodes_ratings, v_nodes_ratings, ratings, u_features, v_features], f)

# Label stored for (user, item) cells that have no observed rating.
neutral_rating = -1

# class_values: the sorted distinct rating values.
# rating_classes: for every rating, the position of its value in class_values.
class_values, rating_classes = np.unique(ratings, return_inverse=True)
rating_classes = rating_classes.reshape(-1)
rating_dict = {r: i for i, r in enumerate(class_values.tolist())}

# Dense label matrix (users x items), flattened row-major so that the cell
# (u, v) lives at position u * num_items + v.
labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
labels[u_nodes_ratings, v_nodes_ratings] = rating_classes
labels = labels.reshape([-1])

# number of test and validation edges:
# 10% of all ratings for test, 5% of the remaining 90% for validation
num_test = int(np.ceil(ratings.shape[0] * 0.1))
num_val = int(np.ceil(ratings.shape[0] * 0.9 * 0.05))

num_train = ratings.shape[0] - num_val - num_test

# (user, item) pair of every rating and its position in the flattened labels
pairs_nonzero = np.stack([u_nodes_ratings, v_nodes_ratings], axis=1)
idx_nonzero = u_nodes_ratings * num_items + v_nodes_ratings

# The ratings are already shuffled, so consecutive slices form the split.
train_idx = idx_nonzero[0:num_train]
val_idx = idx_nonzero[num_train:num_train + num_val]
test_idx = idx_nonzero[num_train + num_val:]

train_pairs_idx = pairs_nonzero[0:num_train]
val_pairs_idx = pairs_nonzero[num_train:num_train + num_val]
test_pairs_idx = pairs_nonzero[num_train + num_val:]

u_test_idx, v_test_idx = test_pairs_idx.transpose()
u_val_idx, v_val_idx = val_pairs_idx.transpose()
u_train_idx, v_train_idx = train_pairs_idx.transpose()

# create labels
train_labels = labels[train_idx]
val_labels = labels[val_idx]
test_labels = labels[test_idx]

# testing==true: the validation edges are merged into the training set.
# NOTE(review): val_labels / u_val_idx / v_val_idx are left unchanged, so any
# metric later computed on them is measured on edges the model also trains on.
u_train_idx = np.hstack([u_train_idx, u_val_idx])
v_train_idx = np.hstack([v_train_idx, v_val_idx])
train_labels = np.hstack([train_labels, val_labels])
# for adjacency matrix construction
train_idx = np.hstack([train_idx, val_idx])

# make training adjacency matrix: stores class index + 1 for training edges,
# so 0 means "no training edge"
rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))

rating_mx_train

<4000x2000 sparse matrix of type '<class 'numpy.float32'>'
	with 3447569 stored elements in Compressed Sparse Row format>

In [17]:
# Aliases under the names used by the graph-construction code that follows.
adj_train = rating_mx_train
train_u_indices, train_v_indices = u_train_idx, v_train_idx
val_u_indices, val_v_indices = u_val_idx, v_val_idx
test_u_indices, test_v_indices = u_test_idx, v_test_idx

# The training adjacency matrix has one row per user and one column per item.
num_users, num_items = adj_train.shape
num_side_features = 0

def preprocess_user_item_features(u_features, v_features):
    """
    Pads user and item feature matrices into one shared feature space.

    User rows get one all-zero column per item feature appended on the right;
    item rows get one all-zero column per user feature prepended on the left.
    Both results are CSR matrices with the same number of columns, and they
    are returned as a (users, items) pair.
    """
    n_user_rows, n_user_cols = u_features.shape
    n_item_rows, n_item_cols = v_features.shape

    # all-zero padding blocks, each in the dtype of the matrix it extends
    user_padding = sp.csr_matrix((n_user_rows, n_item_cols), dtype=u_features.dtype)
    item_padding = sp.csr_matrix((n_item_rows, n_user_cols), dtype=v_features.dtype)

    padded_users = sp.hstack([u_features, user_padding], format='csr')
    padded_items = sp.hstack([item_padding, v_features], format='csr')

    return padded_users, padded_items

# no Features: Netflix dataset

# Every node gets an identity (one-hot) feature row;
# scipy.sparse.identity(n, dtype='d', format=None)
u_features = sp.identity(num_users, format='csr')
v_features = sp.identity(num_items, format='csr')
u_features, v_features = preprocess_user_item_features(u_features, v_features)

# global normalization
support = []
support_t = []
adj_train_int = sp.csr_matrix(adj_train, dtype=np.int32)

# adj_train stores class index + 1, so rating levels run from 1 to NUMCLASSES.
for rating_level in range(1, NUMCLASSES + 1):
    # binary adjacency matrix (support) of the edges carrying this rating
    rating_adj = sp.csr_matrix(adj_train_int == rating_level, dtype=np.float32)

    if rating_adj.nnz == 0:
        # yahoo music has dataset split with not all ratings types present in training set.
        # this produces empty adjacency matrices for these ratings.
        sys.exit('ERROR: normalized bipartite adjacency matrix has only zero entries!!!!!')

    support.append(rating_adj)
    support_t.append(rating_adj.T)

def globally_normalize_bipartite_adjacency(adjacencies, verbose=False, symmetric=True):  # from preprocessing.py
    """
    Globally normalizes a set of bipartite adjacency matrices.

    The node degrees are taken from the SUM of all given matrices, and every
    matrix is then scaled with those shared degrees.

    Args:
        adjacencies: iterable of equally shaped (rows x cols) sparse matrices.
        verbose: if True, print a progress message.
        symmetric: if True, return D_u^-1/2 * A * D_v^-1/2 for every A;
            otherwise return the row-normalized D_u^-1 * A.

    Returns:
        A list with one normalized matrix per input matrix, in input order.
        An empty input yields an empty list.
    """
    # Materialise once: the input is walked twice (degrees, then scaling),
    # which would silently exhaust a generator.
    adjacencies = list(adjacencies)
    if not adjacencies:
        return []

    if verbose:
        print('Symmetrically normalizing bipartite adj')

    # degree_u and degree_v are the row and column sums of the summed adjacency.
    # A plain accumulation loop is used because np.sum over a generator is deprecated.
    adj_tot = adjacencies[0]
    for adj in adjacencies[1:]:
        adj_tot = adj_tot + adj
    degree_u = np.asarray(adj_tot.sum(1)).flatten()
    degree_v = np.asarray(adj_tot.sum(0)).flatten()

    # set zeros to inf to avoid dividing by zero (1 / sqrt(inf) == 0)
    degree_u[degree_u == 0.] = np.inf
    degree_v[degree_v == 0.] = np.inf

    degree_u_inv_sqrt = 1. / np.sqrt(degree_u)
    degree_v_inv_sqrt = 1. / np.sqrt(degree_v)
    degree_u_inv_sqrt_mat = sp.diags([degree_u_inv_sqrt], [0])
    degree_v_inv_sqrt_mat = sp.diags([degree_v_inv_sqrt], [0])

    if symmetric:
        adj_norm = [degree_u_inv_sqrt_mat.dot(adj).dot(degree_v_inv_sqrt_mat) for adj in adjacencies]

    else:
        # D_u^-1, only needed for the row-normalized variant
        degree_u_inv = degree_u_inv_sqrt_mat.dot(degree_u_inv_sqrt_mat)
        adj_norm = [degree_u_inv.dot(adj) for adj in adjacencies]

    return adj_norm

# Normalise the per-rating supports and their transposes (symmetric variant).
support, support_t = (
    globally_normalize_bipartite_adjacency(matrices, symmetric=True)
    for matrices in (support, support_t)
)

# SELFCONNECTIONS = False
if SELFCONNECTIONS:
    support.append(sp.identity(u_features.shape[0], format='csr'))
    support_t.append(sp.identity(v_features.shape[0], format='csr'))

# Count the supports before the lists are merged into single matrices.
num_support = len(support)

# Place the individual supports side by side as one wide CSR matrix each.
support, support_t = (
    sp.hstack(blocks, format='csr')
    for blocks in (support, support_t)
)

def _localize_indices(node_indices):
    """
    Re-express global node ids as positions within the set of nodes in use.

    Returns (nodes, node_to_row, local_indices): the sorted distinct node ids,
    a dict mapping each of them to its position in `nodes`, and an array in
    which every entry of `node_indices` is replaced by that position.
    """
    nodes, local_indices = np.unique(node_indices, return_inverse=True)
    node_to_row = {node: row for row, node in enumerate(nodes.tolist())}
    return nodes, node_to_row, local_indices.reshape(-1)


# For each split, only the support rows of the nodes that actually occur are
# kept, and the edge indices are rewritten to point into those row subsets.
# Nodes are ordered by ascending id; the row subset and the rewritten indices
# use the same order.

# Collect all user and item nodes for test set
test_u, test_u_dict, test_u_indices = _localize_indices(test_u_indices)
test_v, test_v_dict, test_v_indices = _localize_indices(test_v_indices)

test_support = support[test_u]
test_support_t = support_t[test_v]

# Collect all user and item nodes for validation set
val_u, val_u_dict, val_u_indices = _localize_indices(val_u_indices)
val_v, val_v_dict, val_v_indices = _localize_indices(val_v_indices)

val_support = support[val_u]
val_support_t = support_t[val_v]

# Collect all user and item nodes for train set
train_u, train_u_dict, train_u_indices = _localize_indices(train_u_indices)
train_v, train_v_dict, train_v_indices = _localize_indices(train_v_indices)

train_support = support[train_u]
train_support_t = support_t[train_v]

# No side information: Netflix has no side features, so every
# side-feature input stays None.

test_u_features_side = None
test_v_features_side = None

val_u_features_side = None
val_v_features_side = None

train_u_features_side = None
train_v_features_side = None

# TF1 graph inputs handed to the model and to construct_feed_dict.
placeholders = {
    # sparse feature matrices, with static shapes taken from u_features / v_features
    'u_features': tf.sparse_placeholder(tf.float32, shape=np.array(u_features.shape, dtype=np.int64)),
    'v_features': tf.sparse_placeholder(tf.float32, shape=np.array(v_features.shape, dtype=np.int64)),
    # scalar integers -- presumably the number of stored feature entries, TODO confirm in utils
    'u_features_nonzero': tf.placeholder(tf.int32, shape=()),
    'v_features_nonzero': tf.placeholder(tf.int32, shape=()),
    # 1-D integer labels of variable length
    'labels': tf.placeholder(tf.int32, shape=(None,)),

    # dense side features; num_side_features is 0 in this notebook
    'u_features_side': tf.placeholder(tf.float32, shape=(None, num_side_features)),
    'v_features_side': tf.placeholder(tf.float32, shape=(None, num_side_features)),

    # 1-D integer index vectors of variable length
    'user_indices': tf.placeholder(tf.int32, shape=(None,)),
    'item_indices': tf.placeholder(tf.int32, shape=(None,)),

    # one float per distinct rating value
    'class_values': tf.placeholder(tf.float32, shape=class_values.shape),

    # scalars that default to 0 when they are not fed
    'dropout': tf.placeholder_with_default(0., shape=()),
    'weight_decay': tf.placeholder_with_default(0., shape=()),

    # sparse support matrices; the shape is left open
    'support': tf.sparse_placeholder(tf.float32, shape=(None, None)),
    'support_t': tf.sparse_placeholder(tf.float32, shape=(None, None)),
}

# Build the recommender model on top of the placeholders.
model = RecommenderGAE(
    placeholders,
    input_dim=u_features.shape[1],
    num_classes=NUMCLASSES,
    num_support=num_support,
    self_connections=SELFCONNECTIONS,
    num_basis_functions=2,
    hidden=HIDDEN,
    num_users=num_users,
    num_items=num_items,
    accum="sum",
    learning_rate=0.01,
    logging=True,
)

def sparse_to_tuple(sparse_mx):
    """ Unpacks a scipy sparse matrix into a (coords, values, shape) triple,
    the form in which sparse matrices are linked to sparse placeholders in a
    feed_dict. `coords` has one (row, col) pair per stored entry and `values`
    holds the matching entries. """

    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    coords = np.vstack([coo.row, coo.col]).T
    return coords, coo.data, coo.shape

# Turn every sparse matrix into a (coords, values, shape) tuple, the form
# needed to construct the feed_dicts.
test_support, test_support_t = sparse_to_tuple(test_support), sparse_to_tuple(test_support_t)
val_support, val_support_t = sparse_to_tuple(val_support), sparse_to_tuple(val_support_t)
train_support, train_support_t = sparse_to_tuple(train_support), sparse_to_tuple(train_support_t)

u_features = sparse_to_tuple(u_features)
v_features = sparse_to_tuple(v_features)

# Unpack the tuples once instead of indexing into them repeatedly.
_, u_feature_values, u_feature_shape = u_features
_, v_feature_values, v_feature_shape = v_features
assert u_feature_shape[1] == v_feature_shape[1], 'Number of features of users and items must be the same!'

num_features = u_feature_shape[1]
# number of stored (non-zero) feature entries per side
u_features_nonzero = u_feature_values.shape[0]
v_features_nonzero = v_feature_values.shape[0]

# One feed_dict per split, all built with the same argument order.
# Feed_dicts for validation and test set stay constant over different update steps
# The training feed uses the dropout rate DO.
train_feed_dict = construct_feed_dict(placeholders, u_features, v_features, u_features_nonzero,
                                      v_features_nonzero, train_support, train_support_t,
                                      train_labels, train_u_indices, train_v_indices, class_values, DO,
                                      train_u_features_side, train_v_features_side)
# No dropout for validation and test runs
val_feed_dict = construct_feed_dict(placeholders, u_features, v_features, u_features_nonzero,
                                    v_features_nonzero, val_support, val_support_t,
                                    val_labels, val_u_indices, val_v_indices, class_values, 0.,
                                    val_u_features_side, val_v_features_side)

test_feed_dict = construct_feed_dict(placeholders, u_features, v_features, u_features_nonzero,
                                     v_features_nonzero, test_support, test_support_t,
                                     test_labels, test_u_indices, test_v_indices, class_values, 0.,
                                     test_u_features_side, test_v_features_side)

# Merge every summary op registered so far into a single op.
merged_summary = tf.summary.merge_all()

# Open the session and initialise all variables.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# No summary writers are used in this notebook.
train_summary_writer = val_summary_writer = None

# Running bests, starting from "nothing seen yet".
best_val_score = best_val_loss = np.inf
best_epoch = wait = 0



Instructions for updating:
Use `tf.cast` instead.


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


In [None]:
print('Training...')
NB_EPOCH = 3500
TESTING = True
# Periodic checkpointing plus evaluation with Polyak-averaged weights.
# Switched off by default; it also never runs while TESTING is True.
POLYAK_CHECKPOINT_EVAL = False

for epoch in range(NB_EPOCH):

    t = time.time()

    # Run single weight update (training_op: with exponential moving averages)
    _, train_avg_loss, train_rmse = sess.run(
        [model.training_op, model.loss, model.rmse], feed_dict=train_feed_dict)

    val_avg_loss, val_rmse = sess.run([model.loss, model.rmse], feed_dict=val_feed_dict)

    print("[*] Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(train_avg_loss),
          "train_rmse=", "{:.5f}".format(train_rmse),
          "val_loss=", "{:.5f}".format(val_avg_loss),
          "val_rmse=", "{:.5f}".format(val_rmse),
          "\t\ttime=", "{:.5f}".format(time.time() - t))

    # Remember the best validation RMSE and the (0-based) epoch it occurred in.
    if val_rmse < best_val_score:
        best_val_score = val_rmse
        best_epoch = epoch

    if POLYAK_CHECKPOINT_EVAL and not TESTING and epoch > 1000 and epoch % 100 == 0:
        # The checkpoint name carries the run's `seed`.
        saver = tf.train.Saver()
        save_path = saver.save(sess, "tmp/%s_seed%d.ckpt" % (model.name, seed), global_step=model.global_step)

        # load polyak averages
        variables_to_restore = model.variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
        saver.restore(sess, save_path)

        val_avg_loss, val_rmse = sess.run([model.loss, model.rmse], feed_dict=val_feed_dict)

        print('polyak val loss = ', val_avg_loss)
        print('polyak val rmse = ', val_rmse)

        # Load back normal variables
        saver = tf.train.Saver()
        saver.restore(sess, save_path)

Training...
[*] Epoch: 0001 train_loss= 1.60944 train_rmse= 1.16769 val_loss= 1.60942 val_rmse= 1.16681 		time= 5.33231
[*] Epoch: 0002 train_loss= 1.60943 train_rmse= 1.16766 val_loss= 1.60918 val_rmse= 1.16626 		time= 2.76315
[*] Epoch: 0003 train_loss= 1.60929 train_rmse= 1.16736 val_loss= 1.60437 val_rmse= 1.15543 		time= 2.80995
[*] Epoch: 0004 train_loss= 1.60700 train_rmse= 1.16228 val_loss= 1.57459 val_rmse= 1.06314 		time= 2.82489
[*] Epoch: 0005 train_loss= 1.58549 train_rmse= 1.11263 val_loss= 2.26174 val_rmse= 1.44943 		time= 2.79521
[*] Epoch: 0006 train_loss= 1.63990 train_rmse= 1.09622 val_loss= 1.61240 val_rmse= 1.08779 		time= 2.80244
[*] Epoch: 0007 train_loss= 1.53392 train_rmse= 1.06043 val_loss= 1.56876 val_rmse= 1.14728 		time= 2.75282
[*] Epoch: 0008 train_loss= 1.54152 train_rmse= 1.14381 val_loss= 1.63684 val_rmse= 1.17666 		time= 2.75639
[*] Epoch: 0009 train_loss= 1.55537 train_rmse= 1.16892 val_loss= 1.54319 val_rmse= 1.12730 		time= 2.75557
[*] Epoch: 0010 

In [None]:
# store model including exponential moving averages
saver = tf.train.Saver()
save_path = saver.save(sess, "tmp/%s.ckpt" % model.name, global_step=model.global_step)

print("\nOptimization Finished!")
print('best validation score =', best_val_score, 'at iteration', best_epoch)

# Testing: first with the weights as they are in the session after training
test_avg_loss, test_rmse = sess.run([model.loss, model.rmse], feed_dict=test_feed_dict)
print('test loss = ', test_avg_loss)
print('test rmse = ', test_rmse)

# restore with polyak averages of parameters: the checkpoint saved above is
# loaded through the variable mapping given by model.variable_averages
variables_to_restore = model.variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
saver.restore(sess, save_path)

# Same test evaluation again, now with the restored averaged parameters
test_avg_loss, test_rmse = sess.run([model.loss, model.rmse], feed_dict=test_feed_dict)
print('polyak test loss = ', test_avg_loss)
print('polyak test rmse = ', test_rmse)


# Release the session; nothing can be run on it after this point.
sess.close()


Optimization Finished!
best validation score = 0.8714391 at iteration 3433
test loss =  1.4242109
test rmse =  0.8879364
INFO:tensorflow:Restoring parameters from tmp/recommendergae.ckpt-3500
polyak test loss =  1.392758
polyak test rmse =  0.8859175
