In [72]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tf_agents
import os
import random
from collections import defaultdict
from tqdm import tqdm

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment

from tf_agents.trajectories import time_step as ts
from tf_agents.specs import array_spec
from tf_agents.specs import tensor_spec
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.policies.policy_saver import PolicySaver
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

import tensorflow_probability as tfp
from tf_agents.utils import nest_utils

In [73]:
# Anchor every path to the directory the kernel was started in;
# data files are read from its "data" sub-directory.
ROOT_DIR = os.getcwd()
DATA_DIR = os.path.join(ROOT_DIR, "data")

In [76]:
# Pre-split ratings tables (train first, then test), both read from DATA_DIR.
# NOTE(review): `ratings_df` is bound again by the raw `ratings.dat` cell below,
# so on a top-to-bottom run the train split loaded here is replaced - confirm
# which table the environments are meant to use.
ratings_df, test_ratings_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('valid_ratings_df_train.csv', 'valid_ratings_df_test.csv')
)

In [3]:
# Loading datasets: read the "::"-delimited .dat files from DATA_DIR.
def _read_dat(filename, encoding=None):
    """Return the rows of DATA_DIR/<filename>, each line split on '::'.

    Args:
        filename: name of the .dat file inside DATA_DIR.
        encoding: text encoding passed to `open`; None keeps the platform default.

    Returns:
        A list with one list of string fields per line of the file.
    """
    # The context manager closes the handle; the previous inline
    # `open(...).readlines()` calls left all three files open.
    with open(os.path.join(DATA_DIR, filename), 'r', encoding=encoding) as dat_file:
        return [line.strip().split("::") for line in dat_file]


ratings_list = _read_dat('ratings.dat')
users_list = _read_dat('users.dat')
movies_list = _read_dat('movies.dat', encoding='latin-1')

# Create DataFrames.
# NOTE(review): this rebinds `ratings_df`, replacing the train split read from
# 'valid_ratings_df_train.csv' in the cell above - confirm this is intended.
# All four rating columns are parsed to integers in one step (the earlier
# intermediate `dtype=np.uint32` cast was immediately overwritten by astype(int)).
ratings_df = pd.DataFrame(ratings_list, columns=['UserID', 'MovieID', 'Rating', 'Timestamp'])
ratings_df = ratings_df.astype(int).sort_values(["UserID", "Timestamp"])


movies_df = pd.DataFrame(movies_list, columns=['MovieID', 'Title', 'Genres'])
movies_df['MovieID'] = pd.to_numeric(movies_df['MovieID'])
users_df = pd.DataFrame(users_list, columns=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])

In [4]:
# --- Model dimensions ---
EMBEDDING_DIM = 100   # output width of the user / movie embedding tables
STATE_SIZE = 10       # intended number of movies in the state window

# --- Optimiser step sizes (identifier spelling kept: other cells reference it) ---
ACTOR_LEARNIG_RATE = 1e-3
CRITIC_LEARNIG_RATE = 1e-3

# --- Reporting cadence, measured in agent train steps ---
log_interval, eval_interval = 25, 50

NUM_EVAL_EPISODES = 10

# --- Training budget ---
REPLAY_BUFFER_MAX_LENGTH = 50000
NUM_EPISODE = 10000
BATCH_SIZE = 32

### Embedding

In [5]:
class UserMovieEmbedding(tf.keras.Model):
    """Keras model that scores a (user, movie) pair from learned embeddings.

    Each id is looked up in its own embedding table, the two vectors are
    combined with a dot product, and the result goes through one sigmoid unit.

    NOTE(review): saved weights are loaded into this model from an .h5 file in a
    later cell; presumably the layer names and creation order below must stay
    as they are for that to keep working - confirm before refactoring.
    """
    def __init__(self, len_users, len_movies, embedding_dim):
        """Create the layers.

        Args:
            len_users: `input_dim` of the user embedding table.
            len_movies: `input_dim` of the movie embedding table.
            embedding_dim: `output_dim` shared by both embedding tables.
        """
        super(UserMovieEmbedding, self).__init__()
        self.m_u_input = tf.keras.layers.InputLayer(name='input_layer', input_shape=(2,))
        # embedding
        self.u_embedding = tf.keras.layers.Embedding(name='user_embedding', input_dim=len_users, output_dim=embedding_dim)
        self.m_embedding = tf.keras.layers.Embedding(name='movie_embedding', input_dim=len_movies, output_dim=embedding_dim)
        # dot product
        self.m_u_merge = tf.keras.layers.Dot(name='movie_user_dot', normalize=False, axes=1)
        # output
        self.m_u_fc = tf.keras.layers.Dense(1, activation='sigmoid')
        
    def call(self, x):
        """Score the pair(s) in `x`.

        Args:
            x: indexable input where x[0] holds user ids and x[1] holds movie ids.

        Returns:
            Output of the sigmoid Dense layer applied to the movie/user dot product.
        """
        x = self.m_u_input(x)
        uemb = self.u_embedding(x[0])
        memb = self.m_embedding(x[1])
        m_u = self.m_u_merge([memb, uemb])
        return self.m_u_fc(m_u)

In [6]:
# Ids are used directly as embedding row indices, so each table needs max id + 1 rows.
users_num = ratings_df["UserID"].max() + 1
items_num = ratings_df["MovieID"].max() + 1

embedding_network = UserMovieEmbedding(
    len_users=users_num,
    len_movies=items_num,
    embedding_dim=EMBEDDING_DIM,
)
# Push one dummy (user, movie) pair through the model before restoring weights,
# so the layers have been called once when load_weights runs.
embedding_network([np.zeros((1,)), np.zeros((1,))])
embedding_network.load_weights('save_weights/user_movie_embedding_case4.h5')

# Embedding vector of every movie id in [0, items_num).
items_ids = np.arange(items_num)
movie_embedding = embedding_network.get_layer('movie_embedding')(items_ids)

### Environment

In [7]:
class DRRAveStateRepresentation(tf.keras.Model):
    """Builds a flat state vector from a user embedding and a set of item embeddings.

    The item embeddings are pooled into a single vector by a 1x1 Conv1D, and the
    output concatenates [user, user * pooled, pooled] before flattening.
    """

    def __init__(self, embedding_dim):
        """Create the pooling, concatenation and flatten layers.

        Args:
            embedding_dim: value the item embeddings are divided by before pooling.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.wav = tf.keras.layers.Conv1D(filters=1, kernel_size=1, strides=1)
        self.concat = tf.keras.layers.Concatenate()
        self.flatten = tf.keras.layers.Flatten()

    def call(self, x):
        """Compute the state representation.

        Args:
            x: indexable input; x[0] is the user embedding and x[1] is a rank-3
               tensor of item embeddings (its last two axes are swapped before
               pooling).

        Returns:
            The flattened concatenation of user, user * pooled-items and pooled-items.
        """
        user_eb, history_eb = x[0], x[1]
        swap_last_two = (0, 2, 1)

        # Swap the last two axes so the Conv1D mixes across the items axis.
        scaled_items = tf.transpose(history_eb, perm=swap_last_two) / self.embedding_dim
        pooled = tf.transpose(self.wav(scaled_items), perm=swap_last_two)
        pooled = tf.squeeze(pooled, axis=1)

        interaction = tf.keras.layers.multiply([user_eb, pooled])
        return self.flatten(self.concat([user_eb, interaction, pooled]))

In [8]:
class RS_Env(py_environment.PyEnvironment):
    """Recommendation environment driven by a table of logged user ratings.

    Each episode follows one randomly chosen user. The observation is the
    DRR-ave state vector (length 3 * embedding_dim) built from the user
    embedding and the embeddings of `state_size` positively rated movies.
    An action is a vector of length embedding_dim; it is turned into a movie id
    by picking the not-yet-recommended movie whose embedding has the largest
    dot product with the action.

    Rewards:
        * movie rated by the user and not yet recommended: (rating - 3) / 2
        * movie rated by the user but already recommended: 0
        * movie the user never rated: -0.1
    """

    def __init__(self, ratings_df, embedding_dim, state_size, embedding_network):
        """Set up specs, the user pool and the first episode.

        Args:
            ratings_df: DataFrame with "UserID", "MovieID" and "Rating" columns.
            embedding_dim: length of the embedding vectors (and of an action).
            state_size: number of positively rated movies kept in the state.
            embedding_network: model exposing 'user_embedding' and
                'movie_embedding' layers via `get_layer`.
        """
        # Let the base class initialise its own bookkeeping; this was previously skipped.
        super(RS_Env, self).__init__()

        self.users_num = ratings_df["UserID"].max() + 1
        self.items_num = ratings_df["MovieID"].max() + 1
        self.ratings_df = ratings_df
        self.pos_ratings_df = ratings_df.loc[ratings_df["Rating"] >= 4]
        self.embedding_dim = embedding_dim
        self.embedding_network = embedding_network
        self.state_size = state_size
        self.max_step = 1000

        self._action_spec = array_spec.BoundedArraySpec(shape = (embedding_dim, ), dtype = np.float32, maximum = 1, minimum = -1, name = "action")
        self._observation_spec = array_spec.ArraySpec(shape = (3*self.embedding_dim, ), dtype = np.float32, name = "state_representation")

        # Build the state-representation model once. Re-creating it on every
        # reset drew fresh random Conv1D weights per episode, so the same
        # user/history mapped to different observations across episodes.
        self.srm_ave = DRRAveStateRepresentation(self.embedding_dim)

        self.valid_users = self._generate_valid_user()

        # reset env (this also samples the first user)
        self.reset()

    def action_spec(self):
        """Return the bounded float32 action spec of shape (embedding_dim,)."""
        return self._action_spec

    def observation_spec(self):
        """Return the float32 observation spec of shape (3 * embedding_dim,)."""
        return self._observation_spec

    def _convert_action_score_item(self, action_score):
        """Map an action vector to the id of the best not-yet-recommended movie."""
        candidate_ids = np.setdiff1d(np.arange(self.items_num), self.recommended_items)

        items_ebs = self.embedding_network.get_layer('movie_embedding')(candidate_ids)
        # Column vector so the dot product yields one score per candidate.
        action_score = tf.convert_to_tensor(np.expand_dims(action_score, 1))

        item_idx = np.argmax(tf.keras.backend.dot(items_ebs, action_score))
        return int(candidate_ids[item_idx])

    def _compute_state(self):
        """Return the state vector for the current user and state items."""
        state_items_eb = self.embedding_network.get_layer('movie_embedding')(np.array(self.state_items_ids))
        return self.srm_ave([np.expand_dims(self.user_eb, axis=0), np.expand_dims(state_items_eb, axis=0)])[0]

    def _reset(self):
        """Start a new episode for a randomly drawn valid user."""
        self.step_count = 1
        self.user_id = np.random.choice(self.valid_users, size = 1).item()

        self.user_df = self.ratings_df.loc[self.ratings_df["UserID"] == self.user_id]
        # The first `state_size` positively rated movies seed the state window.
        self.state_items_ids = self.user_df.loc[self.user_df["Rating"] >= 4, "MovieID"].head(self.state_size).values
        self.user_items = self.user_df["MovieID"].values

        # Seed movies count as already recommended so they are never proposed again.
        self.recommended_items = self.state_items_ids.copy()
        self.recommendation_item = None

        self.user_eb = self.embedding_network.get_layer('user_embedding')(np.array(self.user_id))
        self._state = self._compute_state()

        self._episode_ended = False

        # Return a numpy observation, matching what _step returns.
        return ts.restart(np.array(self._state))

    def _generate_valid_user(self):
        """Return the ids of users with at least `state_size` ratings >= 4."""
        # Use this environment's own table; the previous mask was taken from the
        # notebook-level `ratings_df`, which may be a different frame.
        positive_counts = self.pos_ratings_df.groupby(["UserID"])["Rating"].count()
        return positive_counts.loc[positive_counts >= self.state_size].index

    def _step(self, action):
        """Recommend one movie for `action` and return the resulting time step."""
        if self._episode_ended:
            # A step after termination starts a fresh episode and is not counted.
            return self.reset()

        self.step_count += 1

        recommendation_item = self._convert_action_score_item(action)
        self.recommendation_item = recommendation_item

        if recommendation_item in self.user_items:
            if recommendation_item not in self.recommended_items:
                rate = self.user_df.loc[self.user_df["MovieID"] == recommendation_item, "Rating"].values[0]
                reward = (rate-3)/2
                if reward > 0:
                    # Slide the window: drop the oldest state movie, append the liked one.
                    self.state_items_ids = np.append(self.state_items_ids[1:], values = recommendation_item)
                    self._state = self._compute_state()
            else:
                reward = 0.0
        else:
            reward = -0.1

        self.recommended_items = np.unique(np.append(self.recommended_items, recommendation_item))

        # End when the step budget is used up or every movie the user rated has
        # been recommended. The second test previously compared against the single
        # last recommendation, so it could practically never become true.
        all_rated_items_recommended = np.setdiff1d(self.user_items, self.recommended_items).size == 0
        if self.step_count >= self.max_step or all_rated_items_recommended:
            self._episode_ended = True

        if self._episode_ended:
            return ts.termination(np.array(self._state), reward)
        # NOTE(review): the agent is also built with gamma = 0.9; presumably the two
        # discounts are multiplied in the TD target - confirm the intended value.
        return ts.transition(np.array(self._state), reward, discount = 0.9)

In [9]:
# Build both environments from the shared hyper-parameter constants rather than
# repeating the literals 100 / 10, so the config cell stays the single source of truth.
# NOTE(review): the evaluation environment is built from the same `ratings_df`
# as the training one although `test_ratings_df` is loaded above - confirm.
train_env_py = RS_Env(ratings_df, embedding_dim = EMBEDDING_DIM, state_size = STATE_SIZE, embedding_network = embedding_network)
eval_env_py = RS_Env(ratings_df, embedding_dim = EMBEDDING_DIM, state_size = STATE_SIZE, embedding_network = embedding_network)

In [10]:
# Wrap each Python environment so it can be driven with TensorFlow tensors.
train_env_tf, eval_env_tf = (
    tf_py_environment.TFPyEnvironment(py_env)
    for py_env in (train_env_py, eval_env_py)
)

### Actor Network

In [11]:
from tf_agents.networks import network

In [12]:
# Actor network: takes the training environment's observation spec as input and
# produces outputs matching its action spec, through two 128-unit ReLU layers.
actor_net  = tf_agents.agents.ddpg.actor_network.ActorNetwork(
    input_tensor_spec = train_env_tf.observation_spec(), 
    output_tensor_spec = train_env_tf.action_spec(),  # --> Only float actions are supported by this network.
    fc_layer_params=[128, 128],
    activation_fn = tf.nn.relu,
    name = "ActorNetwork"
)

In [13]:
# Embedding vector of every movie id in [0, items_num).
# NOTE(review): the same two names are already computed in the cell that loads
# the embedding weights; this cell looks redundant - confirm before removing.
items_ids = np.arange(items_num)
movie_embedding = embedding_network.get_layer('movie_embedding')(items_ids)

In [14]:
# actor_net = ActorNetwork(tf_rs_env.observation_spec(), tf_rs_env.action_spec(), embedding_dim = EMBEDDING_DIM, hidden_dim = 128, items_num = items_num, movie_embedding = movie_embedding, name = "ActorNetwork2323")

# target_actor_net = ActorNetwork(tf_rs_env.observation_spec(), tf_rs_env.action_spec(), embedding_dim = EMBEDDING_DIM, hidden_dim = 128, items_num = items_num, movie_embedding = movie_embedding, name = "TargetActorNetwork2323")

### Critic Network

In [15]:
# Critic network: estimates Q(observation, action).
# The observation goes through one 100-unit layer, is joined with the action,
# and then passes through two 128-unit layers.
critic_net = tf_agents.agents.ddpg.critic_network.CriticNetwork(
    input_tensor_spec = (train_env_tf.observation_spec(), train_env_tf.action_spec()),
    observation_fc_layer_params = [100],
    joint_fc_layer_params = [128, 128],
    activation_fn = tf.nn.relu,
    # Leave the Q-value output linear. The environment emits negative rewards
    # (-0.1 for unrated movies, negative values for ratings below 3), so a ReLU
    # on the output would clamp every estimate to >= 0 and make negative
    # returns unrepresentable.
    output_activation_fn = None,
    name='CriticNetwork'
)

In [16]:
# ACTOR_LEARNIG_RATE = 0.001
# CRITIC_LEARNIG_RATE = 0.001

### Agent

In [17]:
# from tf_agents.train.utils import train_utils
# train_step = train_utils.create_train_step()

In [36]:
# Shared step variable: handed to the agent as its train_step_counter and to
# the checkpointers further down.
global_step = tf.compat.v1.train.get_or_create_global_step()

In [37]:
# DDPG agent built from the actor / critic networks above, one Adam optimiser each.
# NOTE(review): RS_Env already returns transitions with discount = 0.9 and the
# agent uses gamma = 0.9 as well; presumably the two are multiplied in the TD
# target (effective 0.81) - confirm which single discount is intended.
tf_ddpg_agent = tf_agents.agents.DdpgAgent(time_step_spec = train_env_tf.time_step_spec(),
                                           action_spec = train_env_tf.action_spec(),
                                           actor_network = actor_net,
                                           critic_network = critic_net,
                                           actor_optimizer = tf.keras.optimizers.Adam(learning_rate=ACTOR_LEARNIG_RATE),
                                           critic_optimizer = tf.keras.optimizers.Adam(learning_rate=CRITIC_LEARNIG_RATE),
#                                            target_actor_network = target_actor_net,
#                                            target_critic_network = target_critic_net,
                                           target_update_tau = 0.001,
                                           target_update_period = 1,
                                           gamma = 0.9,
                                           ou_stddev=0.5,
                                           ou_damping=0.15,
                                           train_step_counter = global_step
                                           )

In [38]:
# Run the agent's own initialisation before its policies are used below.
tf_ddpg_agent.initialize()

In [39]:
# REPLAY_BUFFER_MAX_LENGTH = 50000
# NUM_EPISODE = 10000
# BATCH_SIZE = 32

In [40]:
# Prepare a PolicySaver for the agent's collect policy.
# NOTE(review): neither `my_policy` nor `saver` is used in the visible cells.
my_policy = tf_ddpg_agent.collect_policy
saver = PolicySaver(my_policy, batch_size = None)



### Metrics and Evaluation

In [41]:
# NUM_EVAL_EPISODES = 10

In [42]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Average undiscounted return of `policy` over `num_episodes` episodes.

    Args:
        environment: environment exposing `reset()` and `step(action)`, both
            returning a time step with `is_last()` and `reward`.
        policy: object whose `action(time_step)` result has an `.action` attribute.
        num_episodes: number of full episodes to roll out (must be positive).

    Returns:
        The first element of the mean episode return, as produced by
        `.numpy()[0]` on the accumulated reward.

    Raises:
        ValueError: if `num_episodes` is not positive.
    """
    if num_episodes <= 0:
        raise ValueError('num_episodes must be positive, got {}'.format(num_episodes))

    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0

        # The rollout has to live inside the episode loop. It used to sit after
        # it, so only the last reset was played while the sum was still divided
        # by num_episodes.
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]

In [43]:
# Evaluate the agent's policy on the evaluation environment before any training.
avg_return = compute_avg_return(eval_env_tf, tf_ddpg_agent.policy, NUM_EVAL_EPISODES)

### Replay Buffer

In [31]:
from tf_agents.metrics import tf_metrics

In [32]:
# Display the configured replay-buffer capacity (set in the hyper-parameter cell).
REPLAY_BUFFER_MAX_LENGTH

50000

In [33]:
# Make the replay buffer.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(data_spec = tf_ddpg_agent.collect_data_spec,
                                                               batch_size = BATCH_SIZE,
                                                               max_length = REPLAY_BUFFER_MAX_LENGTH)
replay_observer = [replay_buffer.add_batch]

In [34]:
# Reuse the configured constant instead of repeating the literal, so the metric
# buffers below follow the evaluation episode count.
num_eval_episodes = NUM_EVAL_EPISODES

In [35]:
# Training metrics, all under the 'Train' prefix. `env_steps` and
# `average_return` keep their own names because later cells read them directly.
env_steps = tf_metrics.EnvironmentSteps(prefix='Train')

average_return = tf_metrics.AverageReturnMetric(prefix='Train',
                                                buffer_size=num_eval_episodes,
                                                batch_size=train_env_tf.batch_size)

# Episode count, step count, average return and average episode length.
train_metrics = [tf_metrics.NumberOfEpisodes(prefix = 'Train'),
                 env_steps,
                 average_return,
                 tf_metrics.AverageEpisodeLengthMetric(prefix ='Train',
                                                       buffer_size = num_eval_episodes,
                                                       batch_size = train_env_tf.batch_size),
                ]

In [51]:
from tf_agents.policies import greedy_policy
from tf_agents.policies import random_tf_policy
from tf_agents.eval import metric_utils

In [54]:
# Policy used for evaluation / policy checkpoints: greedy wrapper around the agent's policy.
eval_policy = greedy_policy.GreedyPolicy(tf_ddpg_agent.policy)

# Random policy over the training env's specs, used to seed the replay buffer.
initial_collect_policy = random_tf_policy.RandomTFPolicy(train_env_tf.time_step_spec(), train_env_tf.action_spec())

# The agent's own collect policy, used for data collection during training.
collect_policy = tf_ddpg_agent.collect_policy

In [55]:
# Checkpoints for the agent (plus training metrics) under ROOT_DIR/train.
train_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(ROOT_DIR, 'train'),
                                         agent=tf_ddpg_agent,
                                         global_step=global_step,
                                         metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))

# Checkpoints for the evaluation policy under ROOT_DIR/policy.
# NOTE(review): this checkpointer is created but never restored or saved in the visible cells.
policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(ROOT_DIR, 'policy'),
                                          policy=eval_policy,
                                          global_step=global_step)

# Checkpoints for the replay buffer under ROOT_DIR/replay_buffer (only the latest is kept).
rb_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(ROOT_DIR, 'replay_buffer'),
                                      max_to_keep=1,
                                      replay_buffer=replay_buffer)

# Restore agent state and replay buffer if checkpoints exist, otherwise initialise them.
train_checkpointer.initialize_or_restore()

rb_checkpointer.initialize_or_restore()

<tensorflow.python.training.tracking.util.InitializationOnlyStatus at 0x7fca0c1d05b0>

In [57]:
from tf_agents.drivers import dynamic_step_driver

In [61]:
# Environment steps taken by the random policy up front, and by the collect
# policy on each driver run, respectively.
initial_collect_steps, collect_steps_per_iteration = 100, 1

In [69]:
# Driver that seeds the replay buffer with `initial_collect_steps` random-policy steps.
initial_collect_driver = dynamic_step_driver.DynamicStepDriver(train_env_tf,
                                                               initial_collect_policy,
                                                               observers=replay_observer + train_metrics,
                                                               num_steps=initial_collect_steps)

# Driver that gathers `collect_steps_per_iteration` steps with the agent's collect policy.
collect_driver = dynamic_step_driver.DynamicStepDriver(train_env_tf,
                                                       collect_policy,
                                                       observers=replay_observer + train_metrics,
                                                       num_steps=collect_steps_per_iteration)

# Wrap the hot paths with common.function.
initial_collect_driver.run = common.function(initial_collect_driver.run)
collect_driver.run = common.function(collect_driver.run)
tf_ddpg_agent.train = common.function(tf_ddpg_agent.train)

In [70]:
from absl import logging

In [71]:
# Collect initial replay data.
if env_steps.result() == 0 or replay_buffer.num_frames() == 0:
    logging.info(
      'Initializing replay buffer by collecting experience for %d steps'
      'with a random policy.', initial_collect_steps)
    initial_collect_driver.run()

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


ValueError: Dimension 0 in both shapes must be equal, but are 1 and 32. Shapes are [1] and [32]. for '{{node driver_loop/TFUniformReplayBuffer/ResourceScatterUpdate_1}} = ResourceScatterUpdate[Tindices=DT_INT64, dtype=DT_INT32](driver_loop/TFUniformReplayBuffer/ResourceScatterUpdate_1/resource, driver_loop/TFUniformReplayBuffer/add, driver_loop/Placeholder_1)' with input shapes: [], [32], [1].

### Data Collection

In [106]:
def collect_episode(environment, policy, num_episodes):
    """Roll `policy` out for `num_episodes` episodes, storing every trajectory.

    Each trajectory is written to the notebook-level `replay_buffer`.

    Args:
        environment: environment exposing `reset()`, `current_time_step()`
            and `step(action)`.
        policy: policy whose `action(time_step)` result carries `.action`.
        num_episodes: number of episode boundaries to collect before stopping.

    Returns:
        The number of trajectories generated (including boundary ones).
    """
    episode_counter = 0
    traj_counter = 0
    environment.reset()

    while episode_counter < num_episodes:
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        traj = trajectory.from_transition(time_step, action_step, next_time_step)
        traj_counter += 1

        # Store every trajectory. The former `reward != -0.1` filter compared a
        # float32 reward with a float64 literal, which is never equal under
        # NumPy 1.x scalar rules, so nothing was skipped there. Where the
        # comparison does match, dropping frames would make non-consecutive steps
        # adjacent in the buffer and corrupt multi-step sampling.
        replay_buffer.add_batch(traj)

        if traj.is_boundary():
            episode_counter += 1
    return traj_counter

In [107]:
# Smoke test: collect one episode with the collect policy; the cell output is
# the number of trajectories generated.
collect_episode(train_env_tf, tf_ddpg_agent.collect_policy, 1)

1000

In [36]:
# NOTE(review): `tf_ddpg_agent.train` is already wrapped with common.function in
# the driver cell above; wrapping it again here looks redundant - confirm.
tf_ddpg_agent.train = common.function(tf_ddpg_agent.train)
# Start counting train steps from zero.
tf_ddpg_agent.train_step_counter.assign(0)

<tf.Variable 'UnreadVariable' shape=() dtype=int64, numpy=0>

In [37]:
# Evaluate once before the training loop and seed the history of average returns with it.
avg_return = compute_avg_return(eval_env_tf, tf_ddpg_agent.policy, NUM_EVAL_EPISODES)
returns = [avg_return]

In [None]:
# Main training loop: one collected episode, then one gradient update, per iteration.
# NOTE(review): `iterator` is not defined in any visible cell; presumably it is
# iter(replay_buffer.as_dataset(sample_batch_size=BATCH_SIZE, num_steps=2)),
# which matches the (32, 2) ids printed below - define it before this cell.
for _ in tqdm(range(NUM_EPISODE)):
    collect_episode(train_env_tf, tf_ddpg_agent.collect_policy, 1)

    # Sample a mini-batch from the replay buffer and train on it.
    experience, unused_info = next(iterator)
    train_loss = tf_ddpg_agent.train(experience)

    step = tf_ddpg_agent.train_step_counter.numpy()

    if step % log_interval == 0:
        # Only the loss is logged here. The full evaluation that used to run at
        # this point stored its result in a variable that was never read.
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env_tf, tf_ddpg_agent.policy, NUM_EVAL_EPISODES)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [110]:
# Inspect the second element yielded by the replay-buffer iterator (the
# BufferInfo with sampled ids and probabilities shown in the output).
next(iterator)[1]

BufferInfo(ids=<tf.Tensor: shape=(32, 2), dtype=int64, numpy=
array([[405, 406],
       [484, 485],
       [534, 535],
       [525, 526],
       [861, 862],
       [ 15,  16],
       [265, 266],
       [997, 998],
       [974, 975],
       [459, 460],
       [104, 105],
       [715, 716],
       [867, 868],
       [901, 902],
       [898, 899],
       [ 87,  88],
       [232, 233],
       [241, 242],
       [205, 206],
       [876, 877],
       [  4,   5],
       [538, 539],
       [899, 900],
       [436, 437],
       [895, 896],
       [103, 104],
       [159, 160],
       [760, 761],
       [642, 643],
       [172, 173],
       [245, 246],
       [161, 162]])>, probabilities=<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.001001, 0.001001, 0.001001, 0.001001, 0.001001, 0.001001,
       0.001001, 0.001001, 0.001001, 0.001001, 0.001001, 0.001001,
       0.001001, 0.001001, 0.001001, 0.001001, 0.001001, 0.001001,
       0.001001, 0.001001, 0.001001, 0.001001, 0.001001, 0.001001