In [None]:
import tensorflow as tf
import numpy as np
import math

# Hyperparameters of the actor (the critic cell further down defines its own values).
LEARNING_RATE = 0.0001  # step size handed to the actor's Adam optimizer
TAU = 0.001  # mixing factor of the soft target update in Actor.update_target_actor
BATCH_SIZE = 64  # NOTE(review): not referenced anywhere in this cell
N_HIDDEN_1 = 400  # width of the first hidden layer
N_HIDDEN_2 = 300  # width of the second hidden layer

class ActorNetwork(tf.Module):
    """Backbone network of the actor: maps a batch of states to a batch of actions.

    Two hidden layers (N_HIDDEN_1 and N_HIDDEN_2 units) are each computed as
    ``activation(batch_norm(x @ W)) + B`` -- note that the bias is added
    *after* the activation -- followed by a linear output layer.
    """

    def __init__(self, num_states, num_actions):
        """Create the weights, biases and batch-normalisation layers.

        Args:
            num_states: size of the last axis of the state input.
            num_actions: size of the last axis of the produced action.
        """
        # tf.Module sets up its name / name_scope in __init__; without this
        # call attributes such as `self.name` are never initialised.
        super().__init__()
        self.num_states = num_states
        self.num_actions = num_actions

        # Uniform init bounds: 1/sqrt(fan_in) for the hidden layers and a
        # small fixed bound for the output layer.
        bound_1 = 1 / math.sqrt(num_states)
        bound_2 = 1 / math.sqrt(N_HIDDEN_1)
        bound_out = 0.003

        # Network parameters (creation order kept: W1, B1, W2, B2, W3, B3).
        self.W1 = tf.Variable(tf.random.uniform([num_states, N_HIDDEN_1], -bound_1, bound_1))
        self.B1 = tf.Variable(tf.random.uniform([N_HIDDEN_1], -bound_1, bound_1))
        self.W2 = tf.Variable(tf.random.uniform([N_HIDDEN_1, N_HIDDEN_2], -bound_2, bound_2))
        self.B2 = tf.Variable(tf.random.uniform([N_HIDDEN_2], -bound_2, bound_2))
        self.W3 = tf.Variable(tf.random.uniform([N_HIDDEN_2, num_actions], -bound_out, bound_out))
        self.B3 = tf.Variable(tf.random.uniform([num_actions], -bound_out, bound_out))

        self.batch_norm1 = tf.keras.layers.BatchNormalization()
        self.batch_norm2 = tf.keras.layers.BatchNormalization()

    def __call__(self, state, training):
        """Run the forward pass.

        Args:
            state: batch of states; it is matrix-multiplied with W1, so its
                last axis must have ``num_states`` entries.
            training: forwarded to both BatchNormalization layers.

        Returns:
            The un-squashed action tensor ``h2 @ W3 + B3``.
        """
        h1_t = tf.matmul(state, self.W1)
        h1_bn = self.batch_norm1(h1_t, training=training)
        h1 = tf.nn.softplus(h1_bn) + self.B1

        h2_t = tf.matmul(h1, self.W2)
        h2_bn = self.batch_norm2(h2_t, training=training)
        h2 = tf.nn.tanh(h2_bn) + self.B2

        return tf.matmul(h2, self.W3) + self.B3

    def get_variables(self):
        """Return the six dense-layer variables in a fixed order.

        NOTE(review): the variables owned by the two BatchNormalization layers
        are not part of this list, so code that trains or copies "the
        network" through it leaves them untouched -- confirm that is intended.
        """
        return [self.W1, self.B1, self.W2, self.B2, self.W3, self.B3]

class Actor:
    """Actor that handles the network training and target update."""

    def __init__(self, num_states, num_actions):
        """Build the main and target networks and the optimizer.

        Args:
            num_states: state dimensionality forwarded to ActorNetwork.
            num_actions: action dimensionality forwarded to ActorNetwork.
        """
        # Initialize main and target networks
        self.actor_network = ActorNetwork(num_states, num_actions)
        self.target_network = ActorNetwork(num_states, num_actions)

        # Initialize target network with same weights as main network
        self.update_target_actor(initial=True)

        # Optimizer
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, epsilon=1e-08)

    def evaluate_actor(self, state):
        """Return the main network's action for `state` (inference mode)."""
        return self.actor_network(state, training=False)

    def evaluate_target_actor(self, state):
        """Return the target network's action for `state` (inference mode)."""
        return self.target_network(state, training=False)

    def train_actor(self, actor_state_in, q_gradient_input):
        """Apply one policy-gradient step.

        The gradient of the actions with respect to the actor variables is
        weighted by ``-q_gradient_input`` (minus sign: the optimizer
        minimises, while the critic's value should increase).

        Args:
            actor_state_in: batch of states fed to the main network.
            q_gradient_input: gradient of the critic output with respect to
                the actions, one row per state in the batch.
        """
        actor_parameters = self.actor_network.get_variables()
        with tf.GradientTape() as tape:
            actions = self.actor_network(actor_state_in, training=True)

        # Computed outside the `with` block so that the tape does not also
        # record the gradient computation itself.
        upstream = -tf.cast(q_gradient_input, actions.dtype)
        gradients = tape.gradient(actions, actor_parameters, output_gradients=upstream)

        self.optimizer.apply_gradients(zip(gradients, actor_parameters))

    def update_target_actor(self, initial=False):
        """Move the target network towards the main network.

        Args:
            initial: when True copy the variables verbatim; otherwise blend
                them as ``TAU * main + (1 - TAU) * target``.
        """
        pairs = zip(self.target_network.get_variables(), self.actor_network.get_variables())
        for target_var, var in pairs:
            if initial:
                target_var.assign(var)
            else:
                target_var.assign(TAU * var + (1 - TAU) * target_var)
      

In [None]:
import tensorflow as tf
import math
import numpy as np

# Hyperparameters of the critic.
LEARNING_RATE = 0.001  # step size handed to the critic's Adam optimizer
TAU = 0.001  # mixing factor of the soft target update in Critic.update_target_critic
BATCH_SIZE = 64  # NOTE(review): not referenced anywhere in this cell
N_HIDDEN_1 = 400  # width of the first hidden layer
N_HIDDEN_2 = 300  # width of the second hidden layer

class CriticNetwork(tf.Module):
    """Critic Q value model backbone with batch normalization for DDPG.

    The state goes through the first hidden layer alone; the action is
    injected at the second hidden layer through its own weight matrix.
    Both hidden layers add their bias *after* the activation.
    """

    def __init__(self, num_states, num_actions):
        """Create the weights, biases and batch-normalisation layers.

        Args:
            num_states: size of the last axis of the state input.
            num_actions: size of the last axis of the action input.
        """
        # tf.Module sets up its name / name_scope in __init__; without this
        # call attributes such as `self.name` are never initialised.
        super().__init__()
        self.num_states = num_states
        self.num_actions = num_actions

        # Uniform init bounds: 1/sqrt(fan_in) for the hidden layers (the second
        # layer sees N_HIDDEN_1 + num_actions inputs) and a small fixed bound
        # for the output layer.
        bound_1 = 1 / math.sqrt(num_states)
        bound_2 = 1 / math.sqrt(N_HIDDEN_1 + num_actions)
        bound_out = 0.003

        # Critic Network weights and biases (creation order kept).
        self.W1_c = tf.Variable(tf.random.uniform([num_states, N_HIDDEN_1], -bound_1, bound_1))
        self.B1_c = tf.Variable(tf.random.uniform([N_HIDDEN_1], -bound_1, bound_1))

        self.W2_c = tf.Variable(tf.random.uniform([N_HIDDEN_1, N_HIDDEN_2], -bound_2, bound_2))
        self.B2_c = tf.Variable(tf.random.uniform([N_HIDDEN_2], -bound_2, bound_2))

        self.W2_action_c = tf.Variable(tf.random.uniform([num_actions, N_HIDDEN_2], -bound_2, bound_2))

        self.W3_c = tf.Variable(tf.random.uniform([N_HIDDEN_2, 1], -bound_out, bound_out))
        self.B3_c = tf.Variable(tf.random.uniform([1], -bound_out, bound_out))

        self.batch_norm1 = tf.keras.layers.BatchNormalization()
        self.batch_norm2 = tf.keras.layers.BatchNormalization()

    def __call__(self, state, action, training):
        """Compute Q(state, action).

        Args:
            state: batch of states; last axis must have ``num_states`` entries.
            action: batch of actions; last axis must have ``num_actions`` entries.
            training: forwarded to both BatchNormalization layers.

        Returns:
            Tensor with one Q value per row (last axis of size 1).
        """
        h1_t = tf.matmul(state, self.W1_c)
        h1_c_bn = self.batch_norm1(h1_t, training=training)
        h1_c = tf.nn.softplus(h1_c_bn) + self.B1_c

        h2_t = tf.matmul(h1_c, self.W2_c) + tf.matmul(action, self.W2_action_c)
        h2_c_bn = self.batch_norm2(h2_t, training=training)
        h2_c = tf.nn.tanh(h2_c_bn) + self.B2_c

        q_value = tf.matmul(h2_c, self.W3_c) + self.B3_c
        return q_value

    def get_variables(self):
        """Return the seven dense-layer variables in a fixed order.

        NOTE(review): the variables owned by the two BatchNormalization layers
        are not part of this list, so code that trains or copies "the
        network" through it leaves them untouched -- confirm that is intended.
        """
        return [self.W1_c, self.B1_c, self.W2_c, self.B2_c, self.W2_action_c, self.W3_c, self.B3_c]


class Critic:
    """Critic that handles network training, target update, and other tasks."""

    def __init__(self, num_states, num_actions):
        """Build the main and target networks and the optimizer.

        Args:
            num_states: state dimensionality forwarded to CriticNetwork.
            num_actions: action dimensionality forwarded to CriticNetwork.
        """
        # Initialize main and target networks
        self.critic_network = CriticNetwork(num_states, num_actions)
        self.target_network = CriticNetwork(num_states, num_actions)

        # Initialize target network with same weights as main network
        self.update_target_critic(initial=True)

        # Optimizer
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    def train_critic(self, state_batch, action_batch, y_batch):
        """Take one optimizer step on the squared TD error.

        The loss is the mean squared difference between Q(state, action) and
        `y_batch`, plus an L2 penalty (factor 0.0001) on ``W2_c`` only.

        Args:
            state_batch: batch of states.
            action_batch: batch of actions taken in those states.
            y_batch: regression targets, one per row of the batch.
        """
        with tf.GradientTape() as tape:
            q_values = self.critic_network(state_batch, action_batch, training=True)
            # Targets are often assembled in NumPy (float64); align the dtype
            # with the network output before subtracting.
            targets = tf.cast(y_batch, q_values.dtype)
            l2_loss = 0.0001 * tf.reduce_sum(tf.square(self.critic_network.W2_c))
            cost = tf.reduce_mean(tf.square(q_values - targets)) + l2_loss

        critic_variables = self.critic_network.get_variables()
        gradients = tape.gradient(cost, critic_variables)
        self.optimizer.apply_gradients(zip(gradients, critic_variables))

    def evaluate_target_critic(self, state_batch, action_batch):
        """Return the target network's Q values (inference mode)."""
        return self.target_network(state_batch, action_batch, training=False)

    def compute_delQ_a(self, state_batch, action_batch):
        """Return the gradient of Q with respect to the actions.

        Args:
            state_batch: batch of states.
            action_batch: batch of actions; NumPy arrays are accepted.

        Returns:
            Tensor shaped like `action_batch`.
        """
        # tape.watch() only accepts tensors, so convert NumPy input first.
        action_batch = tf.cast(action_batch, tf.float32)
        with tf.GradientTape() as tape:
            tape.watch(action_batch)
            q_values = self.critic_network(state_batch, action_batch, training=False)
        return tape.gradient(q_values, action_batch)

    def update_target_critic(self, initial=False):
        """Move the target network towards the main network.

        Args:
            initial: when True copy the variables verbatim; otherwise blend
                them as ``TAU * main + (1 - TAU) * target``.
        """
        pairs = zip(self.target_network.get_variables(), self.critic_network.get_variables())
        for target_var, var in pairs:
            if initial:
                target_var.assign(var)
            else:
                target_var.assign(TAU * var + (1 - TAU) * target_var)


In [None]:
### batch_normol

import tensorflow.compat.v1 as tf

# Switch to TF1 graph semantics: the class below relies on tf.assign, tf.cond and Session.run.
tf.disable_v2_behavior()
decay = 0.95  # weight of the previous value in the running mean/variance update
TAU = 0.001  # mixing factor of the soft update of a target layer's scale/beta

class batch_norm:
    """Graph-mode (TF1) batch-normalisation layer with an optional soft update.

    The normalised output is exposed as ``self.bnorm``.  The ops that refresh
    the population statistics (``self.train_mean`` / ``self.train_var``) are
    only *defined* here; nothing in this class runs them, so the caller has
    to execute them alongside its training op.
    """

    def __init__(self,inputs,size,is_training,sess,parForTarget=None,bn_param=None):
        """Build the normalisation ops.

        Args:
            inputs: tensor to normalise; moments are taken over axis 0.
            size: number of features (length of scale/beta/mean/variance).
            is_training: predicate handed to ``tf.cond``; selects batch
                statistics (true) or population statistics (false).
            sess: session used by ``update_Target``.
            parForTarget: optional ``batch_norm`` whose scale/beta this layer
                tracks; when given, soft-update ops are created.
            bn_param: unused.
        """
        self.sess = sess
        self.scale = tf.Variable(tf.random_uniform([size],0.9,1.1))
        self.beta = tf.Variable(tf.random_uniform([size],-0.01,0.01))
        self.pop_mean = tf.Variable(tf.random_uniform([size],-0.01,0.01),trainable=False)
        self.pop_var = tf.Variable(tf.random_uniform([size],0.9,1.1),trainable=False)
        self.batch_mean, self.batch_var = tf.nn.moments(inputs,[0])
        # Exponential moving averages of the batch statistics.
        self.train_mean = tf.assign(self.pop_mean,self.pop_mean * decay + self.batch_mean * (1 - decay))
        self.train_var = tf.assign(self.pop_var,self.pop_var * decay + self.batch_var * (1 - decay))

        def training():
            return tf.nn.batch_normalization(inputs,
                self.batch_mean, self.batch_var, self.beta, self.scale, 0.0000001 )

        def testing():
            return tf.nn.batch_normalization(inputs,
            self.pop_mean, self.pop_var, self.beta, self.scale, 0.0000001)

        # Identity check: `!= None` would dispatch to a user-defined __ne__.
        if parForTarget is not None:
            self.parForTarget = parForTarget
            self.updateScale = self.scale.assign(self.scale*(1-TAU)+self.parForTarget.scale*TAU)
            self.updateBeta = self.beta.assign(self.beta*(1-TAU)+self.parForTarget.beta*TAU)
            self.updateTarget = tf.group(self.updateScale, self.updateBeta)

        self.bnorm = tf.cond(is_training,training,testing)

    def update_Target(self):
        """Soft-update scale and beta towards ``parForTarget``.

        Only usable when the layer was built with ``parForTarget``; otherwise
        the update ops do not exist and an AttributeError is raised.
        """
        # Both assignments are already grouped; run them in a single call.
        self.sess.run(self.updateTarget)

In [1]:
## embedding

import tensorflow as tf 
import numpy as np 

class VideoGenreEmbedding(tf.keras.Model):
    """Scores a (video, genre) pair from the similarity of their embeddings."""

    def __init__(self, len_videos, len_genres, embedding_dim):
        """Create the layers.

        Args:
            len_videos: vocabulary size of the video embedding table.
            len_genres: vocabulary size of the genre embedding table.
            embedding_dim: width of both embedding tables.
        """
        super(VideoGenreEmbedding, self).__init__()
        self.m_g_input = tf.keras.layers.InputLayer(name='input_layer', input_shape=(2,))
        # embedding
        self.m_embedding = tf.keras.layers.Embedding(name='video_embedding', input_dim=len_videos, output_dim=embedding_dim)
        self.g_embedding = tf.keras.layers.Embedding(name='genre_embedding', input_dim=len_genres, output_dim=embedding_dim)
        # dot product (normalize=True L2-normalises both inputs first)
        self.m_g_merge = tf.keras.layers.Dot(name='video_genre_dot', normalize=True, axes=1)
        # output
        self.m_g_fc = tf.keras.layers.Dense(1, activation='sigmoid')

    # Keras models implement `call`; overriding `__call__` bypasses the
    # framework's own call machinery and is inconsistent with
    # UserVideoEmbedding.  `model(x)` keeps working unchanged.
    def call(self, x):
        """Return the sigmoid score for each pair.

        Args:
            x: indexable pair ``(video_ids, genre_ids)``.
        """
        # x = self.m_g_input(x)
        memb = self.m_embedding(x[0])
        gemb = self.g_embedding(x[1])
        m_g = self.m_g_merge([memb, gemb])
        return self.m_g_fc(m_g)

class UserVideoEmbedding(tf.keras.Model):
    """Scores a (user, video) pair from the dot product of their embeddings."""

    def __init__(self, len_users, len_videos, embedding_dim):
        """Create the layers.

        Args:
            len_users: vocabulary size of the user embedding table.
            len_videos: vocabulary size of the video embedding table.
            embedding_dim: width of both embedding tables.
        """
        super().__init__()
        layers = tf.keras.layers

        self.m_u_input = layers.InputLayer(name='input_layer', input_shape=(2,))
        # Embedding tables.
        self.u_embedding = layers.Embedding(
            name='user_embedding', input_dim=len_users, output_dim=embedding_dim)
        self.m_embedding = layers.Embedding(
            name='video_embedding', input_dim=len_videos, output_dim=embedding_dim)
        # Un-normalised dot product of the two embeddings.
        self.m_u_merge = layers.Dot(name='video_user_dot', normalize=False, axes=1)
        # Single sigmoid output unit.
        self.m_u_fc = layers.Dense(1, activation='sigmoid')

    def call(self, x):
        """Return the sigmoid score for each pair.

        Args:
            x: indexable pair ``(user_ids, video_ids)``.
        """
        user_vec = self.u_embedding(x[0])
        video_vec = self.m_embedding(x[1])
        similarity = self.m_u_merge([video_vec, user_vec])
        return self.m_u_fc(similarity)

In [None]:
# enviroment

import numpy as np
from collections import deque

class StimulateEnv(object):
    """Single-user offline environment that rewards recommending one target video."""

    def __init__(self, user_id, newest_watched_video, users_dict, users_history_lens, state_size):
        """Set up the environment for one user.

        Args:
            user_id: key into `users_dict`.
            newest_watched_video: id of the video a recommendation must
                contain to be rewarded.
            users_dict: mapping user id -> iterable of records whose element 0
                is the video id and element 1 the value stored next to it.
            users_history_lens: history length compared against the number of
                already-watched videos to decide `done`.
            state_size: stored as-is; not used by this class.
        """
        self.user_id = user_id
        self.state_size = state_size
        self.newest_watched_video = newest_watched_video
        self.users_dict = users_dict
        self.users_history_lens = users_history_lens

        history = self.users_dict[self.user_id]
        self.user_items = {data[0]: data[1] for data in history}  # {video_id: rated}
        self.items = [data[0] for data in history]
        self.done = False
        self.old_watched = set(self.items)
        self.done_count = 3000

    def step(self, recommend_item):
        """Score one recommendation.

        Args:
            recommend_item: a single video id or any array-like of ids.

        Returns:
            Tuple ``(items, reward, done, old_watched)`` where `reward` is a
            one-element list: ``[1]`` when the target video is among the
            recommended ids and was not already watched, ``[-0.5]`` otherwise.
        """
        # Normalise to a flat list of plain Python values.  Testing the raw
        # ndarray for set membership raised TypeError (unhashable), and a
        # scalar id raised TypeError on the `in` test.
        recommended = np.atleast_1d(np.asarray(recommend_item)).ravel().tolist()
        target = self.newest_watched_video

        if target in recommended and target not in self.old_watched:
            correctly_recommended = [target]
            rewards = [1]
            # Slide the item window: drop the oldest entry, append the hit.
            self.items = self.items[len(correctly_recommended):] + correctly_recommended
        else:
            rewards = [-0.5]

        # NOTE(review): `old_watched` is never modified by step(), so this
        # condition depends only on the state set up in __init__.
        if len(self.old_watched) > self.done_count or len(self.old_watched) >= self.users_history_lens:
            self.done = True

        return self.items, rewards, self.done, self.old_watched

    # TODO: implement reset(). It is only needed for the simulation stage,
    # where it should pick a random user (and the related information) to
    # start a new episode.
    
    

In [None]:
## ou noise

# --------------------------------------
# Ornstein-Uhlenbeck Noise
# Author: Flood Sung
# Date: 2016.5.4
# Reference: https://github.com/rllab/rllab/blob/master/rllab/exploration_strategies/ou_strategy.py
# --------------------------------------

import numpy as np
import numpy.random as nr

class OUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck style).

    Each call to `noise` pulls the internal state towards `mu` with strength
    `theta` and perturbs it with Gaussian noise scaled by `sigma`.
    """

    def __init__(self,action_dimension,mu=0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        # reset() creates `self.state`, filled with `mu`.
        self.reset()

    def reset(self):
        """Set every component of the state back to `mu`."""
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        """Advance the process by one step and return the new state."""
        previous = self.state
        pull = self.theta * (self.mu - previous)
        kick = self.sigma * nr.randn(len(previous))
        self.state = previous + (pull + kick)
        return self.state

if __name__ == '__main__':
    import matplotlib.pyplot as plt

    # Demo: draw 1000 consecutive samples of a 3-dimensional process and plot them.
    ou = OUNoise(3)
    states = [ou.noise() for _ in range(1000)]

    plt.plot(states)
    plt.show()

In [None]:
## replay 

from collections import deque
import random

class ReplayBuffer(object):
    """Fixed-capacity FIFO store of (state, action, reward, new_state, done) tuples."""

    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.num_experiences = 0
        self.buffer = deque()

    def get_batch(self, batch_size):
        """Return `batch_size` distinct stored experiences chosen at random."""
        return random.sample(self.buffer, batch_size)

    def size(self):
        """Return the capacity of the buffer (not the number of stored items)."""
        return self.buffer_size

    def add(self, state, action, reward, new_state, done):
        """Store one experience, evicting the oldest one once the buffer is full."""
        if self.num_experiences >= self.buffer_size:
            # Full: make room by dropping the oldest entry.
            self.buffer.popleft()
        else:
            self.num_experiences += 1
        self.buffer.append((state, action, reward, new_state, done))

    def count(self):
        """Return how many experiences are stored (at most the capacity)."""
        return self.num_experiences

    def erase(self):
        """Drop every stored experience."""
        self.buffer, self.num_experiences = deque(), 0

In [None]:
## state represent

import tensorflow as tf
import numpy as np

class DRRAveStateRepresentation(tf.keras.Model):
    """State module combining a user embedding with a learned pooling of item embeddings."""

    def __init__(self, embedding_dim, output_dim):
        """Create the layers.

        Args:
            embedding_dim: divisor applied to the item embeddings in `call`.
            output_dim: number of units of the final dense layer.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        # One filter, kernel size 1, stride 1.
        self.wav = tf.keras.layers.Conv1D(filters=1, kernel_size=1, strides=1)
        self.concat = tf.keras.layers.Concatenate()
        self.flatten = tf.keras.layers.Flatten()

        self.output_dim = output_dim
        self.dense = tf.keras.layers.Dense(output_dim, activation=None)

    def call(self, x):
        """Build the state vector.

        Args:
            x: indexable pair ``(user_embedding, item_embeddings)``.
                Presumably shaped (batch, dim) and (batch, n_items, dim) --
                TODO confirm against the callers.

        Returns:
            Output of the dense layer applied to the flattened concatenation
            of the user embedding, the user*pooled-items product and the
            pooled items.
        """
        # Swap the last two axes and scale before the 1x1 convolution.
        scaled_items = tf.transpose(x[1], perm=(0,2,1))/self.embedding_dim
        pooled = self.wav(scaled_items)
        pooled = tf.squeeze(tf.transpose(pooled, perm=(0,2,1)), axis=1)

        interaction = tf.keras.layers.multiply([x[0], pooled])
        merged = self.concat([x[0], interaction, pooled])

        return self.dense(self.flatten(merged))

In [None]:
## tensor grad_inverted

import tensorflow as tf

class GradInverter:
    """Scales action gradients by the remaining distance to the action bounds."""

    def __init__(self, action_bounds):
        """Store the bounds as float32 constants.

        Args:
            action_bounds: pair ``[upper_bounds, lower_bounds]`` of
                equal-length sequences.
        """
        upper = action_bounds[0]
        lower = action_bounds[1]
        self.action_size = len(upper)
        self.pmax = tf.constant(upper, dtype=tf.float32)
        self.pmin = tf.constant(lower, dtype=tf.float32)
        widths = [hi - lo for hi, lo in zip(upper, lower)]
        self.prange = tf.constant(widths, dtype=tf.float32)

    def invert(self, grad, action):
        """Return `grad` rescaled element-wise, as a NumPy array.

        Positive gradient entries are multiplied by ``(pmax - action) / prange``,
        all other entries by ``(action - pmin) / prange``.
        """
        # Accept NumPy arrays as well as tensors.
        action_input = tf.convert_to_tensor(action, dtype=tf.float32)
        act_grad = tf.convert_to_tensor(grad, dtype=tf.float32)

        room_above = (self.pmax - action_input) / self.prange
        room_below = (action_input - self.pmin) / self.prange

        is_positive = tf.greater(act_grad, tf.zeros_like(act_grad))
        rescaled = tf.where(is_positive, act_grad * room_above, act_grad * room_below)

        return rescaled.numpy()


In [None]:

https://github.com/cookbenjamin/DDPG/blob/master/ddpg.py : Lấy thông tin note các files 


### Tham khảo code

https://github.com/openai/baselines/tree/master/baselines/ddpg: OpenAI 
https://github.com/stevenpjg/ddpg-aigym/tree/master: CHÍNH 

https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html: tham khảo
https://github.com/yanpanlau/DDPG-Keras-Torcs/blob/master/ddpg.py : tham khảo

https://github.com/m5823779/motion-planner-reinforcement-learning
https://github.com/m5823779/DDPG/blob/master/ddpg/action_dim%3D1%20(success)/ddpg.py : cấu trúc file rõ ràng - CHÍNH

https://github.com/samkoesnadi/DDPG-tf2/blob/master/src/model.py: chưa thấy phần Batch-normalization

### Đọc thêm 
https://github.com/ghliu/pytorch-ddpg?tab=readme-ov-file



In [None]:
## ddpg 
import numpy as np
from collections import deque
import random
import tensorflow as tf
from model.embedding import UserVideoEmbedding
import sys,os
sys.path.append(os.getcwd()) 

from model.actor import Actor
from model.critic import Critic
from model.tensorflow_grad_inverter import GradInverter as grad_inverter
from model.state_representation import DRRAveStateRepresentation

REPLAY_MEMORY_SIZE = 10000  # maximum number of transitions kept in DDPG.replay_memory
BATCH_SIZE = 64  # number of transitions sampled per training step
GAMMA=0.99  # discount factor applied to the target critic's Q value
is_grad_inverter = True  # when True, DDPG.train passes dQ/da through the GradInverter

# NOTE(review): machine-specific absolute path; the embedding weights are looked up beneath it.
cwd = '/home/tuannm84/Desktop/myclip/vtcc-myclip-recommender-system-v2/myclip_recommender_v2/asset/'
class DDPG(object):
    """ Deep Deterministic Policy Gradient Algorithm """

    def __init__(self, env, users_num, items_num, num_actions, STATE_SIZE, output_dim):
        """Build the actor/critic pair, the replay memory and the embedding models.

        Args:
            env: environment object; only stored on the instance.
            users_num: vocabulary size of the user embedding table.
            items_num: vocabulary size of the video embedding table.
            num_actions: dimensionality of the action vector.
            STATE_SIZE: dimensionality of the state vector.
            output_dim: output width of the state-representation module.
        """
        self.env = env
        self.num_states = STATE_SIZE
        self.num_actions = num_actions  # number of videos to choose from

        # Initialize Actor and Critic networks
        self.critic_net = Critic(self.num_states, self.num_actions)
        self.actor_net = Actor(self.num_states, self.num_actions)

        # Initialize Replay Memory
        self.replay_memory = deque()

        # Initialize time step
        self.time_step = 0
        self.counter = 0

        action_max = [num_actions]
        action_min = [1]
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

        self.embedding_dim = 100
        self.embedding_network = UserVideoEmbedding(users_num, items_num, self.embedding_dim)
        embedding_save_file_dir = os.path.join(cwd, 'dataset/save_weights/user_movie_embedding_case4.h5') #m_g_model_weights.weights.h5'
        assert os.path.exists(embedding_save_file_dir), f"embedding save file directory: '{embedding_save_file_dir}' is wrong."
        # Mark the model as built so that weights can be loaded by name
        # without a prior forward pass.
        self.embedding_network.built = True
        self.embedding_network.load_weights(embedding_save_file_dir, by_name = True, skip_mismatch = True)

        self.srm_ave = DRRAveStateRepresentation(self.embedding_dim, output_dim)
        # One dummy forward pass so that Keras creates the layer weights.
        self.srm_ave([np.zeros((1, self.embedding_dim)), np.zeros((1, STATE_SIZE, self.embedding_dim))])

    def evaluate_actor(self, state_t):
        """Return the main actor's action for `state_t`."""
        return self.actor_net.evaluate_actor(state_t)

    def add_experience(self, observation_1, observation_2, action, reward, done):
        """Append one transition to the replay memory.

        Args:
            observation_1: state before the action.
            observation_2: state after the action.
            action: action taken.
            reward: reward received.
            done: whether the episode ended with this transition.
        """
        self.observation_1 = observation_1 # previous
        self.observation_2 = observation_2 # newest
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2, self.action, self.reward, self.done))
        self.time_step += 1
        # Keep at most REPLAY_MEMORY_SIZE transitions (oldest dropped first).
        if len(self.replay_memory) > REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

    def minibatches(self):
        """Sample BATCH_SIZE transitions and store them as per-field arrays.

        Raises:
            ValueError: from ``random.sample`` when fewer than BATCH_SIZE
                transitions are stored.
        """
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        # States and actions are fed to float32 networks, so force float32
        # here; a float64 array would fail in tf.matmul against the weights.
        # state t
        self.state_t_batch = np.array([item[0] for item in batch], dtype=np.float32)
        # state t+1
        self.state_t_1_batch = np.array([item[1] for item in batch], dtype=np.float32)
        self.action_batch = np.array([item[2] for item in batch], dtype=np.float32).reshape(len(batch), self.num_actions)
        self.reward_batch = np.array([item[3] for item in batch])
        self.done_batch = np.array([item[4] for item in batch])

    def train(self):
        """Run one DDPG update: critic step, actor step, soft target updates."""
        # Sample a random minibatch of N transitions from replay memory
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(self.state_t_1_batch)
        # Q'(s_i+1,a_i+1)
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch, self.action_t_1_batch)

        # y_i = r_i for terminal transitions, r_i + GAMMA * Q'(s_i+1, a_i+1) otherwise.
        next_q = np.asarray(q_t_1, dtype=np.float32).reshape(-1)
        rewards = np.asarray(self.reward_batch, dtype=np.float32).reshape(-1)
        terminal = np.asarray(self.done_batch, dtype=bool).reshape(-1)
        targets = np.where(terminal, rewards, rewards + GAMMA * next_q)
        self.y_i_batch = targets.astype(np.float32).reshape(-1, 1)

        # Update critic by minimizing the loss
        self.critic_net.train_critic(self.state_t_batch, self.action_batch, self.y_i_batch)

        # Update actor proportional to the gradients:
        action_for_delQ = self.evaluate_actor(self.state_t_batch)

        # compute_delQ_a returns the gradient for the whole batch as a single
        # tensor, so it must not be indexed with [0] (that would keep only the
        # first sample's gradient).
        self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch, action_for_delQ)
        if is_grad_inverter:
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ)

        # Train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(self.state_t_batch, self.del_Q_a)

        # Update target Critic and Actor networks
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()

    def recommend_item(self, action, all_items, old_watched, top_k=False, items_ids=None):
        """Pick the item(s) whose embedding scores highest against `action`.

        Args:
            action: action vector; it is expanded on axis 1 and multiplied
                element-wise with the item embeddings.
            all_items: iterable of every known item id.
            old_watched: iterable of item ids to exclude.
            top_k: falsy to return the single best id, otherwise the number of
                ids to return (ordered from lowest to highest score).
            items_ids: optional array of candidate ids; defaults to
                ``all_items`` minus ``old_watched``.

        Returns:
            One item id, or an array of `top_k` item ids.
        """
        if items_ids is None:
            items_ids = np.array(list(set(all_items) - set(old_watched)))

        items_ebs = self.embedding_network.get_layer('video_embedding')(items_ids)
        action = tf.expand_dims(action, axis=1)
        # One score per candidate, laid out as a single row.
        scores = tf.transpose(tf.reduce_sum((items_ebs * action), axis=1, keepdims=True), perm=(1, 0))
        if top_k:
            item_indice = np.argsort(scores)[0][-top_k:]
            return items_ids[item_indice]
        item_idx = np.argmax(scores)
        return items_ids[item_idx]
                 
        




In [None]:
## run 

import tensorflow as tf
import numpy as np
from tensorflow.python.ops.gen_math_ops import Exp
from datetime import datetime
import os
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import time
import random
import sys 
# sys.path.append('/home/tuannm84/Desktop/longbien/Project/MC/')

from model.actor import Actor
from model.critic import Critic
from model.enviroment import StimulateEnv
from model.ddpg import DDPG 
from model.embedding import VideoGenreEmbedding, UserVideoEmbedding
from model.ou_noise import OUNoise


# NOTE(review): machine-specific absolute project root; every dataset path below hangs off it.
Path = '/home/tuannm84/Desktop/longbien/Project/MC'

PATH_USER_DICT = os.path.join(Path, "dataset/user_dict.npy")
PATH_TRAIN_DATASET = os.path.join(Path, "dataset/train_dict.npy")
PATH_EVAL_DATSET = os.path.join(Path, "dataset/eval_dict.npy")
PATH_USER_HISTORY_LENS = os.path.join(Path, 'dataset/users_history_len_local.npy')
PATH_DICTIONARY = os.path.join(Path, "dataset/dictionary.npy")
PATH_DATA_NUMBER = os.path.join(Path, "dataset/data_number.npy")

# Load the pickled .npy files; .item() extracts the single Python object stored in each
# (presumably dicts, judging by how they are indexed below).
# NOTE(review): allow_pickle receives the string 'TRUE' rather than the bool True.
users_dict = np.load(PATH_USER_DICT,allow_pickle='TRUE').item()
eval_users_dict = np.load(PATH_EVAL_DATSET,allow_pickle='TRUE').item()
train_users_dict = np.load(PATH_TRAIN_DATASET,allow_pickle='TRUE').item()
dictionary = np.load(PATH_DICTIONARY,allow_pickle='TRUE').item()
data_number = np.load(PATH_DATA_NUMBER,allow_pickle='TRUE').item()
users_history_lens = np.load(PATH_USER_HISTORY_LENS, allow_pickle='TRUE')  # NOTE(review): overwritten a few lines below
all_items = {data[0] for i, k in users_dict.items() for data in k}  ## set of every video id in the whole dataset

user_dataset = eval_users_dict
user_id = 10 
users_history_lens = len(user_dataset[user_id])
# NOTE(review): the target video is drawn from user 11's records while user_id is 10 -- confirm this is intended.
newest_watched_video = np.random.choice([i[0] for i in user_dataset[11]])
watched_videos = [data[0] for data in users_dict[user_id]]
items_ids = np.array(list(set(all_items) - set(watched_videos)))  # candidate videos the user has not watched
len_items_ids = len(items_ids)
STATE_SIZE = len_items_ids ## 1445 is the number of videos left after removing those already in the watch history
num_actions = len_items_ids
output_dim = len_items_ids
users_num = data_number['users_num']
items_num = data_number['items_num']

env_prod = StimulateEnv(user_id, newest_watched_video, users_dict, users_history_lens, STATE_SIZE)
recommender  = DDPG(env_prod, users_num, items_num, num_actions, STATE_SIZE, output_dim)  # output_dim is the output size of the state embedding; set to 1445 because the input of actor.evaluate_actor is (1445,400)

EMBEDDING_SIZE = 100  # NOTE(review): not referenced later in this cell
epsilon_for_priority = 1e-6  # NOTE(review): not referenced later in this cell
batch_size = 32  # NOTE(review): not referenced later in this cell
num_actions = len_items_ids ## Number of list video to be choosed
num_actions = len_items_ids  # NOTE(review): duplicate of the assignment above
exploration_noise = OUNoise(num_actions)


#######################################################################################
"""
CASE 1: STATE from ITEMS_IDS => feed into EVALUATE_ACTOR => ACTION  
"""
# CASE 1: the candidate item ids themselves are used as the state vector.
x = items_ids
array_x = np.reshape(x,[1, num_actions])  # NOTE(review): never read afterwards in this cell
state_value = tf.convert_to_tensor(x, dtype=tf.float32)
state_value = tf.expand_dims(state_value, axis=0)  # add a leading batch axis



""" 
CASE 2: STATE from ITEM_IDS + STATE_REPRESENTATION + USER_EMDS => STATE => feed into EVALUATE_ACTOR => ACTION 
"""
# CASE 2: look up the user and candidate-item embeddings, fold them into a state
# with the state-representation module, then query the actor.
user_eb = recommender.embedding_network.get_layer('user_embedding')(np.array(user_id))
items_eb = recommender.embedding_network.get_layer('video_embedding')(np.array(items_ids))
state = recommender.srm_ave([np.expand_dims(user_eb, axis=0), np.expand_dims(items_eb, axis=0)])
action = recommender.evaluate_actor(state)  # NOTE(review): overwritten further down by the CASE 1 action before being used



#######################################################################################

# One interaction step driven by the CASE 1 state.
action = recommender.evaluate_actor(state_value)
noise = exploration_noise.noise()
action = action[0] + noise  # drop the batch axis and add exploration noise
recommended_item = recommender.recommend_item(action, all_items, env_prod.old_watched, top_k= 5)  # the 5 best-scoring candidates
next_items_ids_embs, reward, done, _ = env_prod.step(recommended_item)
reward = np.sum(reward)  # step() returns a list of rewards; collapse it to a scalar

# agent.add_experience(state_value, next_items_ids,action,reward,done)



""" 
2 cách trên đang ra cùng 1 kết quả (xem tại jupyter notebook)

tiếp theo: Hoàn thiện Circle workflow - phần train và phần minibatch thử chạy xem tiếp ntn và cái replay hoạt động ổn k ra sao

"""