In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.python.ops.gen_math_ops import Exp
from datetime import datetime
import os
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import time
import random
import sys 

from model.actor import Actor
from model.critic import Critic
from model.enviroment import StimulateEnv
from model.ddpg import DDPG 
from model.embedding import VideoGenreEmbedding, UserVideoEmbedding
from model.ou_noise import OUNoise

In [2]:
# Data locations.
# Both values default to the original local directories and can be overridden with
# environment variables, so the notebook can run on another machine without editing code.
#   DRL_HUB_DIR     -> directory joined with the *.npy file names when loading artefacts
#   DRL_DATASET_DIR -> MovieLens 1M dataset folder
hub = os.environ.get(
    'DRL_HUB_DIR',
    '/Users/minhtuan/Documents/Documents/Work/Hanoi/Hub',
)
path = os.environ.get(
    'DRL_DATASET_DIR',
    '/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/X/movielens/airflow_folder/drl_melody/storage/dataset1M',
)

In [3]:
# Evaluation walkthrough for a single user:
# load the saved dataset artefacts, split the user's records into history / streaming parts,
# build the environment and the DDPG recommender, then run one
# actor -> recommend -> environment step -> replay-memory pass.

PATH_USER_DICT = os.path.join(hub, "user_dict.npy")
PATH_EVAL_DATSET = os.path.join(hub, "eval_dict.npy")
PATH_USER_HISTORY_LENS = os.path.join(hub, 'users_history_len.npy')
PATH_DATA_NUMBER = os.path.join(hub, "data_number.npy")

# allow_pickle expects a boolean. NOTE: unpickling can execute arbitrary code, so only
# load .npy files that were produced by this project.
users_dict = np.load(PATH_USER_DICT, allow_pickle=True).item()
eval_users_dict = np.load(PATH_EVAL_DATSET, allow_pickle=True).item()
data_number = np.load(PATH_DATA_NUMBER, allow_pickle=True).item()
# This loaded array is not used in this cell: the name is reassigned to the per-user split index below.
users_history_lens = np.load(PATH_USER_HISTORY_LENS, allow_pickle=True)
# First field of every record of every user, i.e. the video ids of the whole dataset.
all_items = {data[0] for i, k in users_dict.items() for data in k}

user_dataset = eval_users_dict
user_id = 4834

# Video ids of the selected user, in stored order (built once, sliced twice below).
user_video_ids = [data[0] for data in user_dataset[user_id]]

# Split the user's evaluation records in two: the first 60% is the history,
# the remainder is the stream of videos watched afterwards.
users_history_lens = round(len(user_dataset[user_id]) * 0.6)
watched_videos = user_video_ids[:users_history_lens]
streaming_videos = user_video_ids[users_history_lens:]
if not streaming_videos:
    # Without this guard the indexing below would fail with a bare IndexError.
    raise ValueError(f"user {user_id} has no held-out videos after the 60% history split")
user_dict_history = {user_id: eval_users_dict[user_id][:users_history_lens]}
# newest_watched_video = np.random.choice([i[0] for i in user_dataset[user_id]])
newest_watched_video = streaming_videos[0]
# Candidate pool: every video id that is not in the user's history.
items_ids = np.array(list(set(all_items) - set(watched_videos)))
len_items_ids = len(items_ids)
users_num = data_number['users_num']
items_num = data_number['items_num']

# All three sizes were previously len_items_ids (1445 = number of videos left after
# removing the ones already watched in the history); they are now fixed to 5.
STATE_SIZE = 5   # number of most recent watched videos that form the state
num_actions = 5  # number of videos to be chosen
output_dim = 5   # output size of the state embedding

# NOTE(review): the output saved for this cell is
# "TypeError: StimulateEnv.__init__() takes 3 positional arguments but 6 were given";
# confirm this call matches the constructor currently defined in model/enviroment.py.
env_prod = StimulateEnv(user_id, newest_watched_video, user_dict_history, users_history_lens, STATE_SIZE)
# output_dim is the output of the state embedding. It used to be 1445 because the input of
# actor.evaluate_actor was (1445, 400); it is 5 in this configuration.
recommender = DDPG(env_prod, users_num, items_num, num_actions, STATE_SIZE, output_dim)


EMBEDDING_SIZE = 100
epsilon_for_priority = 1e-6
batch_size = 32
exploration_noise = OUNoise(num_actions)

# The state comes from the user's history: the last STATE_SIZE watched videos.

# x = items_ids
x = watched_videos[- STATE_SIZE:]
# Reshaping also fails fast if the history holds fewer than num_actions videos.
array_x = np.reshape(x, [1, num_actions])
state_value = tf.convert_to_tensor(x, dtype=tf.float32)
state_value = tf.expand_dims(state_value, axis=0)  # add a leading axis of size 1

# Actor output (described by the author as video ranking weights) plus exploration noise.
action = recommender.evaluate_actor(state_value)
noise = exploration_noise.noise()
action = action[0] + noise

# Turn the action into a list of suggested videos and feed it to the environment.
recommended_item = recommender.recommend_item(action, all_items, env_prod.old_watched, top_k= 5)
next_items_ids_embs, reward, done, _ = env_prod.step(recommended_item)

array_x_next = np.reshape(next_items_ids_embs, [1, num_actions])
state_value_next = tf.convert_to_tensor(next_items_ids_embs, dtype=tf.float32)
state_value_next = tf.expand_dims(state_value_next, axis=0)

# Collapse the reward returned by the step into one scalar before storing it.
reward = np.sum(reward)

# Store (s_t, s_t+1, action, reward, done) in the replay memory.
recommender.add_experience(state_value, state_value_next, action, reward, done)


TypeError: StimulateEnv.__init__() takes 3 positional arguments but 6 were given

#### Note flow 

`reset` takes 5 videos from the user's history; these 5 videos
        -> embedding -> state -> actor.network 
        -> action (a ranking of videos based on the user) -> fed to recommender.recommend_item (which picks the videos to suggest) 

In [34]:
# Inspect the last run: the recommended items, env_prod.old_watched,
# the first streamed video, and the state `x` that was fed to the actor.
recommended_item, env_prod.old_watched, newest_watched_video, x

(array([2538,  851,  838, 1880, 2900]),
 [2395, 2762, 2724, 314, 2702],
 2840,
 [2395, 2762, 2724, 314, 2702])

1/9: finished the `evaluate_actor` and prediction flow

next step: replay_buffer and train sections

## TRAIN SECTION

In [5]:
# Training setup: load the saved dataset artefacts and build the environment for one
# evaluation user, whose records are split 60% history / 40% streaming.

PATH_USER_DICT = os.path.join(hub, "user_dict.npy")
PATH_EVAL_DATSET = os.path.join(hub, "eval_dict.npy")
PATH_USER_HISTORY_LENS = os.path.join(hub, 'users_history_len.npy')
PATH_DATA_NUMBER = os.path.join(hub, "data_number.npy")

# allow_pickle expects a boolean. NOTE: unpickling can execute arbitrary code, so only
# load .npy files that were produced by this project.
users_dict = np.load(PATH_USER_DICT, allow_pickle=True).item()
eval_users_dict = np.load(PATH_EVAL_DATSET, allow_pickle=True).item()
data_number = np.load(PATH_DATA_NUMBER, allow_pickle=True).item()
# This loaded array is not used in this cell: the name is reassigned to the per-user split index below.
users_history_lens = np.load(PATH_USER_HISTORY_LENS, allow_pickle=True)
# First field of every record of every user, i.e. the video ids of the whole dataset.
all_items = {data[0] for i, k in users_dict.items() for data in k}

user_dataset = eval_users_dict
users_num = data_number['users_num']
items_num = data_number['items_num']

STATE_SIZE = 5   # number of most recent watched videos that form the state
num_actions = 5  # number of videos to be chosen
output_dim = 5   # output size of the state embedding

user_id = 4833
# Video ids of the selected user, in stored order (built once, sliced twice below).
user_video_ids = [video[0] for video in eval_users_dict[user_id]]
users_history_lens = round(len(eval_users_dict[user_id]) * 0.6)
watched_videos = user_video_ids[:users_history_lens]
streaming_videos = user_video_ids[users_history_lens:]
if not streaming_videos:
    # Without this guard the indexing below would fail with a bare IndexError.
    raise ValueError(f"user {user_id} has no held-out videos after the 60% history split")
newest_watched_video = streaming_videos[0]
# NOTE(review): the whole eval_users_dict is passed here rather than a history-only
# dict for this user; confirm which one StimulateEnv expects.
enviroment = StimulateEnv(user_id, newest_watched_video, eval_users_dict, users_history_lens, STATE_SIZE)

In [7]:

# DDPG training loop over the evaluation users' held-out streams.

# Tunable parameters.
episodes = 10000
steps = 20  # maximum number of environment steps per episode

# Randomly initialise critic, actor, target critic, target actor network and replay buffer.
# This is done once: rebuilding the agent inside the episode loop would throw away the
# learned weights and the replay memory while `counter` keeps enabling agent.train().
# NOTE(review): the agent keeps the environment it was constructed with; if DDPG reads
# per-user state from that object, rebind it after the environment is rebuilt below.
# output_dim is the output of the state embedding (it used to be 1445 because the input
# of actor.evaluate_actor was (1445, 400)).
agent = DDPG(enviroment, users_num, items_num, num_actions, STATE_SIZE, output_dim)
exploration_noise = OUNoise(num_actions)
counter = 0
reward_per_episode = 0
total_reward = 0
# Per-episode rewards that get written to disk (the array starts with a single 0 entry).
reward_st = np.array([0])


for i in range(0, episodes):
    print("==== Starting episode no:",i,"====","\n")
    user_id, watched_videos, done = enviroment.reset()

    # Split this user's records: first 60% is the history, the rest is the stream.
    user_video_ids = [video[0] for video in eval_users_dict[user_id]]
    users_history_lens = round(len(eval_users_dict[user_id]) * 0.6)
    watched_videos = user_video_ids[:users_history_lens]
    newest_watched_video = user_video_ids[users_history_lens:]
    if not newest_watched_video:
        # Nothing to stream for this user; skip instead of failing with an IndexError.
        print("Skipping user", user_id, ": no held-out videos")
        continue
    # The start video was indexed by the episode number, which overflows as soon as
    # i reaches the stream length. Wrapping keeps the old value for every in-range i.
    # NOTE(review): confirm that cycling through the stream is the intended behaviour.
    newest_watched_video_start = newest_watched_video[i % len(newest_watched_video)]
    enviroment = StimulateEnv(user_id, newest_watched_video_start, eval_users_dict, users_history_lens,STATE_SIZE)

    reward_per_episode = 0
    for t in range(0, steps):
        x = watched_videos[- STATE_SIZE:]
        # Shape the state as (1, num_actions) for evaluate_actor.
        state_value = tf.convert_to_tensor(x, dtype=tf.float32)
        state_value = tf.expand_dims(state_value, axis=0)
        actor_input = np.reshape(state_value, [1, num_actions])
        print(actor_input)
        action = agent.evaluate_actor(actor_input)
        noise = exploration_noise.noise()
        # Select the action according to the current policy plus exploration noise.
        action = action[0] + noise

        next_items_ids_embs, reward, done, _ = enviroment.step(action)
        state_value_next = tf.convert_to_tensor(next_items_ids_embs, dtype=tf.float32)
        state_value_next = tf.expand_dims(state_value_next, axis=0)
        state_value_next = np.reshape(state_value_next, [1, num_actions])
        print("_"*64)
        print(state_value_next)
        # Add s_t, s_t+1, action, reward to the experience memory.
        # NOTE(review): the output saved for this cell ends in a ValueError from a
        # batch-normalization layer (expected ndim=2, received shape (5, 1, 400)).
        # Both states are stored with a leading axis of size 1; presumably train()
        # stacks them into a rank-3 batch. Confirm in model/ddpg.py.
        agent.add_experience(state_value, state_value_next, action, reward, done)
        # Train critic and actor network once a few transitions have been collected.
        if counter > 5:
            agent.train()
        reward_per_episode += reward[0]
        counter += 1
        # The episode also ends when the held-out stream has no video left to append;
        # otherwise newest_watched_video[t] below would raise an IndexError.
        stream_exhausted = t >= len(newest_watched_video)
        if done or (t == steps-1) or stream_exhausted:
            print('EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode)
            print("Printing reward to file")
            exploration_noise.reset()  # reinitialise the random noise for action exploration
            reward_st = np.append(reward_st,reward_per_episode)
            np.savetxt('episode_reward.txt',reward_st, newline="\n")
            print('\n\n')
            break
        # Slide the state window: the next streamed video joins the watched history.
        watched_videos.append(newest_watched_video[t])
    # Accumulate per episode. This used to sit outside the loop, so only the last
    # episode was counted in the average printed below.
    total_reward += reward_per_episode
print("Average reward per episode {}".format(total_reward / episodes))


  _load_state(
  _load_state(
  _load_state(
  _load_state(
  _load_state(
  _load_state(


==== Starting episode no: 0 ==== 

[[ 750.  608.  919. 1198. 1196.]]
3504 tf.Tensor([0.02020714 0.24790782 0.02709563 0.41056746 0.297307  ], shape=(5,), dtype=float32)
________________________________________________________________
[[1259. 1208.  508. 1265. 3361.]]
[[ 608.  919. 1198. 1196. 3504.]]
3504 tf.Tensor([ 0.327122    0.03632209 -0.20857538  0.09027514  0.3780679 ], shape=(5,), dtype=float32)
________________________________________________________________
[[1259. 1208.  508. 1265. 3361.]]
[[ 919. 1198. 1196. 3504. 1234.]]
3504 tf.Tensor([ 0.95536685 -0.19035113 -0.39501965  0.21053302  0.5716473 ], shape=(5,), dtype=float32)
________________________________________________________________
[[1259. 1208.  508. 1265. 3361.]]
[[1198. 1196. 3504. 1234.  593.]]
3504 tf.Tensor([ 0.8368488  -0.08864509 -0.07643326  0.14995944  0.22336231], shape=(5,), dtype=float32)
________________________________________________________________
[[1259. 1208.  508. 1265. 3361.]]
[[1196. 3504. 1234

ValueError: Input 0 of layer "batch_normalization_36" is incompatible with the layer: expected ndim=2, found ndim=3. Full shape received: (5, 1, 400)

In [19]:
# Sanity check: draw 5 entries from the agent's replay memory and show how many came back.
len(random.sample(population=agent.replay_memory, k=5))



5