In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import gym
from gym import spaces
import torch
import torch.nn as nn
import numpy as np
from collections import deque
import random
from itertools import count
import torch.nn.functional as F
from tensorboardX import SummaryWriter

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device.type)

cuda


In [4]:
class ValueNetwork(nn.Module):
    def __init__(self):
        super(ValueNetwork, self).__init__()

        self.fc1 = nn.Linear(512, 1024)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 512)

    def forward(self, dialog_current_state):
        y = self.relu(self.fc1(dialog_current_state))
        y = self.relu(self.fc2(y))
        current_state = self.fc3(y)

        return current_state

    def current_state(self, dialog_current_state):
        with torch.no_grad():
            current_state = self.forward(dialog_current_state)
            
        return current_state
    
valueNetwork = ValueNetwork().to(device)
dialog_current_state = torch.randn((512,), dtype=torch.float32, device=device)

In [5]:

class DialogEnv(gym.Env):
    def __init__(self):
        super(DialogEnv, self).__init__()
        # Định nghĩa không gian hành động và không gian trạng thái
        self.action_space = spaces.Discrete(2)  # Hành động là 0 hoặc 1
        self.observation_space = spaces.Box(low=-float('inf'), high=float('inf'), shape=())  # Không gian trạng thái là không biết trước

        # Khởi tạo vector trạng thái ban đầu
        self.state = valueNetwork.current_state(dialog_current_state).to(device)

        self.current_state = None
        self.max_steps = 20
        self.current_step = 0

    def step(self, action):
        self.current_step += 1
        done = False

        # Xác định phần thưởng dựa trên hành động và trạng thái hiện tại
        if action == 0: # recommend
            if np.random.random() < 0.8:  # Khả năng thành công
                reward = 1  # rec_suc: cộng nhiều điểm
            else:
                reward = -1  # rec_fail: trừ nhiều điểm
        else:  # ask
            if np.random.random() < 0.6:  # Khả năng thành công
                reward = 0.3  # ask_suc: cộng ít điểm
            else:
                reward = -0.1  # ask_fail: trừ ít điểm

        # Diễn ra tình hình trạng thái tiếp theo
        if self.current_step >= self.max_steps:
            reward -= 0.7
            done = True
        else:
            self.current_state = valueNetwork.current_state(dialog_current_state).to(device)

        return self.current_state, reward, done, {}

    def reset(self):
        # Khởi tạo trạng thái ban đầu
        self.current_step = 0
        self.state = valueNetwork.current_state(dialog_current_state).to(device)
        return self.state


# Đăng ký môi trường
gym.register("DialogEnv-v0", entry_point=DialogEnv)

# Sử dụng môi trường
env = gym.make("DialogEnv-v0")


  logger.warn(


In [6]:
# env.reset()
# for _ in range(21):
    
#     action = env.action_space.sample()  # Lấy một hành động ngẫu nhiên

#     state, reward, done, info = env.step(action)

#     print("State:", state, "Action:", action, "Reward:", reward, 'Info:', info, "Current_step:", env.current_step)
#     if done:
#         print("Episode finished after", _ + 1, "timesteps")
#         break


In [7]:

class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()

        self.fc1 = nn.Linear(512, 1024)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 2)

    def forward(self, state):
        y = self.relu(self.fc1(state))
        y = self.relu(self.fc2(y))
        Q = self.fc3(y)

        return Q

    def select_action(self, state):
        with torch.no_grad():
            Q = self.forward(state)
            action_index = torch.argmax(Q, dim=1)
        return action_index.item()


class Memory(object):
    def __init__(self, memory_size: int) -> None:
        self.memory_size = memory_size
        self.buffer = deque(maxlen=self.memory_size)

    def add(self, experience) -> None:
        self.buffer.append(experience)

    def size(self):
        return len(self.buffer)

    def sample(self, batch_size: int, continuous: bool = True):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        if continuous:
            rand = random.randint(0, len(self.buffer) - batch_size)
            return [self.buffer[i] for i in range(rand, rand + batch_size)]
        else:
            indexes = np.random.choice(np.arange(len(self.buffer)), size=batch_size, replace=False)
            return [self.buffer[i] for i in indexes]

    def clear(self):
        self.buffer.clear()




In [10]:

onlineQNetwork = DQN().to(device)
targetQNetwork = DQN().to(device)
targetQNetwork.load_state_dict(onlineQNetwork.state_dict())

optimizer = torch.optim.Adam(onlineQNetwork.parameters(), lr=1e-4)

GAMMA = 0.99
EPSILON_DECAY = 0.999
INITIAL_EPSILON = 1
FINAL_EPSILON = 0.001
REPLAY_MEMORY = 1000
BATCH = 16
UPDATE_STEPS = 4

memory_replay = Memory(REPLAY_MEMORY)
epsilon = INITIAL_EPSILON
learn_steps = 0
writer = SummaryWriter('logs/dqn')
begin_learn = False
epochs = 1000

# for epoch in count():
for epoch in range(epochs):
    state = env.reset()
    
    episode_reward = 0
    for time_steps in range(200):
        p = random.random()
        if p < epsilon:
            action = random.randint(0, 1)
        else:
            tensor_state = state.unsqueeze(0).to(device)
#             print(tensor_state)
            action = onlineQNetwork.select_action(tensor_state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        memory_replay.add((state, next_state, action, reward, done))
        if memory_replay.size() >= REPLAY_MEMORY:
            if begin_learn is False:
                print('learn begin!')
                begin_learn = True
            learn_steps += 1
            if learn_steps % UPDATE_STEPS == 0:
                targetQNetwork.load_state_dict(onlineQNetwork.state_dict())
            batch = memory_replay.sample(BATCH, True)
            batch_state, batch_next_state, batch_action, batch_reward, batch_done = zip(*batch)

            batch_state = torch.stack(batch_state).to(device)
            batch_next_state = torch.stack(batch_next_state).to(device)
            batch_action = torch.FloatTensor(batch_action).unsqueeze(1).to(device)
            batch_reward = torch.FloatTensor(batch_reward).unsqueeze(1).to(device)
            batch_done = torch.FloatTensor(batch_done).unsqueeze(1).to(device)

            with torch.no_grad():
                targetQ_next = targetQNetwork(batch_next_state)
                y = batch_reward + (1 - batch_done) * GAMMA * torch.max(targetQ_next, dim=1, keepdim=True)[0]

            loss = F.mse_loss(onlineQNetwork(batch_state).gather(1, batch_action.long()), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            writer.add_scalar('loss', loss.item(), global_step=learn_steps)

            if epsilon > FINAL_EPSILON:
                epsilon *= EPSILON_DECAY

        if done:
            break
        state = next_state

    writer.add_scalar('episode reward', episode_reward, global_step=epoch)
    if epoch % 10 == 0:
        torch.save(onlineQNetwork.state_dict(), 'dqn-policy.para')
        print('Ep {}\tMoving average score: {:.2f}\t'.format(epoch, episode_reward))

Ep 0	Moving average score: 7.40	
Ep 10	Moving average score: 5.80	
Ep 20	Moving average score: 1.60	
Ep 30	Moving average score: 10.30	
Ep 40	Moving average score: 8.50	
learn begin!
Ep 50	Moving average score: 3.90	
Ep 60	Moving average score: 8.50	
Ep 70	Moving average score: 8.90	
Ep 80	Moving average score: 7.40	
Ep 90	Moving average score: 10.40	
Ep 100	Moving average score: 2.60	
Ep 110	Moving average score: 10.30	
Ep 120	Moving average score: 13.30	
Ep 130	Moving average score: 10.60	
Ep 140	Moving average score: 13.50	
Ep 150	Moving average score: 10.60	
Ep 160	Moving average score: 1.30	
Ep 170	Moving average score: 12.00	
Ep 180	Moving average score: 13.30	
Ep 190	Moving average score: 14.20	
Ep 200	Moving average score: 17.30	
Ep 210	Moving average score: 9.50	
Ep 220	Moving average score: 10.60	
Ep 230	Moving average score: 2.60	
Ep 240	Moving average score: 16.20	
Ep 250	Moving average score: 7.30	
Ep 260	Moving average score: 16.20	
Ep 270	Moving average score: 11.30	
Ep 