# Lab 4. 利用Deep Q-Network訓練

大家好，歡迎各位再度回到增強式學習的世界。
這次我們要利用深度網路來作為 Q Function，以達到比使用 Q-Table 更好的效果。

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym


# Cheating mode speeds up the training process: when True, the training loop
# replaces the environment reward with a hand-shaped one (cart position and
# pole angle) and runs 400 episodes instead of 4000.
CHEAT = True


# Q-function approximator with a single hidden layer: a state goes in and one
# score per action comes out. The agent picks among actions by these scores,
# so the training goal is that, for the current state, actions that are better
# for the future receive higher scores.
class Net(nn.Module):
    """Two-layer fully connected network mapping a state to per-action scores."""

    def __init__(self, n_states, n_actions, n_hidden):
        """Create the layers.

        Args:
            n_states: size of the state vector (input width).
            n_actions: number of actions (output width).
            n_hidden: number of units in the hidden layer.
        """
        super().__init__()

        # Input (state) -> hidden, then hidden -> output (one score per action).
        self.fc1 = nn.Linear(n_states, n_hidden)
        self.out = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        """Return the action scores for ``x``: out(relu(fc1(x)))."""
        return self.out(F.relu(self.fc1(x)))


# A Deep Q-Network agent is made of two networks: the evaluation network
# (eval_net), which is the one being trained, and the target network
# (target_net), which provides the bootstrap targets. It also needs a replay
# memory to store experience, plus the configured hyper-parameters.
class DQN(object):
    """DQN agent with an epsilon-greedy policy and a fixed-size replay memory."""

    def __init__(self, n_states, n_actions, n_hidden, batch_size, lr, epsilon, gamma, target_replace_iter, memory_capacity):
        """Build the networks, optimizer and replay memory.

        Args:
            n_states: size of the state vector.
            n_actions: number of discrete actions.
            n_hidden: hidden-layer width of both networks.
            batch_size: number of transitions sampled per learning step.
            lr: learning rate of the Adam optimizer.
            epsilon: probability of taking a uniformly random action.
            gamma: reward discount factor.
            target_replace_iter: copy eval_net into target_net every this many
                learning steps.
            memory_capacity: number of transitions the replay memory holds.
        """
        self.eval_net, self.target_net = Net(n_states, n_actions, n_hidden), Net(n_states, n_actions, n_hidden)
        # Start both networks from the same weights; otherwise the first
        # target_replace_iter updates chase targets from an unrelated network.
        self.target_net.load_state_dict(self.eval_net.state_dict())

        # Replay memory: one row per transition, laid out as
        # [state (n_states) | action | reward | next_state (n_states)].
        self.memory = np.zeros((memory_capacity, n_states * 2 + 2))
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=lr)
        self.loss_func = nn.MSELoss()
        self.memory_counter = 0      # total number of transitions ever stored
        self.learn_step_counter = 0  # for target network update

        self.n_states = n_states
        self.n_actions = n_actions
        self.n_hidden = n_hidden
        self.batch_size = batch_size
        self.lr = lr
        self.epsilon = epsilon
        self.gamma = gamma
        self.target_replace_iter = target_replace_iter
        self.memory_capacity = memory_capacity

    def choose_action(self, state):
        """Pick an action for ``state`` with the epsilon-greedy policy.

        With probability epsilon the action is uniformly random, which lets
        the agent wander and collect new experience; otherwise it is the
        action with the highest eval_net score.

        Returns:
            The chosen action index as a Python int.
        """
        if np.random.uniform() < self.epsilon:  # explore
            return int(np.random.randint(0, self.n_actions))

        # Exploit: score every action with eval_net and take the best one.
        # No gradient is needed just to select an action.
        x = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            actions_value = self.eval_net(x)
        return int(torch.max(actions_value, 1)[1].item())

    def store_transition(self, state, action, reward, next_state):
        """Store one experience in the replay memory.

        Slots are used as a ring buffer: once the memory is full, the oldest
        transition is overwritten.
        """
        # Pack the experience into a single row
        transition = np.hstack((state, [action, reward], next_state))

        # Replace the old memory with new memory
        index = self.memory_counter % self.memory_capacity
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        """Run one gradient step on a random batch drawn from the replay memory.

        Only slots that actually hold a stored transition are sampled (with
        replacement). If nothing has been stored yet the call does nothing.
        """
        n_stored = min(self.memory_counter, self.memory_capacity)
        if n_stored == 0:
            return

        # Randomly select a batch of memory to learn from
        sample_index = np.random.choice(n_stored, self.batch_size)
        b_memory = self.memory[sample_index, :]
        n = self.n_states
        b_state = torch.as_tensor(b_memory[:, :n], dtype=torch.float32)
        b_action = torch.from_numpy(b_memory[:, n:n + 1].astype(np.int64))
        b_reward = torch.as_tensor(b_memory[:, n + 1:n + 2], dtype=torch.float32)
        b_next_state = torch.as_tensor(b_memory[:, -n:], dtype=torch.float32)

        # Gap between the Q value predicted by eval_net and the target built
        # from target_net.
        # Q values of the actions that were actually taken in those states.
        q_eval = self.eval_net(b_state).gather(1, b_action)
        # NOTE: transitions do not record whether next_state was terminal, so
        # the target always bootstraps from next_state.
        with torch.no_grad():  # targets are constants; don't backpropagate
            q_next = self.target_net(b_next_state)
            q_target = b_reward + self.gamma * q_next.max(1)[0].view(self.batch_size, 1)
        loss = self.loss_func(q_eval, q_target)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Every target_replace_iter learning steps, refresh target_net by
        # copying eval_net into it.
        self.learn_step_counter += 1
        if self.learn_step_counter % self.target_replace_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())


# Training script: trains a DQN agent on CartPole and prints the length and
# total reward of every episode.
if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env = env.unwrapped # For cheating mode to access values hidden in the environment

    # Environment parameters
    n_actions = env.action_space.n
    n_states = env.observation_space.shape[0]

    # Hyper parameters
    n_hidden = 50
    batch_size = 32
    lr = 0.01                 # learning rate
    epsilon = 0.1             # epsilon-greedy, factor to explore randomly
    gamma = 0.9               # reward discount factor
    target_replace_iter = 100 # target network update frequency
    memory_capacity = 2000
    n_episodes = 400 if CHEAT else 4000

    # Create DQN
    dqn = DQN(n_states, n_actions, n_hidden, batch_size, lr, epsilon, gamma, target_replace_iter, memory_capacity)

    # Training. The try/finally makes sure the environment (and its render
    # window) is closed even if training fails or is interrupted.
    try:
        for i_episode in range(n_episodes):
            t = 0 # timestep
            rewards = 0 # accumulate rewards for each episode

            # Reset environment to initial state for each episode. Newer gym
            # releases return (observation, info); older ones return only the
            # observation.
            reset_result = env.reset()
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

            while True:
                env.render()

                # Agent takes action
                action = dqn.choose_action(state) # choose an action based on DQN
                step_result = env.step(action) # do the action, get the reward
                if len(step_result) == 5:
                    # Newer gym API: (obs, reward, terminated, truncated, info)
                    next_state, reward, terminated, truncated, info = step_result
                    done = terminated or truncated
                else:
                    # Older gym API: (obs, reward, done, info)
                    next_state, reward, done, info = step_result

                # Cheating part: modify the reward to speed up training process
                if CHEAT:
                    x, v, theta, omega = next_state
                    r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8 # reward 1: the closer the cart is to the center, the better
                    r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5 # reward 2: the closer the pole is to the center, the better
                    reward = r1 + r2

                # Keep the experience in memory
                dqn.store_transition(state, action, reward, next_state)

                # Accumulate reward
                rewards += reward

                # Start training once enough experience has been collected
                if dqn.memory_counter > memory_capacity:
                    dqn.learn()

                # Move on to the next state
                state = next_state

                if done:
                    print('Episode finished after {} timesteps, total rewards {}'.format(t+1, rewards))
                    break

                t += 1
    finally:
        env.close()

Episode finished after 9 timesteps, total rewards 1.9419941711389472
Episode finished after 11 timesteps, total rewards 2.5835927210823257
Episode finished after 9 timesteps, total rewards 1.482335843574942
Episode finished after 9 timesteps, total rewards 2.687485169310189
Episode finished after 9 timesteps, total rewards 2.493883846415969
Episode finished after 9 timesteps, total rewards 1.8961193465113673
Episode finished after 11 timesteps, total rewards 3.2720242916565696
Episode finished after 9 timesteps, total rewards 2.364456480107988
Episode finished after 9 timesteps, total rewards 2.4771731853576977
Episode finished after 9 timesteps, total rewards 0.851510939436956
Episode finished after 9 timesteps, total rewards 2.463685634561482
Episode finished after 8 timesteps, total rewards 1.0378496893671727
Episode finished after 10 timesteps, total rewards 2.678929347345922
Episode finished after 9 timesteps, total rewards 1.4393878337492976
Episode finished after 8 timesteps, to

Episode finished after 8 timesteps, total rewards 1.520824473193941
Episode finished after 10 timesteps, total rewards 1.4364681849368632
Episode finished after 10 timesteps, total rewards 2.5430242378530843
Episode finished after 8 timesteps, total rewards 1.3213255629983225
Episode finished after 9 timesteps, total rewards 1.9564414420496958
Episode finished after 10 timesteps, total rewards 2.4125580073767035
Episode finished after 9 timesteps, total rewards 2.556135897527449
Episode finished after 9 timesteps, total rewards 1.974190605253119
Episode finished after 10 timesteps, total rewards 2.3666037065357726
Episode finished after 10 timesteps, total rewards 1.688945122504824
Episode finished after 10 timesteps, total rewards 2.2856231208706292
Episode finished after 10 timesteps, total rewards 3.2093664545711102
Episode finished after 10 timesteps, total rewards 3.1565980870741295
Episode finished after 9 timesteps, total rewards 2.526962167562107
Episode finished after 9 timest

Episode finished after 782 timesteps, total rewards 162.67979440305524
Episode finished after 2972 timesteps, total rewards 1464.2823172597798
Episode finished after 1057 timesteps, total rewards 351.49546139485966
Episode finished after 1012 timesteps, total rewards 299.0294929957362
Episode finished after 1060 timesteps, total rewards 127.13701342844016
Episode finished after 934 timesteps, total rewards 350.3905045979366
Episode finished after 1085 timesteps, total rewards 227.75107045908314
Episode finished after 1554 timesteps, total rewards 562.6292338019906
Episode finished after 1860 timesteps, total rewards 328.39429274378597
Episode finished after 1553 timesteps, total rewards 607.6517513755065
Episode finished after 459 timesteps, total rewards 152.2203103565231
Episode finished after 1368 timesteps, total rewards 458.30913143135587
Episode finished after 1443 timesteps, total rewards 396.0678795105274
Episode finished after 1014 timesteps, total rewards 439.147793209998
Epi