<a href="https://colab.research.google.com/github/JHyunjun/SNU/blob/main/DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Q-network Practice

If you run in jupyter, turn 

```
colab = False
```

In [1]:
colab = True
if colab:
    !pip install gym pyvirtualdisplay > /dev/null 2>&1
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    !apt-get update > /dev/null 2>&1
    !apt-get install cmake > /dev/null 2>&1
    !pip install --upgrade setuptools 2>&1
    !pip install ez_setup > /dev/null 2>&1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting setuptools
  Downloading setuptools-65.2.0-py3-none-any.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 8.1 MB/s 
[?25hInstalling collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 57.4.0
    Uninstalling setuptools-57.4.0:
      Successfully uninstalled setuptools-57.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.9.0 requires jedi>=0.10, which is not installed.[0m
Successfully installed setuptools-65.2.0


In [4]:
if colab:
    from google.colab import drive
    drive.mount('/content/drive')

    %cd /content/drive/MyDrive/Colab Notebooks/snu/8주_RL/강의자료/실습강의/day2/dqn
    !ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/snu/8주_RL/강의자료/실습강의/day2/dqn
buffer.py     dqn.ipynb        plot.ipynb   setup.ipynb  utils.py
day2_dqn.pdf  learning_curves  schedule.py  snapshots


# -1. Introduction to Gym environment

## -1.1 Prerequisites

# 0. Define Q-network & policy-network

In [6]:
import torch
import torch.nn as nn
from torch.nn import MSELoss
import torch.nn.functional as F
import copy
import os
import csv
import numpy as np
import torch
from torch.optim import Adam
from buffer import ReplayBuffer
from utils import save_snapshot, recover_snapshot, load_model
from schedule import LinearSchedule
import gym

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('current device =', device)

current device = cuda


In [8]:
# critic network definition
# multi-layer perceptron (with 2 hidden layers)
class Critic(nn.Module):
    def __init__(self, state_dim, num_action, hidden_size1, hidden_size2):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, num_action)


    def forward(self, state):
        # given a state s, the network returns a vector Q(s,) of length |A|
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        q = self.fc3(x)
        return q

# 1. Define DQN agent

In [13]:
class DQNAgent:
    def __init__(self, obs_dim, num_act, hidden1, hidden2):
        self.obs_dim = obs_dim
        self.num_act = num_act
        # networks
        self.critic = Critic(obs_dim, num_act, hidden1, hidden2).to(device)
                
    def act(self, state, epsilon=0.0):
        # simple implementation of \epsilon-greedy method
        # TODO : Complete epsilon-greedy action selection
        # Hint : np.randon.rand() will generate random number in [0,1]
        if np.random.rand() : 
            return np.random.randint(self.num_act)
        else :
            # greedy selection
            self.critic.eval()
            s = torch.Tensor(state).view(1, self.obs_dim).to(device)
            q = self.critic(s)
            return np.argmax(q.cpu().detach().numpy())

# 2. Implement one-step param update

In [14]:
def update(agent, replay_buf, gamma, critic_optim, target_critic, tau, batch_size):
    # agent : agent with networks to be trained
    # replay_buf : replay buf from which we sample a batch
    # actor_optim / critic_optim : torch optimizers
    # tau : parameter for soft target update
    
    agent.critic.train()

    batch = replay_buf.sample_batch(batch_size)

    # unroll batch
    with torch.no_grad():
        observations = torch.Tensor(batch['state']).to(device)
        actions = torch.tensor(batch['action'], dtype=torch.long).to(device)
        rewards = torch.Tensor(batch['reward']).to(device)
        next_observations = torch.Tensor(batch['next_state']).to(device)
        terminals = torch.Tensor(batch['done']).to(device)

        mask = 1.0 - terminals
        ### double DQN? ###
        # a_inner = torch.unsqueeze(torch.max(agent.critic(next_observations), 1)[1], 1).detach()
        # next_q_double = target_critic(observations).gather(1, a_inner)
        # next_q_double = mask * next_q_double
        ###################
        next_q = torch.unsqueeze(target_critic(next_observations).max(1)[0], 1)
        next_q = mask * next_q
        
        # TODO : Build Bellman target for Q-update
        target = rewards + gamma * next_q 

    out = agent.critic(observations).gather(1, actions)

    loss_ftn = MSELoss()
    loss = loss_ftn(out, target)

    critic_optim.zero_grad()
    loss.backward()
    critic_optim.step()
        
    # soft target update (both actor & critic network)
    for p, targ_p in zip(agent.critic.parameters(), target_critic.parameters()):
        targ_p.data.copy_((1. - tau) * targ_p + tau * p)
        
    return

In [15]:
def evaluate(agent, env, num_episodes=5):

    sum_scores = 0.
    
    for i in range(num_episodes):
        obs = env.reset()
        done = False
        score = 0.
        
        while not done:
            action = agent.act(obs)
            obs, rew, done, _ = env.step(action)
            score += rew
        sum_scores += score
    avg_score = sum_scores / num_episodes
    
    return avg_score

# 3. Combining these, we finally have...

In [16]:
def train(agent, env, gamma, 
          lr, tau,
          ep_len, num_updates, batch_size,
          init_buffer=5000, buffer_size=100000,
          start_train=2000, train_interval=50,
          eval_interval=2000, snapshot_interval=10000,
          path=None):
    
    target_critic = copy.deepcopy(agent.critic)
    
    # environment for evaluation
    test_env = copy.deepcopy(env)
    # freeze target network
    for p in target_critic.parameters():
        p.requires_grad_(False)

    critic_optim = Adam(agent.critic.parameters(), lr=lr)

    if path is not None:
        recover_snapshot(path, agent.critic,
                         target_critic, critic_optim,
                         device=device
                        )
        # load snapshot
    
    obs_dim = env.observation_space.shape[0]
    num_act = env.action_space.n
    
    replay_buf = ReplayBuffer(obs_dim, buffer_size)
    
    max_epsilon = 1.
    min_epsilon = 0.02
    exploration_schedule = LinearSchedule(begin_t=start_train,
                                          end_t=num_updates,
                                          begin_value=max_epsilon,
                                          end_value=min_epsilon
                                         )
    save_path = './snapshots/'
    os.makedirs(save_path, exist_ok=True)
    os.makedirs('./learning_curves/', exist_ok=True)
    log_file = open('./learning_curves/res.csv',
                    'w',
                    encoding='utf-8',
                    newline=''
                   )
    logger = csv.writer(log_file)
    
    # main loop
    obs = env.reset()
    done = False
    step_count = 0
    
    for t in range(num_updates + 1):
        if t < init_buffer:
            # perform random action until we collect sufficiently many samples
            # this is for exploration purpose
            action = env.action_space.sample()
        else:
            # executes epsilon-greedy action
            epsilon = exploration_schedule(t)
            action = agent.act(obs, epsilon=epsilon)
            
        next_obs, rew, done, _ = env.step(action)
        step_count += 1
        if step_count == ep_len:
            # if the next_state is not terminal but done is set to True by gym env wrapper
            done = False
            
        replay_buf.append(obs, action, next_obs, rew, done)
        obs = next_obs
        
        if done == True or step_count == ep_len:
            # reset environment if current environment reaches a terminal state 
            # or step count reaches predefined length
            obs = env.reset()
            done = False
            step_count = 0
            # score = evaluate(agent, env)
            # print('[iteration {}] evaluation score : {}'.format(t, score))
        
        if t % eval_interval == 0:
            avg_score = evaluate(agent, test_env, num_episodes=5)
            print('[iter {}] average score = {} (over 5 episodes)'.format(t, avg_score))
            evaluation_log = [t, avg_score]
            logger.writerow(evaluation_log)
        
        if t % snapshot_interval == 0:
            snapshot_path = save_path + 'iter{}_'.format(t)
            # save weight & training progress
            save_snapshot(snapshot_path, agent.critic, target_critic, critic_optim)
        
        if t > start_train and t % train_interval == 0:
            # start training after fixed number of steps
            # this may mitigate overfitting of networks to the 
            # small number of samples collected during the initial stage of training
            for _ in range(train_interval):
                update(agent,
                       replay_buf,
                       gamma,
                       critic_optim,
                       target_critic,
                       tau,
                       batch_size
                      )

    log_file.close()

# 4. Let's train our agent!

In [17]:
env = gym.make('CartPole-v1')
obs_dim = env.observation_space.shape[0]
num_act = env.action_space.n

print('observation space dim. : {} / # actions : {}'.format(obs_dim, num_act))

observation space dim. : 4 / # actions : 2


  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


In [18]:
if colab:
    import gym
    from gym.wrappers import Monitor
    import glob
    import io
    import base64
    from IPython.display import HTML
    from pyvirtualdisplay import Display
    from IPython import display as ipythondisplay

    display = Display(visible=0, size=(1400, 900))
    display.start()

    def show_video():
      mp4list = glob.glob('video/*.mp4')
      if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                </video>'''.format(encoded.decode('ascii'))))
      else: 
        print("Could not find video")
        

    def wrap_env(env):
      env = Monitor(env, './video', force=True)
      return env

    env = wrap_env(env)

ImportError: ignored

In [None]:
agent = DQNAgent(obs_dim=obs_dim, num_act=num_act, hidden1=256, hidden2=256)

In [None]:
gamma = 0.99
lr = 1e-3
tau = 1e-3
ep_len = 500
num_updates = 100000
batch_size = 128

In [None]:
train(agent, env, gamma, 
      lr, tau,
      ep_len, num_updates, batch_size,
      init_buffer=5000, buffer_size=100000,
      start_train=2000, train_interval=50,
      eval_interval=2000, snapshot_interval=2000, path=None)

# 5. Watch the trained agent!

In [None]:
env = gym.make('CartPole-v1')
if colab:
  env = wrap_env(env)
obs = env.reset()
done = False
score = 0.
load_model(agent, path='./snapshots/trained.pth.tar', device=device)
while not done:
    env.render()
    obs, rew, done, _ = env.step(agent.act(obs))
    score += rew
    
env.close()
print('score : ', score)

if colab:
    show_video()