# Imports

In [1]:
from dqn.dqn import QNetwork, ReplayMemory
from policies.eps_greedy import EpsilonGreedyPolicy
from train_eval.train import train

import random
import numpy as np
import torch
from torch import optim
import gym
import matplotlib.pyplot as plt

# Load the environment

In [2]:
# Build the "CartPole-v1" environment by its registered id; `env` is shared by all cells below.
env = gym.envs.make("CartPole-v1")

  result = entry_point.load(False)


# Create DQN and policy

In [3]:
seed = 42  # This is not randomly chosen
num_hidden = 128  # forwarded to QNetwork(num_hidden=...)
eps = 0.05  # epsilon handed to EpsilonGreedyPolicy

# We will seed the algorithm (before initializing QNetwork!) for reproducibility
random.seed(seed)
# Seed NumPy's global RNG as well, in case any imported helper draws from np.random
np.random.seed(seed)
torch.manual_seed(seed)
env.seed(seed)

Q_net = QNetwork(num_hidden=num_hidden)
policy = EpsilonGreedyPolicy(Q_net, eps)

# Function to run episodes and call training function (with replay memory)

In [4]:
def run_episodes(train, Q, policy, memory, env, num_episodes, batch_size, discount_factor, optimizer,
                 print_every=10, solved_threshold=195):
    """Run episodes, store every transition and call ``train`` after every step.

    Args:
        train: callable invoked once per environment step as
            ``train(Q, memory, optimizer, batch_size, discount_factor)``.
        Q: object passed through to ``train`` unchanged.
        policy: object providing ``set_epsilon(global_step)`` and ``sample_action(state)``.
        memory: buffer providing ``push(transition)``; each transition is the tuple
            ``(state, action, reward, next_state, done)``.
        env: environment whose ``reset()`` returns a state and whose ``step(action)``
            returns ``(next_state, reward, done, info)``.
        num_episodes: number of episodes to run.
        batch_size: forwarded to ``train``.
        discount_factor: forwarded to ``train``.
        optimizer: forwarded to ``train``.
        print_every: log every ``print_every``-th episode (episode 0 included);
            a falsy value disables logging.
        solved_threshold: episodes lasting at least this many steps are logged in green.

    Returns:
        list[int]: the number of steps each episode lasted, in episode order.
    """
    # ANSI SGR codes: 92 = bright green, 0 = reset. Using reset for ordinary lines keeps
    # the green of a long episode from leaking into every following log line.
    green, reset = '\033[92m', '\033[0m'

    global_steps = 0  # Count the steps (do not reset at episode start, to compute epsilon)
    episode_durations = []
    for i in range(num_episodes):
        state = env.reset()

        steps = 0
        while True:
            # Set epsilon according to number of steps
            policy.set_epsilon(global_steps)
            # Sample an action, next state, reward and done
            a = policy.sample_action(state)
            s_next, r, done, _ = env.step(a)
            # Add the transition to the memory buffer
            memory.push((state, a, r, s_next, done))
            # Perform training on the buffer (the return value is not used here)
            train(Q, memory, optimizer, batch_size, discount_factor)
            # Increase step counts and set current state
            steps += 1
            global_steps += 1
            state = s_next

            if done:
                if print_every and i % print_every == 0:
                    print("{2} Episode {0} finished after {1} steps"
                          .format(i, steps, green if steps >= solved_threshold else reset))
                episode_durations.append(steps)
                break
    return episode_durations

# Perform training

In [5]:
# Hyperparameters for the run that trains from a large replay buffer
num_episodes, batch_size = 100, 64
discount_factor, learn_rate = 0.8, 1e-3

# To switch off Replay Memory mechanism, simply set size to batch size
memory = ReplayMemory(10000)
optimizer = optim.Adam(Q_net.parameters(), lr=learn_rate)

# Launch training; the per-episode step counts are kept for later inspection
episode_durations = run_episodes(
    train=train,
    Q=Q_net,
    policy=policy,
    memory=memory,
    env=env,
    num_episodes=num_episodes,
    batch_size=batch_size,
    discount_factor=discount_factor,
    optimizer=optimizer,
)

 Episode 0 finished after 8 steps
 Episode 10 finished after 22 steps
 Episode 20 finished after 11 steps
 Episode 30 finished after 15 steps
 Episode 40 finished after 17 steps
 Episode 50 finished after 66 steps
 Episode 60 finished after 169 steps
 Episode 70 finished after 129 steps
 Episode 80 finished after 122 steps
 Episode 90 finished after 130 steps


# Function to run episodes and call training function (without replay memory)

In [6]:
def run_episodes(train, Q, policy, memory, env, num_episodes, batch_size, discount_factor, optimizer,
                 print_every=500, solved_threshold=195):
    """Run episodes, store every transition and call ``train`` once per ``batch_size`` steps.

    ``train`` is invoked only when the running step count (across all episodes) is a
    multiple of ``batch_size``. Presumably this is paired with a memory whose capacity
    equals ``batch_size`` so each update sees only fresh transitions -- that depends on
    the ``memory`` object supplied by the caller.

    Args:
        train: callable invoked as
            ``train(Q, memory, optimizer, batch_size, discount_factor)``.
        Q: object passed through to ``train`` unchanged.
        policy: object providing ``set_epsilon(global_step)`` and ``sample_action(state)``.
        memory: buffer providing ``push(transition)``; each transition is the tuple
            ``(state, action, reward, next_state, done)``.
        env: environment whose ``reset()`` returns a state and whose ``step(action)``
            returns ``(next_state, reward, done, info)``.
        num_episodes: number of episodes to run.
        batch_size: training period in steps; also forwarded to ``train``.
        discount_factor: forwarded to ``train``.
        optimizer: forwarded to ``train``.
        print_every: log every ``print_every``-th episode (episode 0 included);
            a falsy value disables logging.
        solved_threshold: episodes lasting at least this many steps are logged in green.

    Returns:
        list[int]: the number of steps each episode lasted, in episode order.
    """
    # ANSI SGR codes: 92 = bright green, 0 = reset. Using reset for ordinary lines keeps
    # the green of a long episode from leaking into every following log line.
    green, reset = '\033[92m', '\033[0m'

    global_steps = 0  # Count the steps (do not reset at episode start, to compute epsilon)
    episode_durations = []
    for i in range(num_episodes):
        state = env.reset()

        steps = 0
        while True:
            # Set epsilon according to number of steps taken before this one
            policy.set_epsilon(global_steps)
            # Increase step counts
            steps += 1
            global_steps += 1
            # Sample an action, next state, reward and done
            a = policy.sample_action(state)
            s_next, r, done, _ = env.step(a)
            # Add the transition to the memory buffer
            memory.push((state, a, r, s_next, done))
            # Train only once every `batch_size` steps (the return value is not used here)
            if global_steps % batch_size == 0:
                train(Q, memory, optimizer, batch_size, discount_factor)
            state = s_next

            if done:
                if print_every and i % print_every == 0:
                    print("{2} Episode {0} finished after {1} steps"
                          .format(i, steps, green if steps >= solved_threshold else reset))
                episode_durations.append(steps)
                break
    return episode_durations

In [7]:
# Re-seed every RNG so this run starts from the same random state as the first one.
# `seed`, `num_hidden` and `eps` are reused from the earlier setup cell.
random.seed(seed)
# Seed NumPy's global RNG as well, in case any imported helper draws from np.random
np.random.seed(seed)
torch.manual_seed(seed)
env.seed(seed)

# Fresh network and policy, so nothing learned in the previous run carries over
Q_net = QNetwork(num_hidden=num_hidden)
policy = EpsilonGreedyPolicy(Q_net, eps)

num_episodes = 10000
batch_size = 32
discount_factor = 0.8
learn_rate = 1e-3
# Memory sized to exactly one batch
memory = ReplayMemory(batch_size)

optimizer = optim.Adam(Q_net.parameters(), learn_rate)
episode_durations = run_episodes(train, Q_net, policy, memory, env, num_episodes, batch_size, discount_factor, optimizer)

 Episode 0 finished after 8 steps
 Episode 500 finished after 10 steps
 Episode 1000 finished after 9 steps
 Episode 1500 finished after 12 steps
 Episode 2000 finished after 9 steps
 Episode 2500 finished after 8 steps
 Episode 3000 finished after 9 steps
 Episode 3500 finished after 9 steps
 Episode 4000 finished after 220 steps
 Episode 4500 finished after 10 steps
 Episode 5000 finished after 174 steps
 Episode 5500 finished after 158 steps
 Episode 6000 finished after 161 steps
 Episode 6500 finished after 194 steps
 Episode 7000 finished after 170 steps
 Episode 7500 finished after 11 steps
 Episode 8000 finished after 31 steps
 Episode 8500 finished after 102 steps
 Episode 9000 finished after 13 steps
 Episode 9500 finished after 125 steps
