In [1]:
import utils
from RL2 import Actor, Critic, A2CAgent
from Environment import GridWorld
import torch
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Ask the project helper for the compute device; it is later passed to the
# agent constructor and to the actor's choose_action().
device = utils.get_device()
print("device: ", device)

device:  cuda


In [3]:
# Replay-buffer capacity handed to the agent. Keeping it in one variable means
# changing it here actually changes the agent (it was previously duplicated as
# a literal in the constructor call).
buffer_size = 1000

# 5x5 grid world; POI_density is forwarded unchanged to the environment.
grid_world = GridWorld(grid_size=(5, 5), POI_density=0.2)

# Actor-critic agent with separate learning rates for the actor and critic.
agent = A2CAgent(
    learning_rate_actor=0.001,
    learning_rate_critic=0.01,
    buffer_size=buffer_size,
    n_actions=4,
    hidden_channels=128,
    device=device,
)

In [4]:
# Total number of training episodes requested from the training loop.
num_episodes = 1000
# NOTE(review): the training call visible in this notebook passes a literal
# batch size of 64 rather than this value — confirm which one is intended.
batch_size = 32
# NOTE(review): not referenced by any visible cell; presumably the agent uses
# its own discount internally — confirm.
discount_factor = 0.99

In [5]:
def train_off_policy_agent(env, num_episodes, minimal_size, batch_size):
    """Train the module-level ``agent`` on ``env`` using replay-buffer updates.

    Episodes are run in 10 progress-bar "iterations". After every environment
    step the transition is stored with ``agent.remember`` and, once
    ``agent.buffer_size()`` exceeds ``minimal_size``, a batch is drawn with
    ``agent.sample_from_buffer(batch_size)`` and passed to ``agent.update``.

    This function reads the notebook globals ``agent`` and ``device``; they
    must be defined before it is called.

    Args:
        env: Environment exposing ``reset()`` -> state and
            ``step(action)`` -> ``(next_state, reward, done, info)``.
        num_episodes: Total number of episodes to run. It no longer has to be
            a multiple of 10: any remainder is spread over the first
            iterations instead of being silently dropped.
        minimal_size: Updates start only once the buffer holds more than this
            many transitions.
        batch_size: Number of transitions requested per update.

    Returns:
        list: The undiscounted return (sum of rewards) of every episode, in
        the order the episodes were played.
    """
    num_iterations = 10
    # Split the episodes over the iterations without losing the remainder
    # (the previous int(num_episodes / 10) truncated it away).
    base_count, remainder = divmod(max(0, int(num_episodes)), num_iterations)

    return_list = []
    episodes_done = 0
    for i in range(num_iterations):
        episodes_this_iteration = base_count + (1 if i < remainder else 0)
        with tqdm(total=episodes_this_iteration, desc='Iteration %d' % i) as pbar:
            for i_episode in range(episodes_this_iteration):
                episode_return = 0
                state = env.reset()
                done = False
                while not done:
                    action = agent.actor.choose_action(state, device)
                    next_state, reward, done, _ = env.step(action)
                    agent.remember(state, action, reward, next_state, done)
                    state = next_state
                    episode_return += reward
                    # Only learn once the buffer has warmed up.
                    if agent.buffer_size() > minimal_size:
                        b_s, b_a, b_r, b_ns, b_d = agent.sample_from_buffer(batch_size)
                        transition_dict = {
                            'states': b_s,
                            'actions': b_a,
                            'next_states': b_ns,
                            'rewards': b_r,
                            'dones': b_d,
                        }
                        agent.update(transition_dict)
                return_list.append(episode_return)
                episodes_done += 1
                # Refresh the progress-bar summary every 10 episodes with the
                # mean return of the last 10 episodes.
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({
                        'episode': '%d' % episodes_done,
                        'return': '%.3f' % np.mean(return_list[-10:]),
                    })
                pbar.update(1)
    return return_list

In [6]:
def train_on_policy_agent(env, num_episodes):
    """Train the module-level ``agent`` on ``env`` with one update per episode.

    Episodes are run in 10 progress-bar "iterations". Every transition of an
    episode is collected into a dict of lists, and ``agent.update`` is called
    once with that dict after the episode terminates.

    This function reads the notebook globals ``agent`` and ``device``; they
    must be defined before it is called.

    Args:
        env: Environment exposing ``reset()`` -> state,
            ``step(action)`` -> ``(next_state, reward, done, info)`` and
            ``theoretical_Maximum()`` (shown in the progress bar).
        num_episodes: Total number of episodes to run. It no longer has to be
            a multiple of 10: any remainder is spread over the first
            iterations instead of being silently dropped.

    Returns:
        list: The undiscounted return (sum of rewards) of every episode, in
        the order the episodes were played.
    """
    num_iterations = 10
    # Split the episodes over the iterations without losing the remainder
    # (the previous int(num_episodes / 10) truncated it away).
    base_count, remainder = divmod(max(0, int(num_episodes)), num_iterations)

    return_list = []
    episodes_done = 0
    for i in range(num_iterations):
        episodes_this_iteration = base_count + (1 if i < remainder else 0)
        with tqdm(total=episodes_this_iteration, desc='Iteration %d' % i) as pbar:
            for i_episode in range(episodes_this_iteration):
                episode_return = 0
                transition_dict = {'states': [], 'actions': [], 'next_states': [], 'rewards': [], 'dones': []}
                state = env.reset()
                done = False
                while not done:
                    action = agent.actor.choose_action(state, device)
                    next_state, reward, done, _ = env.step(action)
                    transition_dict['states'].append(state)
                    transition_dict['actions'].append(action)
                    transition_dict['next_states'].append(next_state)
                    transition_dict['rewards'].append(reward)
                    transition_dict['dones'].append(done)
                    state = next_state
                    episode_return += reward
                return_list.append(episode_return)
                # Learn from the complete trajectory of this episode.
                agent.update(transition_dict)
                episodes_done += 1
                # Refresh the progress-bar summary every 10 episodes with the
                # mean return of the last 10 episodes.
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({
                        'episode': '%d' % episodes_done,
                        'return': '%.3f' % np.mean(return_list[-10:]),
                        'max': '%.3f' % env.theoretical_Maximum(),
                    })
                pbar.update(1)
    return return_list

In [7]:
# Run the replay-buffer training loop: updates begin once the buffer holds
# more than 100 transitions, and each update samples 64 of them.
return_list = train_off_policy_agent(
    grid_world,
    num_episodes,
    minimal_size=100,
    batch_size=64,
)

Iteration 0: 100%|██████████| 100/100 [03:21<00:00,  2.02s/it, episode=100, return=27.958]
Iteration 1: 100%|██████████| 100/100 [03:42<00:00,  2.22s/it, episode=200, return=27.926]
Iteration 2: 100%|██████████| 100/100 [04:03<00:00,  2.44s/it, episode=300, return=26.954]
Iteration 3: 100%|██████████| 100/100 [03:59<00:00,  2.39s/it, episode=400, return=26.956]
Iteration 4:  74%|███████▍  | 74/100 [1:31:40<32:12, 74.34s/it, episode=470, return=20.820]   


KeyboardInterrupt: 

In [None]:
def moving_average(a, window_size):
    """Smooth a 1-D sequence with a centred moving average.

    Each output element is the mean of the input values in a window centred
    on that position. In the interior the window spans ``window_size``
    values; near either end it shrinks symmetrically (1, 3, 5, ... values) so
    that it never reaches outside the data. The output therefore always has
    the same length as the input, including when the input is shorter than
    ``window_size`` or empty.

    Args:
        a: Sequence or array of numbers (flattened if multi-dimensional).
        window_size: Positive odd integer, the full window width.

    Returns:
        numpy.ndarray: Float array with one smoothed value per input value.

    Raises:
        ValueError: If ``window_size`` is not a positive odd number; a
            centred window is not defined for even widths.
    """
    if window_size < 1 or window_size % 2 == 0:
        raise ValueError(
            "window_size must be a positive odd integer, got %r" % (window_size,)
        )

    values = np.asarray(a, dtype=float).ravel()
    n = values.size

    # Prefix sums with a leading zero: sum(values[i:j]) == cs[j] - cs[i].
    cumulative_sum = np.concatenate(([0.0], np.cumsum(values)))

    # Half-width of the window at each position, limited by the distance to
    # the nearest end of the data and by the requested window size.
    idx = np.arange(n)
    half = np.minimum(np.minimum(idx, n - 1 - idx), (window_size - 1) // 2)

    window_sums = cumulative_sum[idx + half + 1] - cumulative_sum[idx - half]
    return window_sums / (2 * half + 1)

In [None]:
# Requires `return_list` from a completed (or partially completed) training
# run; if the training cell was interrupted before assigning it, this cell
# fails with NameError.
episodes_list = list(range(len(return_list)))

# Figure 1: raw return of every episode.
plt.plot(episodes_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('Return per episode')

plt.show()

# Figure 2: the same curve smoothed with a centred moving average so the
# trend is visible through the episode-to-episode noise.
smoothing_window = 9  # must be odd for a centred window
mv_return = moving_average(return_list, smoothing_window)
plt.plot(episodes_list, mv_return)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('Return per episode (moving average, window=%d)' % smoothing_window)

plt.show()

NameError: name 'return_list' is not defined