In [1]:
"""
This code includes modifications from the spinningup repository:
https://github.com/openai/spinningup

Project Title: Vanilla Policy Gradient (VPG) for CartPole-v0

File Name: VPG_cartpole_01_train_save_load.py

Author: Maciej_M

Date: 2023-04-07

Project Description:
This project is an implementation of the Vanilla Policy Gradient (VPG) algorithm to solve the CartPole-v0 environment from OpenAI Gym. The VPG algorithm is a foundational reinforcement learning method that uses policy gradients to update an agent's policy directly. The agent is represented by a neural network with a customizable number of hidden layers and hidden units per layer.

Main Components:

1.mlp function: Creates a multilayer perceptron (MLP) with specified layer sizes and activation functions.
2.train function: Trains an agent in the CartPole-v0 environment using the VPG algorithm.
3.save_agent function: Saves the trained agent to a file.
4.load_agent and load_latest_agent functions: Load a saved agent from a file.
5.evaluate_agent function: Evaluates the performance of a trained agent in the CartPole-v0 environment.
6.save_plot function: Plots episode lengths (the mean episode length per epoch during training, or the per-episode lengths of an evaluation run) and saves the figure to a file.
7.Training loop in the if __name__ == '__main__' block: Trains multiple agents with randomly generated hidden layer sizes and numbers of hidden layers, and saves them.
8.parse_config, create_logits_net, get_agent_results, and save_best_configs_to_csv functions: These functions are used to test the saved agents, plot their evaluation results, and save the best-performing configurations to a file.

Results and best-performing agents can be viewed in results.csv and best_configs.txt.

Note: This is a copy of a Jupyter Notebook. 
"""

"\nThis code includes modifications from the spinningup repository:\nhttps://github.com/openai/spinningup\n\nProject Title: Vanilla Policy Gradient (VPG) for CartPole-v0\n\nFile Name: VPG_cartpole_01_train_save_load.py\n\nAuthor: Maciej_M\n\nDate: 2023-04-07\n\nProject Description:\nThis project is an implementation of the Vanilla Policy Gradient (VPG) algorithm to solve the CartPole-v0 environment from OpenAI Gym. The VPG algorithm is a foundational reinforcement learning method that uses policy gradients to update an agent's policy directly. The agent is represented by a neural network with a customizable number of hidden layers and hidden units per layer.\n\nMain Components:\n\n1.mlp function: Creates a multilayer perceptron (MLP) with specified layer sizes and activation functions.\n2.train function: Trains an agent in the CartPole-v0 environment using the VPG algorithm.\n3.save_agent function: Saves the trained agent to a file.\n4.load_agent and load_latest_agent functions: Load a

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np
import os
import time
import imageio
import gym
from gym.spaces import Discrete, Box
from pyvirtualdisplay import Display


In [3]:
# `sizes` lists the width (number of units) of every layer, from the input layer to the output layer.
# Each pair of consecutive layers is connected by a linear map followed by `activation`.
# The last linear map is followed by `output_activation` instead, which defaults to the identity.

def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
    """Assemble a fully connected feed-forward network.

    Args:
        sizes: Layer widths from input to output, e.g. ``[4, 32, 2]``.
        activation: Module class instantiated after every linear layer
            except the last one.
        output_activation: Module class instantiated after the last linear
            layer.

    Returns:
        An ``nn.Sequential`` alternating ``nn.Linear`` layers and activation
        modules (empty when ``sizes`` has fewer than two entries).
    """
    final_idx = len(sizes) - 2  # index of the last linear layer
    modules = []
    for idx, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
        modules.append(nn.Linear(n_in, n_out))
        modules.append(output_activation() if idx == final_idx else activation())
    return nn.Sequential(*modules)

In [4]:
import matplotlib.pyplot as plt
import os

def save_plot(ep_lengths, filename):
    """Plot a sequence of episode lengths and write the figure to ``filename``.

    Args:
        ep_lengths: Sequence of numbers plotted against their index.
        filename: Destination path handed to ``Figure.savefig``.

    The figure is always closed, even if saving fails, so repeated calls
    (one per trained or evaluated agent) do not accumulate open figures.
    """
    # Work on an explicit figure instead of pyplot's implicit "current figure"
    # so that exactly this figure is saved and closed.
    fig = plt.figure()
    try:
        ax = fig.add_subplot()
        ax.plot(ep_lengths)
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Episode Length')
        ax.set_title('Episode Length over Time')
        fig.savefig(filename)
    finally:
        plt.close(fig)

In [5]:
def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, hidden_layers_number=1,
          epochs=50, batch_size=5000, render=False):
    """Train a categorical MLP policy with the simplest policy-gradient update.

    Every epoch collects a batch of complete episodes with the current policy
    and takes one Adam step on ``-(log pi(a|s) * R(tau)).mean()``, where each
    step is weighted by the total return of the episode it belongs to.

    Args:
        env_name: Environment id passed to ``gym.make``. The environment must
            have a ``Box`` observation space and a ``Discrete`` action space.
        hidden_sizes: Sequence of hidden-layer widths. The whole sequence is
            repeated ``hidden_layers_number`` times to form the hidden stack.
            The default list is only read, never mutated.
        lr: Learning rate of the Adam optimizer.
        hidden_layers_number: How many times ``hidden_sizes`` is repeated.
        epochs: Number of policy updates (one per collected batch).
        batch_size: Step budget per epoch; collection stops at the first
            episode end after more than this many steps were gathered.
        render: If True, render the first episode of every epoch.

    Returns:
        Tuple ``(logits_net, ep_lengths_over_time)``: the trained network and
        a list holding the mean episode length of each epoch.

    Raises:
        AssertionError: If the observation space is not ``Box`` or the action
            space is not ``Discrete``.
    """
    # make environment, check spaces, get obs / act dims
    env = gym.make(env_name)
    try:
        assert isinstance(env.observation_space, Box), \
            "This example only works for envs with continuous state spaces."
        assert isinstance(env.action_space, Discrete), \
            "This example only works for envs with discrete action spaces."

        obs_dim = env.observation_space.shape[0]
        n_acts = env.action_space.n

        # make core of policy network; list() lets callers pass a tuple as well
        logits_net = mlp(sizes=[obs_dim] + list(hidden_sizes) * hidden_layers_number + [n_acts])

        # make function to compute action distribution
        def get_policy(obs):
            logits = logits_net(obs)
            return Categorical(logits=logits)

        # make action selection function (outputs int actions, sampled from policy)
        def get_action(obs):
            return get_policy(obs).sample().item()

        # make loss function whose gradient, for the right data, is policy gradient
        def compute_loss(obs, act, weights):
            logp = get_policy(obs).log_prob(act)
            return -(logp * weights).mean()

        # make optimizer
        optimizer = torch.optim.Adam(logits_net.parameters(), lr=lr)

        # for training policy
        def train_one_epoch():
            # make some empty lists for logging.
            batch_obs = []          # for observations
            batch_acts = []         # for actions
            batch_weights = []      # for R(tau) weighting in policy gradient
            batch_rets = []         # for measuring episode returns
            batch_lens = []         # for measuring episode lengths

            # reset episode-specific variables
            obs = env.reset()       # first obs comes from starting distribution
            done = False            # signal from environment that episode is over
            ep_rews = []            # list for rewards accrued throughout ep

            # render first episode of each epoch
            finished_rendering_this_epoch = False

            # collect experience by acting in the environment with current policy
            while True:

                # rendering
                if (not finished_rendering_this_epoch) and render:
                    env.render()

                # save obs
                batch_obs.append(obs.copy())

                # act in the environment
                act = get_action(torch.as_tensor(obs, dtype=torch.float32))
                obs, rew, done, _ = env.step(act)

                # save action, reward
                batch_acts.append(act)
                ep_rews.append(rew)

                if done:
                    # if episode is over, record info about episode
                    ep_ret, ep_len = sum(ep_rews), len(ep_rews)
                    batch_rets.append(ep_ret)
                    batch_lens.append(ep_len)

                    # the weight for each logprob(a|s) is R(tau)
                    batch_weights += [ep_ret] * ep_len

                    # reset episode-specific variables
                    obs, done, ep_rews = env.reset(), False, []

                    # won't render again this epoch
                    finished_rendering_this_epoch = True

                    # end experience loop if we have enough of it
                    if len(batch_obs) > batch_size:
                        break

            # take a single policy gradient update step
            optimizer.zero_grad()
            # Stack the observations into one ndarray first: building a tensor
            # directly from a list of ndarrays is slow and makes PyTorch warn.
            obs_array = np.asarray(batch_obs, dtype=np.float32)
            batch_loss = compute_loss(obs=torch.as_tensor(obs_array, dtype=torch.float32),
                                      act=torch.as_tensor(batch_acts, dtype=torch.int32),
                                      weights=torch.as_tensor(batch_weights, dtype=torch.float32)
                                      )
            batch_loss.backward()
            optimizer.step()
            return batch_loss, batch_rets, batch_lens

        # training loop
        ep_lengths_over_time = []
        for i in range(epochs):
            batch_loss, batch_rets, batch_lens = train_one_epoch()
            mean_len = np.mean(batch_lens)
            print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
                    (i, batch_loss, np.mean(batch_rets), mean_len))
            ep_lengths_over_time.append(mean_len)

        return logits_net, ep_lengths_over_time
    finally:
        # Release the environment (and any render window) even when training
        # is interrupted or fails.
        env.close()
    

In [6]:
def save_agent(agent, hidden_layers_number, hidden_size, ep_lengths_over_time=None, folder='trained_agents'):
    """Save an agent's ``state_dict`` under an architecture-specific folder.

    The checkpoint is written to
    ``<folder>/VPG_Pytorch_l<hidden_layers_number>_s<hidden_size>/agent_<timestamp>.pt``.

    Args:
        agent: Module whose ``state_dict()`` is saved.
        hidden_layers_number: Number of hidden layers, encoded in the folder name.
        hidden_size: Hidden-layer width, encoded in the folder name.
        ep_lengths_over_time: Optional sequence of episode lengths. When given,
            a plot is saved next to the checkpoint as
            ``agent_<timestamp>_training_plot.png``.
        folder: Root directory for all saved agents.

    Returns:
        Path of the written checkpoint file.
    """
    folder_name = f"VPG_Pytorch_l{hidden_layers_number}_s{hidden_size}"
    save_folder = os.path.join(folder, folder_name)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_folder, exist_ok=True)

    date_str = time.strftime("%Y%m%d-%H%M%S")
    filename = f'agent_{date_str}.pt'

    filepath = os.path.join(save_folder, filename)
    torch.save(agent.state_dict(), filepath)
    print(f"Agent saved to {filepath}")

    if ep_lengths_over_time is not None:
        # Swap only the file extension; str.replace would also rewrite any
        # '.pt' that happens to occur in a directory name.
        plot_filepath = os.path.splitext(filepath)[0] + '_training_plot.png'
        save_plot(ep_lengths_over_time, plot_filepath)

    return filepath


In [7]:
def load_agent(logits_net, filepath):
    """Load saved weights into ``logits_net`` and switch it to eval mode.

    Args:
        logits_net: Module whose architecture matches the saved ``state_dict``.
        filepath: Path of a checkpoint written with ``torch.save(state_dict)``.

    Returns:
        The same ``logits_net`` instance, with weights loaded.
    """
    # map_location='cpu' lets checkpoints saved on a GPU be loaded on machines
    # without one; load_state_dict then copies the values onto whatever device
    # the network's parameters live on.
    # NOTE: torch.load is pickle-based - only load checkpoints you trust.
    state_dict = torch.load(filepath, map_location='cpu')
    logits_net.load_state_dict(state_dict)
    logits_net.eval()
    print(f"Loaded agent from {filepath}")
    return logits_net


In [8]:
import os
import glob

def load_latest_agent(logits_net, folder='trained_agents'):
    """Load the most recently created ``agent_*.pt`` checkpoint found under ``folder``.

    The search covers ``folder`` itself and all of its subfolders, and
    "latest" is decided by the file's ctime (``os.path.getctime``), not by
    the timestamp embedded in the file name.

    Args:
        logits_net: Module whose architecture matches the checkpoint.
        folder: Directory to search.

    Returns:
        The same ``logits_net`` instance, with weights loaded and in eval mode.

    Raises:
        FileNotFoundError: If no ``agent_*.pt`` file exists under ``folder``.
    """
    # '**' with recursive=True matches zero or more directory levels, so files
    # directly in `folder` are still found. glob.escape protects folder names
    # that contain glob metacharacters.
    pattern = os.path.join(glob.escape(folder), '**', 'agent_*.pt')
    list_of_files = glob.glob(pattern, recursive=True)
    if not list_of_files:
        raise FileNotFoundError(f"No 'agent_*.pt' checkpoint found under '{folder}'")
    latest_file = max(list_of_files, key=os.path.getctime)

    # Load the agent (CPU mapping keeps GPU-saved checkpoints loadable anywhere).
    # NOTE: torch.load is pickle-based - only load checkpoints you trust.
    logits_net.load_state_dict(torch.load(latest_file, map_location='cpu'))
    logits_net.eval()
    print(f"Loaded agent from {latest_file}")
    return logits_net

In [9]:
def evaluate_agent(env, logits_net, num_episodes=50, verbose=True, render=True):
    """Run a policy for several episodes and report returns and lengths.

    Actions are sampled from ``Categorical(logits=logits_net(obs))``. The
    environment is closed before returning, also when an episode raises, so
    callers should pass a fresh environment for every call.

    Args:
        env: Environment with ``reset()`` returning an observation and
            ``step(action)`` returning ``(obs, reward, done, info)``.
        logits_net: Module mapping a batch of observations to action logits.
        num_episodes: Number of episodes to run.
        verbose: If True, print the average return and episode length at the
            end. The per-episode line is printed regardless of this flag.
        render: If True, call ``env.render()`` before every step.

    Returns:
        Tuple ``(avg_reward, avg_ep_len, ep_lengths)``, where ``ep_lengths``
        is the list of per-episode step counts.
    """
    def get_action(obs):
        # Pure inference: no_grad avoids building an autograd graph per step.
        with torch.no_grad():
            obs_batch = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)
            return Categorical(logits=logits_net(obs_batch)).sample().item()

    total_rewards = []
    ep_lengths = []

    try:
        for episode in range(num_episodes):
            state = env.reset()
            done = False
            total_reward = 0
            ep_length = 0
            while not done:
                if render:
                    env.render()
                action = get_action(state)
                state, reward, done, _ = env.step(action)
                total_reward += reward
                ep_length += 1

            total_rewards.append(total_reward)
            ep_lengths.append(ep_length)
            print(f"Episode {episode + 1}: Total Reward = {total_reward}, Episode Length = {ep_length}")

        avg_reward = np.mean(total_rewards)
        avg_ep_len = np.mean(ep_lengths)
        if verbose:
            print(f"Average Return: {avg_reward:.3f} \t Average Episode Length: {avg_ep_len:.3f}")
    finally:
        # Always release the environment, even if an episode failed midway.
        env.close()
    return avg_reward, avg_ep_len, ep_lengths

In [10]:
class Args:
    """Default settings for a training run; attributes are overridden per experiment."""

    # Environment
    env_name = 'CartPole-v0'
    render = False

    # Optimisation
    lr = 0.01

    # Policy network: `hidden_sizes` is repeated `hidden_layers_number` times
    hidden_layers_number = 1
    hidden_sizes = [32]

In [11]:
if __name__ == '__main__':
    args = Args()
    print('\nUsing simplest formulation of policy gradient.\n')

    # Random architecture search: 10 trials, each with a hidden width drawn
    # from [22, 42) and a depth drawn from {1, 2, 3}.
    for _ in range(10):
        # Width is drawn before depth.
        hidden_size = np.random.randint(32 - 10, 32 + 10)
        hidden_layers_number = np.random.randint(1, 4)
        args.hidden_sizes, args.hidden_layers_number = [hidden_size], hidden_layers_number

        print(f"Training with {hidden_layers_number} hidden layers and hidden size {hidden_size}")

        # Train one agent with the sampled architecture
        logits_net, ep_lengths_over_time = train(
            env_name=args.env_name,
            hidden_sizes=args.hidden_sizes,
            lr=args.lr,
            hidden_layers_number=args.hidden_layers_number,
            render=args.render,
        )

        # Persist the weights together with the training-curve plot
        save_agent(logits_net, args.hidden_layers_number, hidden_size, ep_lengths_over_time)



Using simplest formulation of policy gradient.

Training with 3 hidden layers and hidden size 26




epoch:   0 	 loss: 18.828 	 return: 21.616 	 ep_len: 21.616
epoch:   1 	 loss: 22.225 	 return: 23.947 	 ep_len: 23.947
epoch:   2 	 loss: 27.247 	 return: 28.101 	 ep_len: 28.101
epoch:   3 	 loss: 26.574 	 return: 29.976 	 ep_len: 29.976
epoch:   4 	 loss: 36.243 	 return: 40.772 	 ep_len: 40.772
epoch:   5 	 loss: 38.209 	 return: 48.248 	 ep_len: 48.248
epoch:   6 	 loss: 41.939 	 return: 51.255 	 ep_len: 51.255
epoch:   7 	 loss: 39.079 	 return: 53.737 	 ep_len: 53.737
epoch:   8 	 loss: 45.163 	 return: 67.067 	 ep_len: 67.067
epoch:   9 	 loss: 42.228 	 return: 66.867 	 ep_len: 66.867
epoch:  10 	 loss: 36.571 	 return: 62.837 	 ep_len: 62.837
epoch:  11 	 loss: 35.107 	 return: 64.333 	 ep_len: 64.333
epoch:  12 	 loss: 32.143 	 return: 62.543 	 ep_len: 62.543
epoch:  13 	 loss: 33.283 	 return: 66.117 	 ep_len: 66.117
epoch:  14 	 loss: 29.495 	 return: 63.937 	 ep_len: 63.937
epoch:  15 	 loss: 30.963 	 return: 67.067 	 ep_len: 67.067
epoch:  16 	 loss: 25.647 	 return: 62.0

epoch:  31 	 loss: 76.297 	 return: 152.500 	 ep_len: 152.500
epoch:  32 	 loss: 84.229 	 return: 195.577 	 ep_len: 195.577
epoch:  33 	 loss: 79.466 	 return: 197.269 	 ep_len: 197.269
epoch:  34 	 loss: 76.030 	 return: 200.000 	 ep_len: 200.000
epoch:  35 	 loss: 71.172 	 return: 200.000 	 ep_len: 200.000
epoch:  36 	 loss: 68.637 	 return: 200.000 	 ep_len: 200.000
epoch:  37 	 loss: 65.365 	 return: 200.000 	 ep_len: 200.000
epoch:  38 	 loss: 62.729 	 return: 200.000 	 ep_len: 200.000
epoch:  39 	 loss: 57.279 	 return: 200.000 	 ep_len: 200.000
epoch:  40 	 loss: 55.274 	 return: 200.000 	 ep_len: 200.000
epoch:  41 	 loss: 57.835 	 return: 200.000 	 ep_len: 200.000
epoch:  42 	 loss: 52.464 	 return: 195.231 	 ep_len: 195.231
epoch:  43 	 loss: 48.062 	 return: 198.808 	 ep_len: 198.808
epoch:  44 	 loss: 45.662 	 return: 199.077 	 ep_len: 199.077
epoch:  45 	 loss: 43.374 	 return: 188.185 	 ep_len: 188.185
epoch:  46 	 loss: 42.887 	 return: 187.852 	 ep_len: 187.852
epoch:  

epoch:  10 	 loss: 38.471 	 return: 50.020 	 ep_len: 50.020
epoch:  11 	 loss: 41.569 	 return: 55.198 	 ep_len: 55.198
epoch:  12 	 loss: 42.989 	 return: 56.640 	 ep_len: 56.640
epoch:  13 	 loss: 41.270 	 return: 59.988 	 ep_len: 59.988
epoch:  14 	 loss: 40.997 	 return: 59.153 	 ep_len: 59.153
epoch:  15 	 loss: 38.869 	 return: 56.056 	 ep_len: 56.056
epoch:  16 	 loss: 43.661 	 return: 62.675 	 ep_len: 62.675
epoch:  17 	 loss: 43.333 	 return: 65.455 	 ep_len: 65.455
epoch:  18 	 loss: 41.171 	 return: 62.000 	 ep_len: 62.000
epoch:  19 	 loss: 45.493 	 return: 67.040 	 ep_len: 67.040
epoch:  20 	 loss: 46.646 	 return: 71.457 	 ep_len: 71.457
epoch:  21 	 loss: 41.356 	 return: 62.825 	 ep_len: 62.825
epoch:  22 	 loss: 39.656 	 return: 65.192 	 ep_len: 65.192
epoch:  23 	 loss: 40.536 	 return: 61.889 	 ep_len: 61.889
epoch:  24 	 loss: 45.409 	 return: 66.693 	 ep_len: 66.693
epoch:  25 	 loss: 48.294 	 return: 74.776 	 ep_len: 74.776
epoch:  26 	 loss: 48.578 	 return: 75.4

epoch:  41 	 loss: 18.974 	 return: 80.871 	 ep_len: 80.871
epoch:  42 	 loss: 18.315 	 return: 80.677 	 ep_len: 80.677
epoch:  43 	 loss: 20.335 	 return: 88.895 	 ep_len: 88.895
epoch:  44 	 loss: 20.269 	 return: 83.917 	 ep_len: 83.917
epoch:  45 	 loss: 20.444 	 return: 89.446 	 ep_len: 89.446
epoch:  46 	 loss: 22.279 	 return: 93.074 	 ep_len: 93.074
epoch:  47 	 loss: 23.492 	 return: 103.429 	 ep_len: 103.429
epoch:  48 	 loss: 23.777 	 return: 100.420 	 ep_len: 100.420
epoch:  49 	 loss: 27.704 	 return: 112.733 	 ep_len: 112.733
Agent saved to trained_agents/VPG_Pytorch_l2_s36/agent_20230407-181927.pt
Training with 1 hidden layers and hidden size 32
epoch:   0 	 loss: 16.081 	 return: 18.879 	 ep_len: 18.879
epoch:   1 	 loss: 17.625 	 return: 20.541 	 ep_len: 20.541
epoch:   2 	 loss: 21.421 	 return: 24.539 	 ep_len: 24.539
epoch:   3 	 loss: 28.586 	 return: 31.087 	 ep_len: 31.087
epoch:   4 	 loss: 29.342 	 return: 32.908 	 ep_len: 32.908
epoch:   5 	 loss: 28.118 	 ret

In [12]:
import re

def parse_config(config_str):
    """Extract the network architecture from a config / folder name.

    The string must contain ``l<layers>_s<size>``, e.g.
    ``"VPG_Pytorch_l3_s39"`` yields ``(39, 3)``.

    Args:
        config_str: Text containing the ``l<N>_s<M>`` pattern.

    Returns:
        Tuple ``(hidden_size, num_hidden_layers)`` - note the order: size
        first, layer count second.

    Raises:
        ValueError: If ``config_str`` does not contain the pattern.
    """
    match = re.search(r'l(\d+)_s(\d+)', config_str)
    if match is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(
            f"Cannot parse architecture from {config_str!r}: expected 'l<layers>_s<size>'"
        )
    num_hidden_layers = int(match.group(1))
    hidden_size = int(match.group(2))
    return hidden_size, num_hidden_layers

In [13]:
def create_logits_net(obs_dim, n_acts, hidden_size, num_hidden_layers):
    """Build the policy MLP: ``obs_dim`` inputs, ``num_hidden_layers`` hidden
    layers of width ``hidden_size``, and ``n_acts`` output logits."""
    hidden_stack = [hidden_size for _ in range(num_hidden_layers)]
    return mlp([obs_dim, *hidden_stack, n_acts])

In [14]:
import os
import torch
import pandas as pd
import numpy as np

def get_agent_results(folder='trained_agents'):
    """Evaluate every saved ``.pt`` agent under ``folder`` on CartPole-v0.

    The architecture of each agent is parsed from the name of the directory
    that contains the checkpoint. Each agent is run for 5 episodes and a plot
    of its episode lengths is written to ``evaluation_plots/``.

    Args:
        folder: Root directory that is walked recursively.

    Returns:
        List of dicts with keys ``'config'`` (checkpoint directory name),
        ``'avg_return'`` and ``'avg_ep_len'`` - one entry per evaluated
        checkpoint. Checkpoints in directories whose name carries no
        architecture are skipped with a message.
    """
    env_name = 'CartPole-v0'
    plots_dir = 'evaluation_plots'
    results = []
    for root, dirs, files in os.walk(folder):
        dirs.sort()  # deterministic traversal -> reproducible row order
        checkpoints = sorted(f for f in files if f.endswith('.pt'))
        if not checkpoints:
            continue

        config = os.path.basename(root)
        try:
            hidden_size, num_hidden_layers = parse_config(config)
        except (AttributeError, ValueError):
            # Directory name has no 'l<N>_s<M>' part: skip its checkpoints
            # rather than aborting the whole scan.
            print(f"Skipping {len(checkpoints)} checkpoint(s) in '{root}': "
                  f"cannot parse architecture from '{config}'")
            continue

        for file in checkpoints:
            filepath = os.path.join(root, file)
            env = gym.make(env_name)
            try:
                obs_dim = env.observation_space.shape[0]
                n_acts = env.action_space.n
                logits_net = create_logits_net(obs_dim, n_acts, hidden_size, num_hidden_layers)
                agent = load_agent(logits_net, filepath)
            except Exception:
                # evaluate_agent (which closes the env) is never reached here.
                env.close()
                raise

            os.makedirs(plots_dir, exist_ok=True)
            avg_return, avg_ep_len, ep_lengths = evaluate_agent(env, agent, num_episodes=5, verbose=False, render=False)
            date_str = time.strftime("%Y%m%d-%H%M%S")
            # Swap only the extension of the checkpoint name.
            plot_filename = os.path.splitext(file)[0] + f'_training_plot_{date_str}.png'
            plot_filepath = os.path.join(plots_dir, plot_filename)
            save_plot(ep_lengths, plot_filepath)
            results.append({'config': config, 'avg_return': avg_return, 'avg_ep_len': avg_ep_len})
    return results

In [15]:
def save_best_configs_to_csv(results, filename='results.csv', best_configs_filename='best_configs.txt'):
    """Write all results to CSV and the best-scoring configs to a text file.

    Args:
        results: List of dicts with keys ``'config'``, ``'avg_return'`` and
            ``'avg_ep_len'``.
        filename: CSV file receiving every result row.
        best_configs_filename: Text file receiving one best config per line.

    Returns:
        Array of the distinct config names whose ``avg_return`` equals the
        maximum, in order of first appearance. Empty when ``results`` is empty.
    """
    df = pd.DataFrame(results)

    if df.empty:
        # Nothing was evaluated: still produce well-formed (empty) outputs
        # instead of failing on the missing 'avg_return' column.
        df = pd.DataFrame(columns=['config', 'avg_return', 'avg_ep_len'])
        df.to_csv(filename, index=False)
        with open(best_configs_filename, 'w'):
            pass
        return df['config'].values

    df.to_csv(filename, index=False)

    max_avg_return = df['avg_return'].max()
    # A config with several tied checkpoints should be listed only once.
    best_configs = df.loc[df['avg_return'] == max_avg_return, 'config'].drop_duplicates().values

    with open(best_configs_filename, 'w') as f:
        for config in best_configs:
            f.write(config + '\n')

    return best_configs

In [16]:
# Evaluate every saved agent, persist the scores, then replay the best configs.
results = get_agent_results()
best_configs = save_best_configs_to_csv(results)
env_name = 'CartPole-v0'

# A short-lived environment is enough to read the observation / action sizes.
env = gym.make(env_name)
obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.n
env.close()

for best_config in best_configs:
    # Locate the checkpoints of this configuration.
    # NOTE: the most recently created checkpoint in the folder is replayed,
    # which is not necessarily the file that achieved the best score.
    best_agent_folder = os.path.join('trained_agents', best_config)
    list_of_files = glob.glob(os.path.join(best_agent_folder, 'agent_*.pt'))
    if not list_of_files:
        print(f"No 'agent_*.pt' checkpoint found in {best_agent_folder}, skipping")
        continue
    best_agent_path = max(list_of_files, key=os.path.getctime)

    # Get hidden_size and num_hidden_layers from best_config
    hidden_size, num_hidden_layers = parse_config(best_config)

    # Create logits_net for best agent
    best_logits_net = create_logits_net(obs_dim, n_acts, hidden_size, num_hidden_layers)

    # Load agent
    best_agent = load_agent(best_logits_net, best_agent_path)

    # Evaluate the best agent on a fresh environment: evaluate_agent closes
    # the environment it is given, so it must not be reused across iterations.
    print(f"Evaluating agent with config: {best_config}")
    env = gym.make(env_name)
    evaluate_agent(env, best_agent, num_episodes=5, verbose=True, render=True)



Loaded agent from trained_agents/VPG_Pytorch_l1_s32/agent_20230407-182248.pt
Episode 1: Total Reward = 167.0, Episode Length = 167
Episode 2: Total Reward = 166.0, Episode Length = 166
Episode 3: Total Reward = 125.0, Episode Length = 125
Episode 4: Total Reward = 128.0, Episode Length = 128
Episode 5: Total Reward = 200.0, Episode Length = 200
Loaded agent from trained_agents/VPG_Pytorch_l3_s22/agent_20230407-182735.pt
Episode 1: Total Reward = 148.0, Episode Length = 148
Episode 2: Total Reward = 109.0, Episode Length = 109
Episode 3: Total Reward = 195.0, Episode Length = 195
Episode 4: Total Reward = 157.0, Episode Length = 157
Episode 5: Total Reward = 83.0, Episode Length = 83
Loaded agent from trained_agents/VPG_Pytorch_l3_s27/agent_20230407-175832.pt
Episode 1: Total Reward = 194.0, Episode Length = 194
Episode 2: Total Reward = 196.0, Episode Length = 196
Episode 3: Total Reward = 84.0, Episode Length = 84
Episode 4: Total Reward = 134.0, Episode Length = 134
Episode 5: Total 



Loaded agent from trained_agents/VPG_Pytorch_l3_s39/agent_20230407-175333.pt
Evaluating agent with config: VPG_Pytorch_l3_s39
Episode 1: Total Reward = 200.0, Episode Length = 200
Episode 2: Total Reward = 200.0, Episode Length = 200
Episode 3: Total Reward = 200.0, Episode Length = 200
Episode 4: Total Reward = 200.0, Episode Length = 200
Episode 5: Total Reward = 200.0, Episode Length = 200
Average Return: 200.000 	 Average Episode Length: 200.000
Loaded agent from trained_agents/VPG_Pytorch_l1_s35/agent_20230407-181100.pt
Evaluating agent with config: VPG_Pytorch_l1_s35
Episode 1: Total Reward = 200.0, Episode Length = 200
Episode 2: Total Reward = 200.0, Episode Length = 200
Episode 3: Total Reward = 200.0, Episode Length = 200
Episode 4: Total Reward = 200.0, Episode Length = 200
Episode 5: Total Reward = 200.0, Episode Length = 200
Average Return: 200.000 	 Average Episode Length: 200.000
