In [9]:
import torch
import numpy as np
import random
import os
from datetime import date

from backpropamine_A2C import BP_RNetwork, Standard_RNetwork
from BP_A2C.BP_A2C_agent import A2C_Agent

%matplotlib inline

In [10]:
# --- Environment / experiment configuration ---------------------------------
# Gym environment id; handed to A2C_Agent in the training cell.
env_name = 'CartPole-v0'

# Experiment size.
n_runs = 10                     # number of training runs (one training seed per run)
num_training_episodes = 3000    # passed to A2C_Agent
num_evaluation_episodes = 100   # passed to A2C_Agent
n_evaluations = 100             # not referenced by the cells visible here

# Per-episode limits, both passed to A2C_Agent.
max_steps = 200
max_reward = 200

# Prefer the first CUDA device when PyTorch can see one, otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [11]:
# --- A2C hyperparameters (all passed to A2C_Agent in the training cell) -----
# Loss-related settings.
gammaR = 0.99            # presumably the reward discount factor -- confirm in A2C_Agent
entropy_coef = 0.03
value_pred_coef = 0.1
max_grad_norm = 4.0

# NOTE(review): the saved output of the training cell reports
# "Batch size larger than 1 not implemented yet. Program will continue with
# batch size set to 1.", so this value does not appear to take effect even
# though it is written into the results directory name.
batch_size = 128

# Logging / checkpoint cadence.
print_every = 10
save_every = 50

In [12]:
# --- Optimizer (Adam) hyperparameters ---------------------------------------
l2_coef = 0            # L2 coefficient; also recorded in the results directory name
learning_rate = 1e-4   # step size given to torch.optim.Adam

In [13]:
# Passed to A2C_Agent and recorded in the results directory name.
# Presumably selects how the best model is chosen -- verify against A2C_Agent.
selection_method = 'evaluation'

In [14]:
# Load the pre-generated seed arrays from disk.
# training_seeds is indexed once per run in the training cell;
# evaluation_seeds is handed to A2C_Agent unchanged.
_seed_dir = 'rstdp_cartpole_stuff/seeds/'
training_seeds = np.load(_seed_dir + 'training_seeds.npy')
evaluation_seeds = np.load(_seed_dir + 'evaluation_seeds.npy')

In [15]:
# Create Results Directory
#
# Builds a fresh, uniquely numbered directory under BP_A2C/results whose name
# encodes the date and the hyperparameters of this experiment. `result_dir`
# is later handed to A2C_Agent.
results_root = 'BP_A2C/results'

# On a fresh checkout the results folder may not exist yet; listing it would
# raise FileNotFoundError, so create it first.
os.makedirs(results_root, exist_ok=True)

dirs = os.listdir(results_root)
results = [entry for entry in dirs if 'a2c_result' in entry]

# Take the next id from the largest id already in use, not just the number of
# existing results: if an older result directory was deleted, "count + 1"
# points at a directory that already exists and os.mkdir fails.
existing_ids = []
for entry in results:
    id_part = entry.split('a2c_result_', 1)[-1].split('_', 1)[0]
    if id_part.isdigit():
        existing_ids.append(int(id_part))
# len(results) keeps the original numbering when no id can be parsed.
result_id = max(existing_ids + [len(results)]) + 1

# Get today's date and add it to the results directory.
# Month and day are zero-padded so the tag is unambiguous (without padding,
# 2023-01-11 and 2023-11-01 would both become "2023111").
d = date.today()
date_tag = '{:04d}{:02d}{:02d}'.format(d.year, d.month, d.day)

name_template = (
    "a2c_result_{}_{}_entropycoef_{}_valuepredcoef_{}_batchsize_{}_maxsteps_{}_"
    "maxgradnorm_{}_gammaR_{}_l2coef_{}_learningrate_{}_numtrainepisodes_{}_selectionmethod_{}"
)
result_dir = results_root + '/' + name_template.format(
    result_id, date_tag, entropy_coef, value_pred_coef, batch_size, max_steps,
    max_grad_norm, gammaR, l2_coef, learning_rate, num_training_episodes,
    selection_method)

# os.mkdir (not makedirs) on purpose: an unexpected name clash should fail
# loudly instead of silently reusing another experiment's directory.
os.mkdir(result_dir)
print('Created Directory {} to store the results in'.format(result_dir))

Created Directory BP_A2C/results/a2c_result_4_20231016_entropycoef_0.03_valuepredcoef_0.1_batchsize_128_maxsteps_200_maxgradnorm_4.0_gammaR_0.99_l2coef_0_learningrate_0.0001_numtrainepisodes_3000_selectionmethod_evaluation to store the results in


In [16]:
# Train one A2C agent per training seed and collect the per-run results.
#
# The list names are kept unchanged so that any later cell that reads them
# still works; they hold the results of the A2C runs below.
smoothed_scores_dqn_all = []   # smoothed score curve of every run
dqn_completion_after = []      # `best_average_after` value of every run

for i_run in range(n_runs):
    print("Run # {}".format(i_run))
    seed = int(training_seeds[i_run])

    # Seed every global RNG that may be used during training so a run can be
    # reproduced. NumPy only accepts seeds in [0, 2**32 - 1], hence the modulo.
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed % (2 ** 32))

    # Presumably (input size, hidden size, output size, seed); 4 and 2 would
    # match CartPole's observation and action dimensions -- confirm against
    # Standard_RNetwork.
    agent_net = Standard_RNetwork(4, 64, 2, seed)

    # l2_coef is recorded in the results directory name, so it must actually be
    # applied; with l2_coef = 0 this equals Adam's default (no weight decay).
    optimizer = torch.optim.Adam(agent_net.parameters(), lr=learning_rate,
                                 weight_decay=l2_coef)
    agent = A2C_Agent(env_name, seed, agent_net, entropy_coef, value_pred_coef, gammaR,
                      max_grad_norm, max_steps, batch_size, num_training_episodes, optimizer, print_every,
                      save_every, i_run, result_dir, selection_method, num_evaluation_episodes, evaluation_seeds, max_reward)

    smoothed_scores, scores, best_average, best_average_after = agent.train_agent()

    # Keep each run's results; otherwise only the last run's values survive
    # the loop and the two lists above stay empty.
    smoothed_scores_dqn_all.append(smoothed_scores)
    dqn_completion_after.append(best_average_after)

Run # 0
Batch size larger than 1 not implemented yet. Program will continue with batch size set to 1.
Episode 10	Average evaluation: 40.3
Episode 20	Average evaluation: 11.95
Episode 30	Average evaluation: 9.42
Episode 40	Average evaluation: 9.7
Episode 50	Average evaluation: 15.6
Episode 60	Average evaluation: 12.54
Episode 70	Average evaluation: 10.85
Episode 80	Average evaluation: 14.05
Episode 90	Average evaluation: 17.38
Episode 100	Average evaluation: 18.66
Episode 110	Average evaluation: 19.85
Episode 120	Average evaluation: 26.94
Episode 130	Average evaluation: 34.42
Episode 140	Average evaluation: 31.75
Episode 150	Average evaluation: 31.62
Episode 160	Average evaluation: 31.86
Episode 170	Average evaluation: 32.83
Episode 180	Average evaluation: 61.37
Episode 190	Average evaluation: 73.44
Episode 200	Average evaluation: 71.58
Episode 210	Average evaluation: 70.3
Episode 220	Average evaluation: 78.12
Episode 230	Average evaluation: 82.82
Episode 240	Average evaluation: 48.92
E