In [1]:
import copy
import os
import site
from datetime import date

import numpy as np
import torch
import torch.optim as optim
import gym
import matplotlib.pyplot as plt

from BP_A2C.BP_A2C_agent import A2C_Agent, evaluate_BP_agent

# '../src/' must be on the path before the network module below can be imported.
site.addsitedir('../src/')

from backpropamine_A2C import BP_RNetwork

%matplotlib inline

In [2]:
# Run on the first CUDA device when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# torch.float32 is the same dtype object as torch.float.
dtype = torch.float32

In [3]:
# Environment specific parameters
env_name = 'CartPole-v0'
# Both values are handed to A2C_Agent further down; max_reward also goes to fine_tune_agent.
max_reward = max_steps = 200

# Episode budgets: n_evaluations is passed to evaluate_BP_agent / fine_tune_agent,
# num_fine_tuning_episodes to fine_tune_agent.
n_evaluations = 100
num_fine_tuning_episodes = 250
# NOTE(review): num_episodes is not referenced by any cell visible in this notebook.
num_episodes = 1000

In [4]:
# Episode counts forwarded to the A2C_Agent constructor (training, evaluation).
num_training_episodes, num_evaluation_episodes = 3000, 100

In [5]:
# A2C hyperparameters
# Every value in this cell is passed positionally to the A2C_Agent constructor.

# Loss-term coefficients
entropy_coef, value_pred_coef = 0.03, 0.1

# Discounting and gradient clipping
gammaR, max_grad_norm = 0.99, 4.0

# Batching, logging and checkpointing cadence
batch_size = 1
print_every, save_every = 10, 50

# Model-selection mode string understood by A2C_Agent
selection_method = "evaluation"

In [6]:
# Adam hyperparameters
# learning_rate is used as `lr` when the Adam optimizer is built in the fine-tuning cell.
# NOTE(review): l2_coef is not referenced by any cell visible in this notebook.
learning_rate, l2_coef = 1e-4, 0

In [7]:
# Seed arrays stored on disk: one set for evaluation, one for fine-tuning episodes.
seeds_dir = 'rstdp_cartpole_stuff/seeds'
evaluation_seeds = np.load(f'{seeds_dir}/evaluation_seeds.npy')
fine_tuning_seeds = np.load(f'{seeds_dir}/rstdp_training_seeds.npy')

In [8]:
# Create Results Directory
# Each run of this notebook gets its own numbered, date-stamped folder under
# BP_A2C/fine_tuning_results.
results_root = 'BP_A2C/fine_tuning_results'
# On a fresh checkout the parent folder may be missing, and os.listdir would raise.
os.makedirs(results_root, exist_ok=True)

# Number the new run one past the count of existing result folders
# (an empty parent therefore yields id 1).
dirs = os.listdir(results_root)
result_id = sum('fine_tuning_result' in name for name in dirs) + 1

# Get today's date and add it to the results directory.
# strftime zero-pads month and day; concatenating str(month) + str(day) is
# ambiguous (Jan 11 and Nov 1 would both render as "...111").
d = date.today()
date_stamp = d.strftime('%Y%m%d')
result_dir = '{}/BP_A2C_RNN_fine_tuning_result_{}_{}'.format(results_root, result_id, date_stamp)

# If earlier result folders were deleted, the count-based id may already be
# taken for today; advance until the path is free so os.mkdir cannot collide.
while os.path.exists(result_dir):
    result_id += 1
    result_dir = '{}/BP_A2C_RNN_fine_tuning_result_{}_{}'.format(results_root, result_id, date_stamp)

os.mkdir(result_dir)
print('Created Directory {} to store the results in'.format(result_dir))

Created Directory BP_A2C/fine_tuning_results/BP_A2C_RNN_fine_tuning_result_2_20231016 to store the results in


In [9]:
# Load pre-trained model weights
# The five checkpoints live in the same A2C result folder; only the trailing index differs.
checkpoint_dir = 'BP_A2C/results/a2c_result_2_20231014_entropycoef_0.03_valuepredcoef_0.1_batchsize_128_maxsteps_200_maxgradnorm_4.0_gammaR_0.99_l2coef_0_learningrate_0.0001_numtrainepisodes_3000_selectionmethod_evaluation'
weights = [
    torch.load(f'{checkpoint_dir}/checkpoint_BP_A2C_{idx}.pt', map_location=torch.device(device))
    for idx in range(5)
]
# Keep the individual per-checkpoint names available alongside the list.
weights_0, weights_1, weights_2, weights_3, weights_4 = weights

In [83]:
# Baseline sweep: score every pre-trained checkpoint, without any fine-tuning,
# at each `percentage` value forwarded to evaluate_BP_agent.
# NOTE(review): `percentage` presumably scales an environment parameter -- confirm in evaluate_BP_agent.
percentages = np.linspace(1.1, 2.0, 10)
for percentage in percentages:
    print(f"PERCENTAGE: {percentage}")
    for i_run, w in enumerate(weights):
        # Fresh network per checkpoint, moved to the target device before loading.
        agent_net = BP_RNetwork(4, 64, 2, 5).to(device)
        agent_net.loadWeights(w)

        # Mean reward over the evaluation episodes for this checkpoint.
        eval_rewards = evaluate_BP_agent(agent_net, env_name, n_evaluations, evaluation_seeds, percentage)
        mean_reward = np.mean(eval_rewards)
        print(f"model {i_run}: {mean_reward}")

PERCENTAGE: 1.1


model 0: 200.0
model 1: 200.0
model 2: 199.98
model 3: 200.0
model 4: 200.0
PERCENTAGE: 1.2000000000000002
model 0: 200.0
model 1: 200.0
model 2: 200.0
model 3: 200.0
model 4: 200.0
PERCENTAGE: 1.3
model 0: 200.0
model 1: 200.0
model 2: 200.0
model 3: 200.0
model 4: 200.0
PERCENTAGE: 1.4000000000000001
model 0: 200.0
model 1: 200.0
model 2: 200.0
model 3: 200.0
model 4: 200.0
PERCENTAGE: 1.5
model 0: 200.0
model 1: 200.0
model 2: 200.0
model 3: 200.0
model 4: 198.22
PERCENTAGE: 1.6
model 0: 200.0
model 1: 200.0
model 2: 200.0
model 3: 200.0
model 4: 153.3
PERCENTAGE: 1.7000000000000002
model 0: 200.0
model 1: 200.0
model 2: 200.0
model 3: 200.0
model 4: 30.2
PERCENTAGE: 1.8
model 0: 200.0
model 1: 200.0
model 2: 200.0
model 3: 200.0
model 4: 21.26
PERCENTAGE: 1.9
model 0: 200.0
model 1: 200.0
model 2: 200.0
model 3: 200.0
model 4: 21.47
PERCENTAGE: 2.0
model 0: 200.0
model 1: 200.0
model 2: 200.0
model 3: 200.0
model 4: 20.38


In [10]:
# Fine-tuning sweep: for every `percentage` value, fine-tune each of the
# pre-trained checkpoints and record the best evaluation reward and the
# episode at which it was reached. Mean and standard deviation across the
# checkpoints are collected per percentage.
percentages = np.linspace(1.1, 2.0, 10)
seed = 59       # seed handed to the A2C_Agent constructor
eval_skip = 1   # forwarded to fine_tune_agent
avg_best_rewards = []
std_dev_best_rewards = []
avg_best_episodes = []
std_dev_best_episodes = []
for percentage in percentages:
    print(f"PERCENTAGE: {percentage}------------\n")
    best_rewards = []
    best_episodes = []

    for i_run, w in enumerate(weights):
        agent_net = BP_RNetwork(4, 64, 2, 5).to(device)
        # Load a private copy of the checkpoint. The same objects in `weights`
        # are reused for every percentage, and the network is trained below;
        # if loadWeights keeps references to the tensors it is given, one
        # fine-tuning run would otherwise alter the starting point of later runs.
        agent_net.loadWeights(copy.deepcopy(w))

        optimizer = torch.optim.Adam(agent_net.parameters(), lr=learning_rate)
        agent = A2C_Agent(env_name, seed, agent_net, entropy_coef, value_pred_coef, gammaR,
                          max_grad_norm, max_steps, batch_size, num_training_episodes, optimizer, print_every,
                          save_every, i_run, result_dir, selection_method, num_evaluation_episodes, evaluation_seeds, max_reward)

        # Reference score of the untouched checkpoint at this percentage.
        eval_rewards = evaluate_BP_agent(agent.agent_net, env_name, n_evaluations, evaluation_seeds, percentage)
        print(f"Before fine-tuning: {np.mean(eval_rewards)}")

        # Only best_reward / best_episode feed the statistics below;
        # fine_tuned_weights is kept in the namespace for inspection.
        fine_tuned_weights, best_reward, best_episode = agent.fine_tune_agent(
            num_fine_tuning_episodes, eval_skip, fine_tuning_seeds, percentage,
            n_evaluations, evaluation_seeds, max_reward)

        best_rewards.append(best_reward)
        best_episodes.append(best_episode)

    # Aggregate over the checkpoints for this percentage.
    avg_best_rewards.append(np.mean(best_rewards))
    std_dev_best_rewards.append(np.std(best_rewards))
    avg_best_episodes.append(np.mean(best_episodes))
    std_dev_best_episodes.append(np.std(best_episodes))

PERCENTAGE: 1.1------------





Before fine-tuning: 200.0
Maximum evaluation performance already reached before fine-tuning

Best individual stored after episode 0 with reward 200.00

Before fine-tuning: 200.0
Maximum evaluation performance already reached before fine-tuning

Best individual stored after episode 0 with reward 200.00

Before fine-tuning: 199.98




Episode:    1 -- Reward:  200.00 -- Best reward:    -inf in episode    0
Best individual stored after episode 1 with reward 200.00

Before fine-tuning: 200.0
Maximum evaluation performance already reached before fine-tuning

Best individual stored after episode 0 with reward 200.00

Before fine-tuning: 200.0
Maximum evaluation performance already reached before fine-tuning

Best individual stored after episode 0 with reward 200.00

PERCENTAGE: 1.2000000000000002------------

Before fine-tuning: 200.0
Maximum evaluation performance already reached before fine-tuning

Best individual stored after episode 0 with reward 200.00

Before fine-tuning: 200.0
Maximum evaluation performance already reached before fine-tuning

Best individual stored after episode 0 with reward 200.00

Before fine-tuning: 200.0
Maximum evaluation performance already reached before fine-tuning

Best individual stored after episode 0 with reward 200.00

Before fine-tuning: 200.0
Maximum evaluation performance already

KeyboardInterrupt: 

In [109]:
# Show the per-percentage summaries: reward mean/std first, then episode mean/std.
for summary in (avg_best_rewards, std_dev_best_rewards, avg_best_episodes, std_dev_best_episodes):
    print(summary)

[200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.2, 0.0, 0.0, 0.0, 24.2, 24.0, 24.0, 24.0, 23.8, 23.8]
[0.4000000000000001, 0.0, 0.0, 0.0, 48.4, 48.0, 48.0, 48.0, 47.6, 47.6]


In [110]:
# Persist the summaries into the results folder.
# In each saved array, index 0 holds the means and index 1 the standard deviations.
reward_stats = [avg_best_rewards, std_dev_best_rewards]
episode_stats = [avg_best_episodes, std_dev_best_episodes]
np.save(f"{result_dir}/best_rewards_correct.npy", reward_stats)
np.save(f"{result_dir}/best_episodes_correct.npy", episode_stats)
[200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.2, 0.0, 0.0, 0.0, 24.2, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.4000000000000001, 0.0, 0.0, 0.0, 48.4, 0.0, 0.0, 0.0, 0.0, 0.0]

[0.4000000000000001, 0.0, 0.0, 0.0, 48.4, 0.0, 0.0, 0.0, 0.0, 0.0]