In [23]:
import os
from datetime import date

import numpy as np
import torch
import torch.optim as optim
import random
import gym
import matplotlib.pyplot as plt
from copy import deepcopy

from cartpole_stuff.src.utils import evaluate_policy, rstdp_train_cartpole, evaluate_BP_policy
from cartpole_stuff.src.dqn_agent import Agent, ReplayBuffer

import site
site.addsitedir('../src/')

from cartpole_stuff.src.dsnn import RSTDPNet
from backpropamine_DQN import BP_RNetwork, Standard_RNetwork

%matplotlib inline

In [24]:
# Run on the first CUDA device when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# torch.float32 is the same dtype object as torch.float
dtype = torch.float32

In [25]:
# Environment specific parameters
# env_name is the id handed to gym.make; max_steps goes to Agent and max_reward
# to fine_tune_agent in the fine-tuning cell.
env_name = 'CartPole-v0'
max_reward, max_steps = 200, 200

# Counts consumed by Agent (num_episodes) and by fine_tune_agent
# (num_fine_tuning_episodes, n_evaluations) in the fine-tuning cell.
n_evaluations, num_fine_tuning_episodes, num_episodes = 100, 250, 1000

In [26]:
# Create Results Directory
# Every entry of the working directory whose name contains 'fine_tuning_result'
# counts as an earlier result.
dirs = os.listdir('.')
results = [entry for entry in dirs if 'fine_tuning_result' in entry]

# Collect the run ids already in use: the integer token that directly follows
# 'fine_tuning_result_' in the entry name. Entries without such a token are
# still counted through len(results) below.
used_ids = []
for entry in results:
    id_token = entry.split('fine_tuning_result_', 1)[-1].split('_', 1)[0]
    if id_token.isdecimal():
        used_ids.append(int(id_token))

# One past the highest id seen. Falling back to the plain count keeps the old
# numbering when ids are contiguous, while the max() prevents re-using an id
# (and os.mkdir raising FileExistsError) after an earlier result was deleted.
result_id = max(used_ids + [len(results)]) + 1

# Get today's date and add it to the results directory.
# Zero-padded YYYYMMDD: unambiguous (2023-01-11 vs 2023-11-01) and sortable.
d = date.today()
result_dir = 'BP_fine_tuning_result_{}_{}'.format(result_id, d.strftime('%Y%m%d'))
os.mkdir(result_dir)
print('Created Directory {} to store the results in'.format(result_dir))

Created Directory BP_fine_tuning_result_5_20231010 to store the results in


In [27]:
# Hyperparameters
# All of these are passed to Agent in the fine-tuning cell, except l2_coef,
# which only reaches the Adam optimizer (as weight_decay).

# Replay / update settings
batch_size = 128
replay_memory_size = 40_000
update_every = 4
target_update_frequency = 100
tau = 0.001

# Discounting and epsilon settings
discount_factor = 0.999
eps_start, eps_end, eps_decay = 1.0, 0.05, 0.999

# Optimizer settings
learning_rate = 1e-3  # lr is 0.0001 for simple maze as default
l2_coef = 0  # 0 is default in simple maze task

In [28]:
# Create environments
# Unmodified instance of the task named by `env_name`. The fine-tuning cell
# builds its own separate instances via gym.make and rescales those, so this
# one keeps the default settings.
original_env = gym.make(env_name)



In [29]:
# SNN Hyperparameters
# `architecture` is the value handed to Agent in the fine-tuning cell.
# NOTE(review): the other settings are not referenced by the cells visible in
# this notebook excerpt — confirm whether they are still needed.
simulation_time, weight_scale = 8, 1
alpha, beta, threshold = 0.8, 0.8, 0.5
architecture = [4, 64, 64, 2]

In [30]:
# Seed arrays stored with the cartpole code; both are passed to
# Agent.fine_tune_agent in the fine-tuning cell.
evaluation_seeds, fine_tuning_seeds = (
    np.load(seed_file)
    for seed_file in (
        'cartpole_stuff/seeds/evaluation_seeds.npy',
        'cartpole_stuff/seeds/rstdp_training_seeds.npy',
    )
)

In [31]:
# Load pre-trained model weights
# One checkpoint per pre-training run, all stored in the same directory and
# numbered from 0. Each is mapped onto `device` while loading.
checkpoint_dir = 'simple_BP_DQRNN_training_1000eps'
n_pretrained_runs = 5

# NOTE(review): torch.load deserializes the file contents (pickle-based unless
# restricted via its `weights_only` option) — only load checkpoints you trust.
weights = [
    torch.load('{}/checkpoint_BP_DQRNN_{}.pt'.format(checkpoint_dir, run_idx),
               map_location=device)
    for run_idx in range(n_pretrained_runs)
]

# Keep the individual per-run names available for any code that still uses them.
weights_0, weights_1, weights_2, weights_3, weights_4 = weights

In [32]:
# Fine-tuning sweep: for every factor in `percentages`, build an environment
# whose `length` attribute is scaled by that factor, fine-tune each pre-trained
# network in `weights` on it, and record the mean of the per-run best reward
# and best episode reported by Agent.fine_tune_agent.

# Not filled by this cell; kept so code expecting these names still finds them.
BP_finetuning_rewards = []
adapted_weights_collection = []

# Multipliers applied to the environment's `length` (factors, not percent).
percentages = np.linspace(1.1, 2.0, 10)
# percentages = [1.1, 1.5, 2.0]
seed = 59
eval_skip = 1

# One entry per factor in `percentages`, in the same order.
avg_best_rewards = []
avg_best_episodes = []

for percentage in percentages:
    print(f"PERCENTAGE: {percentage}------------\n")
    best_rewards = []
    best_episodes = []

    modified_env = gym.make(env_name)
    # NOTE(review): only `length` is rescaled. If the installed CartPole
    # implementation caches values derived from it at construction time
    # (e.g. `polemass_length`), those keep their original value — confirm
    # this is the intended perturbation.
    modified_env.unwrapped.length *= percentage

    try:
        for i_run, w in enumerate(weights):
            # Fresh policy/target pair per run; the target network starts as
            # an exact copy of the policy network's state.
            policy_net = BP_RNetwork(4, 64, 2, 5).to(device)
            policy_net.loadWeights(w)
            target_net = BP_RNetwork(4, 64, 2, 5).to(device)
            target_net.load_state_dict(policy_net.state_dict())

            optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate, weight_decay=l2_coef)
            agent = Agent(env_name, policy_net, target_net, architecture, batch_size,
                          replay_memory_size, discount_factor, eps_start, eps_end, eps_decay,
                          update_every, target_update_frequency, optimizer, learning_rate,
                          num_episodes, max_steps, i_run, result_dir, seed, tau)

            fine_tuned_weights, best_reward, best_episode = agent.fine_tune_agent(
                num_fine_tuning_episodes, eval_skip, fine_tuning_seeds, modified_env,
                n_evaluations, evaluation_seeds, max_reward)

            best_rewards.append(best_reward)
            best_episodes.append(best_episode)
    finally:
        # Release this factor's environment even if a run fails part-way, so
        # the sweep does not leave one open environment per factor.
        modified_env.close()

    avg_best_rewards.append(np.mean(best_rewards))
    avg_best_episodes.append(np.mean(best_episodes))

PERCENTAGE: 1.1------------

Episode:    7 -- Reward:  200.00 -- Best reward:  156.01 in episode    1
Best individual stored after episode 7 with reward 200.00

Episode:    3 -- Reward:  200.00 -- Best reward:  172.12 in episode    1
Best individual stored after episode 3 with reward 200.00

Episode:    1 -- Reward:  200.00 -- Best reward:    -inf in episode   -1
Best individual stored after episode 1 with reward 200.00

Episode:    2 -- Reward:  200.00 -- Best reward:  115.00 in episode    1
Best individual stored after episode 2 with reward 200.00

Episode:   14 -- Reward:  200.00 -- Best reward:  157.76 in episode   13
Best individual stored after episode 14 with reward 200.00

PERCENTAGE: 1.5------------

Episode:  100 -- Reward:   64.70 -- Best reward:  167.83 in episode   33
Best individual stored after episode 33 with reward 167.83

Episode:   24 -- Reward:  200.00 -- Best reward:  198.15 in episode   17
Best individual stored after episode 24 with reward 200.00

Episode:  100 -

In [22]:
# Show the per-factor means collected by the fine-tuning cell:
# first the mean best rewards, then the mean best episodes, one list per line.
print(avg_best_rewards, avg_best_episodes, sep='\n')

[200.0, 197.326, 187.166]
[7.0, 17.4, 29.0]
