In [1]:
import os
from datetime import date

import numpy as np
import torch
import torch.optim as optim
import random
import gym
import matplotlib.pyplot as plt
from copy import deepcopy

from rstdp_cartpole_stuff.src.dqn_agent import Agent, ReplayBuffer

import site
site.addsitedir('../src/')


from backpropamine_DQN import BP_RNetwork, Standard_RNetwork

%matplotlib inline

In [2]:
# Pick the compute device: the first CUDA GPU when torch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Default floating-point dtype used by this notebook.
dtype = torch.float

In [3]:
# Environment specific parameters
# `env_name` is the gym id used for every environment created below;
# `max_steps` is handed to the Agent and `max_reward` to its fine-tuning routine.
env_name = 'CartPole-v0'
max_reward = max_steps = 200

# Experiment budget: number of evaluation runs, fine-tuning episodes per
# network, and the episode count passed to the Agent constructor.
n_evaluations, num_fine_tuning_episodes, num_episodes = 100, 250, 1000

In [4]:
# Create Results Directory
# Every run of this cell creates a fresh folder in the working directory named
#     BP_fine_tuning_result_<id>_<YYYYMMDD>
# where <id> is one larger than the highest id already present.


def parse_result_id(name):
    """Return the numeric run id embedded in a result-directory name.

    A name qualifies when it contains 'fine_tuning_result_' immediately
    followed by a decimal id (e.g. 'BP_fine_tuning_result_5_20231016' -> 5).
    Anything else, including the 'fine_tuning_results' output folder, yields
    None so it is not counted as a previous run.
    """
    head, token, tail = name.partition('fine_tuning_result_')
    run_id = tail.split('_', 1)[0]
    if token and run_id.isdecimal():
        return int(run_id)
    return None


dirs = os.listdir('.')
existing_ids = [run_id for run_id in map(parse_result_id, dirs) if run_id is not None]

# Continue after the highest id in use rather than counting entries, so that
# deleting an older result folder can never make a new run reuse an id.
result_id = max(existing_ids, default=0) + 1

# Get today's date and add it to the results directory.
# strftime zero-pads month and day, so the suffix is always eight digits and
# unambiguous (plain str(month) + str(day) would render both Jan 11 and Nov 1
# as '111').
d = date.today()
result_dir = 'BP_fine_tuning_result_' + str(result_id) + '_{}'.format(
    d.strftime('%Y%m%d'))
os.mkdir(result_dir)
print('Created Directory {} to store the results in'.format(result_dir))

Created Directory BP_fine_tuning_result_5_20231016 to store the results in


In [5]:
# Hyperparameters
# All of these are forwarded to the Adam optimizer or the Agent constructor
# in the fine-tuning cell.

# Optimizer settings
learning_rate = 1e-3  # lr is 0.0001 for simple maze as default
l2_coef = 0  # used as Adam's weight_decay; 0 is default in simple maze task

# eps_* values handed to the Agent (presumably its epsilon-greedy
# exploration schedule -- confirm in dqn_agent).
eps_start, eps_end, eps_decay = 1.0, 0.05, 0.999

# Replay buffer and update cadence
batch_size = 128
replay_memory_size = 40_000
update_every = 4
target_update_frequency = 100

# Remaining Agent arguments
discount_factor = 0.999
tau = 0.001

In [6]:
# Create environments
# Unmodified environment built from `env_name`. None of the later cells shown
# in this notebook reference `original_env`; the fine-tuning loop creates its
# own environment for each entry of `percentages`.
original_env = gym.make(env_name)





In [7]:
# SNN Hyperparameters
# Of these, only `architecture` is consumed by the cells shown in this
# notebook (it is passed to the Agent constructor).
architecture = [4, 64, 64, 2]

simulation_time, threshold, weight_scale = 8, 0.5, 1
alpha = beta = 0.8

In [8]:
# Load the stored seed arrays: one set used when evaluating a network and one
# used while fine-tuning it (both are passed to Agent.fine_tune_agent below).
evaluation_seeds, fine_tuning_seeds = (
    np.load(seed_file)
    for seed_file in (
        'rstdp_cartpole_stuff/seeds/evaluation_seeds.npy',
        'rstdp_cartpole_stuff/seeds/rstdp_training_seeds.npy',
    )
)

In [9]:
# Load pre-trained model weights
# Checkpoints 0-4 are read in index order and mapped onto `device`, so they
# load regardless of the device they were saved from.
weights = [
    torch.load(
        'trained_models/simple_BP_DQRNN_training_1000eps/checkpoint_BP_DQRNN_{}.pt'.format(run_idx),
        map_location=torch.device(device),
    )
    for run_idx in range(5)
]

# Keep the individual per-checkpoint names available alongside the list.
weights_0, weights_1, weights_2, weights_3, weights_4 = weights

In [10]:
# Fine-tuning sweep.
# For each of ten evenly spaced factors between 1.1 and 2.0, build a CartPole
# environment whose `length` attribute is scaled by that factor, fine-tune
# every pre-trained network in `weights` on it, and record the mean / standard
# deviation (across networks) of the best reward and of the episode in which
# it was reached, as reported by Agent.fine_tune_agent.
percentages = np.linspace(1.1, 2.0, 10)
seed = 59  # the same seed is given to every Agent
eval_skip = 1
avg_best_rewards = []
std_dev_best_rewards = []
avg_best_episodes = []
std_dev_best_episodes = []
for percentage in percentages:
    print(f"PERCENTAGE: {percentage}------------\n")
    best_rewards = []
    best_episodes = []

    modified_env = gym.make(env_name)
    try:
        # NOTE(review): gym's CartPoleEnv appears to precompute
        # `polemass_length` from `length` in its constructor; scaling only
        # `length` would leave that value unchanged -- confirm this is the
        # intended perturbation.
        modified_env.unwrapped.length *= percentage

        for i_run, w in enumerate(weights):

            # Fresh policy network initialised from the pre-trained weights,
            # plus a target network that starts as an exact copy of it.
            policy_net = BP_RNetwork(4, 64, 2, 5).to(device)
            policy_net.loadWeights(w)
            target_net = BP_RNetwork(4, 64, 2, 5).to(device)
            target_net.load_state_dict(policy_net.state_dict())

            optimizer = optim.Adam(policy_net.parameters(), lr = learning_rate, weight_decay = l2_coef)
            agent = Agent(env_name, policy_net, target_net, architecture, batch_size,
                        replay_memory_size, discount_factor, eps_start, eps_end, eps_decay,
                        update_every, target_update_frequency, optimizer, learning_rate,
                        num_episodes, max_steps, i_run, result_dir, seed, tau)

            fine_tuned_weights, best_reward, best_episode = agent.fine_tune_agent(num_fine_tuning_episodes, eval_skip, fine_tuning_seeds, modified_env, n_evaluations, evaluation_seeds, max_reward)

            best_rewards.append(best_reward)
            best_episodes.append(best_episode)
    finally:
        # Release the environment even when a run fails or the cell is
        # interrupted; otherwise one environment per factor stays open for
        # the lifetime of the kernel.
        modified_env.close()

    # Aggregate over the networks fine-tuned for this factor.
    avg_best_rewards.append(np.mean(best_rewards))
    std_dev_best_rewards.append(np.std(best_rewards))
    avg_best_episodes.append(np.mean(best_episodes))
    std_dev_best_episodes.append(np.std(best_episodes))
        

PERCENTAGE: 1.1------------

Episode:    7 -- Reward:  200.00 -- Best reward:  156.01 in episode    1
Best individual stored after episode 7 with reward 200.00

Episode:    3 -- Reward:  200.00 -- Best reward:  172.12 in episode    1
Best individual stored after episode 3 with reward 200.00

Episode:    1 -- Reward:  200.00 -- Best reward:    -inf in episode   -1
Best individual stored after episode 1 with reward 200.00

Episode:    2 -- Reward:  200.00 -- Best reward:  115.00 in episode    1
Best individual stored after episode 2 with reward 200.00

Episode:   14 -- Reward:  200.00 -- Best reward:  157.76 in episode   13
Best individual stored after episode 14 with reward 200.00

PERCENTAGE: 1.2000000000000002------------

Episode:   42 -- Reward:  200.00 -- Best reward:  194.45 in episode   40
Best individual stored after episode 42 with reward 200.00

Episode:    1 -- Reward:  150.83 -- Best reward:    -inf in episode   -1

KeyboardInterrupt: 

In [14]:
# Show the four per-factor result lists, one list per output line.
print(
    avg_best_rewards,
    std_dev_best_rewards,
    avg_best_episodes,
    std_dev_best_episodes,
    sep='\n',
)

[200.0, 200.0, 200.0, 200.0, 190.478, 199.02599999999998, 186.132, 192.072, 162.158, 165.594]
[0.0, 0.0, 0.0, 0.0, 19.044000000000004, 1.5233857029655964, 26.669124020109848, 9.949157552275466, 46.67789301157455, 43.52876363969002]
[5.4, 31.2, 79.8, 75.8, 103.6, 112.2, 112.8, 157.0, 133.4, 120.8]
[4.758150901348127, 30.87005021051958, 77.4477888644989, 75.13028683560313, 98.17046398993945, 93.97318766541869, 74.46985967490473, 60.64981450919698, 57.37804458152961, 54.590841722765184]


In [16]:
# Persist the sweep statistics. Each file holds a 2 x N array: row 0 is the
# per-factor mean, row 1 the per-factor standard deviation.
# np.save does not create missing parent folders, so make sure the output
# directory exists first instead of failing after the long sweep has finished.
os.makedirs("fine_tuning_results", exist_ok=True)
np.save("fine_tuning_results/BP_DQRNN_1000eps_best_rewards_correct.npy", [avg_best_rewards, std_dev_best_rewards])
np.save("fine_tuning_results/BP_DQRNN_1000eps_best_episodes_correct.npy", [avg_best_episodes, std_dev_best_episodes])