## **Imports, and Checking PyTorch Device**

In [None]:
from utils import DQN, ReplayBuffer, greedy_action, epsilon_greedy, update_target, loss

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import math
import numpy as np

import gym
import matplotlib.pyplot as plt

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
print()

## **Part 1: DQN Tuning**
In each of the following sections, we vary one specific hyperparameter while keeping the others fixed. As a consequence, the fixed hyperparameters are typically not yet at their ideal values, because they have not been tuned at the point where they are held constant. This does not matter significantly, however, as all fixed hyperparameters are chosen reasonably, so the model can still provide representative results of how varying a hyperparameter affects the return.

#### **Hyperparameter 1:** Architecture

In [None]:
############ These are the hyperparameters which we change in the DQN
# Candidate network layouts handed to DQN (presumably layer sizes: 4 inputs,
# hidden widths, 2 outputs - confirm against utils.DQN). Five layouts with one
# hidden entry, followed by five layouts with two hidden entries.
architecture = (
    [[4, hidden, 2] for hidden in (4, 8, 16, 32, 64)]
    + [[4, 4, 4, 2], [4, 8, 4, 2], [4, 16, 8, 2], [4, 32, 16, 2], [4, 64, 32, 2]]
)
# Everything below is held fixed while the architecture is varied
buffer_size = 50_000      # size passed to ReplayBuffer
batch_size = 128          # transitions sampled per optimisation step
update_frequency = 1      # target network is synced every this many episodes
learning_rate = 1e-3      # Adam learning rate
NUM_RUNS = 10             # independent training runs per architecture
EPSILON = 1               # initial exploration rate
epsilon_decay = 0.9       # multiplicative decay applied to epsilon at each step
min_epsilon = 0.1         # floor for the exploration rate
# Filled by the training cell: one list of runs per architecture
architecture_runs = []

In [None]:
# Train a DQN for every candidate architecture, NUM_RUNS times each, and record
# the duration of every episode. `architecture_runs` receives one entry per
# architecture: a list holding, for every run, the list of episode durations.
num_episodes = 300  # episodes per run, as fixed by the problem statement
# Make CartPole environment
env = gym.make('CartPole-v1')
for layer_sizes in architecture:
    # Print which architecture we're currently testing
    print('Architecture: ', layer_sizes)
    runs_results = []
    for run in range(NUM_RUNS):
        print(f"Starting run {run+1} of {NUM_RUNS}")
        # Create the policy and target networks and start them from the same weights
        policy_net = DQN(layer_sizes)
        target_net = DQN(layer_sizes)
        update_target(target_net, policy_net)
        # The optimizer below only touches the policy network; the target
        # network is kept in evaluation mode
        target_net.eval()

        # Adam optimizer on the policy network parameters, with the chosen learning rate
        optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
        # Replay memory of size buffer_size
        memory = ReplayBuffer(buffer_size)

        # Restart the exploration schedule for every run. Previously the global
        # EPSILON was decayed in place and never reset, so only the very first
        # run of the first architecture explored from 1 and every later run
        # started already at the floor, making the runs incomparable.
        epsilon = EPSILON
        # Duration (number of steps) of every episode in this run
        episode_durations = []

        for i_episode in range(num_episodes):
            # Progress message every 50 episodes
            if (i_episode+1) % 50 == 0:
                print("episode ", i_episode+1, "/", num_episodes)

            # Reset the environment and turn the first observation into a state tensor
            observation, info = env.reset()
            state = torch.tensor(observation).float()

            # env.step() returns (observation, reward, terminated, truncated, info):
            # the episode ends either when the pole drops (terminated) or when
            # the step limit of 500 is reached (truncated)
            terminated = False
            truncated = False
            t = 0
            while not (terminated or truncated):
                # Pick the epsilon greedy action from the policy network
                action = epsilon_greedy(epsilon, policy_net, state)
                # Take a step in the direction of the action
                observation, reward, terminated, truncated, info = env.step(action)
                # Get the reward and action as tensors
                reward = torch.tensor([reward])
                action = torch.tensor([action])
                # Define the next state as the new observation but in tensor format
                next_state = torch.tensor(observation).reshape(-1).float()
                # Store the transition; the flag stored is the first boolean
                # returned by env.step(), exactly as before (only renamed)
                memory.push([state, action, next_state, reward, torch.tensor([terminated])])

                # Move to the next state
                state = next_state

                # Perform one optimization step on the policy network once the
                # buffer holds at least one full batch
                if len(memory.buffer) >= batch_size:
                    # Get a random sample from the buffer of size batch_size
                    transitions = memory.sample(batch_size)
                    # Stack each field of the sampled transitions into one batch tensor
                    state_batch, action_batch, nextstate_batch, reward_batch, dones = (torch.stack(x) for x in zip(*transitions))
                    # Compute the MSE loss between the target and policy
                    mse_loss = loss(policy_net, target_net, state_batch, action_batch, reward_batch, nextstate_batch, dones)
                    # Optimize the model
                    optimizer.zero_grad()
                    mse_loss.backward()
                    optimizer.step()

                # Count the step just taken
                t = t + 1

                # Decay exploration after every step, clamped so that epsilon
                # never drops below min_epsilon (not even for a single step)
                epsilon = max(min_epsilon, epsilon * epsilon_decay)

            # The episode is over: t is the number of steps it lasted
            episode_durations.append(t)

            # Update the target network, copying all weights and biases in DQN only at the update_frequency defined
            if i_episode % update_frequency == 0:
                update_target(target_net, policy_net)
        # Append episodes to results of run
        runs_results.append(episode_durations)
    # Append runs of specific architecture to that architecture's runs
    architecture_runs.append(runs_results)
print('Complete')

In [None]:
# ############# SAVE ARCHITECTURE RUNS TO LOAD EASIER
# import pickle
# with open ('architecture_1Update.pkl', 'wb') as f:
#     pickle.dump(architecture_runs, f) 

In [None]:
############# PLOT ARCHITECTURE RUNS
# For every architecture, average the episode returns over its runs and plot the
# mean learning curve together with a +/- one standard deviation band.
curves = []
for arch, arch_results in zip(architecture, architecture_runs):
    # One row per run, one column per episode; statistics are taken across runs
    results = torch.tensor(arch_results).float()
    # The label is generated from the architecture itself so the legend cannot
    # drift from the configuration (the hand-written "[4, 16 2]" had a typo)
    curves.append((f"Architecture = {arch}", results.mean(0), results.std(0)))

# Plot each architecture mean
for label, means, stds in curves:
    plt.plot(np.arange(len(means)), means, label=label)
# Plot each architecture STD
for label, means, stds in curves:
    plt.fill_between(np.arange(len(means)), means-stds, means+stds, alpha=0.2, color='dimgray')

# Define plot parameters - x label, y label, the 100 horizontal line, etc
plt.ylabel("Return")
plt.xlabel("Episode")
plt.axhline(y=100, linestyle='--', color='r')
plt.legend(loc="upper left")
plt.title("Network Architecture Learning Curves")
# Save the architecture figure as a jpg
# plt.savefig("hana_architectures_DiffArch_P2_1Update.jpg")

# Show figure
plt.show()

#### **Hyperparameter 2:** Epsilon Decay

In [None]:
# Final architecture, as chosen from the architecture study above
architecture_final = [4, 64, 32, 2]
# Settings held fixed during this study
buffer_size = 50_000      # size passed to ReplayBuffer
batch_size = 128          # transitions sampled per optimisation step
update_frequency = 1      # target network is synced every this many episodes
learning_rate = 1e-3      # Adam learning rate
NUM_RUNS = 10             # independent training runs per setting
# This is what we vary this time - the epsilon decay rates
EPSILON = 1               # initial exploration rate
epsilon_decay_range = [0.99, 0.95, 0.9, 0.8]
min_epsilon = 0.1         # floor for the exploration rate
# Filled by the training cell: one list of runs per decay rate
epsilon_runs = []

In [None]:
############# RUNNING CODE FOR DQN TESTING
# Train the final architecture once per epsilon decay rate, NUM_RUNS times each.
# `epsilon_runs` receives one entry per decay rate: a list holding, for every
# run, the list of episode durations.
num_episodes = 300  # episodes per run
# Make CartPole environment
env = gym.make('CartPole-v1')
for decay in epsilon_decay_range:
    print('Epsilon: ', decay)
    runs_results = []
    for run in range(NUM_RUNS):
        print(f"Starting run {run+1} of {NUM_RUNS}")
        # Create the policy and target networks and start them from the same weights
        policy_net = DQN(architecture_final)
        target_net = DQN(architecture_final)
        update_target(target_net, policy_net)
        # The optimizer below only touches the policy network; the target
        # network is kept in evaluation mode
        target_net.eval()

        # Adam optimizer on the policy network parameters
        optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
        # Replay memory of size buffer_size
        memory = ReplayBuffer(buffer_size)

        # Duration (number of steps) of every episode in this run
        episode_durations = []

        for i_episode in range(num_episodes):
            # Reset epsilon every episode
            EPSILON = 1
            # Progress message every 50 episodes
            if (i_episode+1) % 50 == 0:
                print("episode ", i_episode+1, "/", num_episodes)

            # Reset the environment and turn the first observation into a state tensor
            observation, info = env.reset()
            state = torch.tensor(observation).float()

            # env.step() returns (observation, reward, terminated, truncated, info)
            terminated = False
            truncated = False
            t = 0
            while not (terminated or truncated):
                # Select and perform an action
                action = epsilon_greedy(EPSILON, policy_net, state)
                observation, reward, terminated, truncated, info = env.step(action)
                # Get the reward and action as tensors
                reward = torch.tensor([reward])
                action = torch.tensor([action])
                # Define the next state as the observation but in tensor format
                next_state = torch.tensor(observation).reshape(-1).float()
                # Store the transition; the flag stored is the first boolean
                # returned by env.step(), exactly as before (only renamed)
                memory.push([state, action, next_state, reward, torch.tensor([terminated])])

                # Move to the next state
                state = next_state

                # Perform one optimization step on the policy network once the
                # buffer holds at least one full batch
                if len(memory.buffer) >= batch_size:
                    # Get a random sample from the buffer
                    transitions = memory.sample(batch_size)
                    # Stack each field of the sampled transitions into one batch tensor
                    state_batch, action_batch, nextstate_batch, reward_batch, dones = (torch.stack(x) for x in zip(*transitions))
                    # Compute the MSE loss between the target and policy
                    mse_loss = loss(policy_net, target_net, state_batch, action_batch, reward_batch, nextstate_batch, dones)
                    # Optimize the model
                    optimizer.zero_grad()
                    mse_loss.backward()
                    optimizer.step()

                # Count the step just taken
                t = t + 1

                # Decay exploration after every step, clamped so that epsilon
                # never drops below min_epsilon (not even for a single step)
                EPSILON = max(min_epsilon, EPSILON * decay)

            # The episode is over: t is the number of steps it lasted
            episode_durations.append(t)

            # Update the target network, copying all weights and biases in DQN
            if i_episode % update_frequency == 0:
                update_target(target_net, policy_net)
        # Append episode durations to results of run
        runs_results.append(episode_durations)
    epsilon_runs.append(runs_results)
print('Complete')

In [None]:
# ############# Save results of epsilon varying
# import pickle
# with open ('epsilon_1Update.pkl', 'wb') as f:
#     pickle.dump(epsilon_runs, f) 

In [None]:
############# Plot results of epsilon varying
# Mean and standard deviation across runs for each decay rate, in the order the
# rates were trained
legend_labels = ("Epsilon = 0.99", "Epsilon = 0.95", "Epsilon = 0.9", "Epsilon = 0.8")
episode_axis = np.arange(300)
summaries = []
for idx, legend_label in enumerate(legend_labels):
    per_run = torch.tensor(epsilon_runs[idx]).float()
    summaries.append((legend_label, per_run.mean(0), per_run.std(0)))

# Mean learning curves first, then the +/- one standard deviation bands
for legend_label, mean_curve, std_curve in summaries:
    plt.plot(torch.arange(300), mean_curve, label=legend_label)
for legend_label, mean_curve, std_curve in summaries:
    plt.fill_between(episode_axis, mean_curve-std_curve, mean_curve+std_curve, alpha=0.2, color='dimgray')

plt.xlabel("Episode")
plt.ylabel("Return")
# Dashed reference line at a return of 100
plt.axhline(y=100, linestyle='--', color='r')
plt.legend(loc="upper left")
plt.title("Different Epsilon Learning Curves")
# plt.savefig("hana_architectures_DiffEpChangeInEp_1Update.jpg")
plt.show()

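*Here, we just want to see how fast epsilon decays for each decay rate. Note that in the training loop above the decay is applied once per environment step, and epsilon is reset to 1 at the start of every episode.*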

In [None]:
# Plot the geometric schedule rate**k for k = 0..299, one curve per decay rate
# (no minimum-epsilon floor is applied here)
numbers = list(range(300))
epsilonRange = [0.99, 0.95, 0.9, 0.8]
for rate in epsilonRange:
    schedule = [math.pow(rate, k) for k in numbers]
    plt.plot(numbers, schedule, label=f"Epsilon = {rate}")
plt.legend(loc="upper right")
plt.title("Different Epsilon Decay Rates")
# plt.savefig("hana_architectures_DiffEpDecay.jpg")
plt.show()

#### **Hyperparameter 3:** Buffer Size

In [None]:
# Final architecture, as chosen from the architecture study above
architecture_final = [4, 64, 32, 2]
# This is what we vary this time - the sizes passed to ReplayBuffer
buffer_size = [10_000, 30_000, 50_000, 60_000, 80_000, 100_000]
# Settings held fixed during this study
batch_size = 128          # transitions sampled per optimisation step
update_frequency = 1      # target network is synced every this many episodes
learning_rate = 1e-3      # Adam learning rate
NUM_RUNS = 10             # independent training runs per setting
EPSILON = 1               # initial exploration rate
epsilon_decay = 0.8       # multiplicative decay applied to epsilon at each step
min_epsilon = 0.1         # floor for the exploration rate
# Filled by the training cell: one list of runs per buffer size
buffer_runs = []

In [None]:
# Train the final architecture once per replay buffer size, NUM_RUNS times each.
# `buffer_runs` receives one entry per buffer size: a list holding, for every
# run, the list of episode durations.
num_episodes = 300  # episodes per run
env = gym.make('CartPole-v1')
for capacity in buffer_size:
    print('Buffer: ', capacity)
    runs_results = []
    for run in range(NUM_RUNS):
        print(f"Starting run {run+1} of {NUM_RUNS}")
        # Create the policy and target networks and start them from the same weights
        policy_net = DQN(architecture_final)
        target_net = DQN(architecture_final)
        update_target(target_net, policy_net)
        # The optimizer below only touches the policy network; the target
        # network is kept in evaluation mode
        target_net.eval()

        # Adam optimizer on the policy network parameters
        optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
        # Replay memory with the size under test
        memory = ReplayBuffer(capacity)

        # Duration (number of steps) of every episode in this run
        episode_durations = []

        for i_episode in range(num_episodes):
            # Reset epsilon every episode
            EPSILON = 1
            # Progress message every 50 episodes
            if (i_episode+1) % 50 == 0:
                print("episode ", i_episode+1, "/", num_episodes)

            # Reset the environment and turn the first observation into a state tensor
            observation, info = env.reset()
            state = torch.tensor(observation).float()

            # env.step() returns (observation, reward, terminated, truncated, info)
            terminated = False
            truncated = False
            t = 0
            while not (terminated or truncated):
                # Select and perform an action
                action = epsilon_greedy(EPSILON, policy_net, state)
                observation, reward, terminated, truncated, info = env.step(action)
                # Get the reward and action as tensors
                reward = torch.tensor([reward])
                action = torch.tensor([action])
                # Define the next state as the observation but in tensor format
                next_state = torch.tensor(observation).reshape(-1).float()
                # Store the transition; the flag stored is the first boolean
                # returned by env.step(), exactly as before (only renamed)
                memory.push([state, action, next_state, reward, torch.tensor([terminated])])

                # Move to the next state
                state = next_state

                # Perform one optimization step on the policy network once the
                # buffer holds at least one full batch
                if len(memory.buffer) >= batch_size:
                    # Get a random sample from the buffer
                    transitions = memory.sample(batch_size)
                    # Stack each field of the sampled transitions into one batch tensor
                    state_batch, action_batch, nextstate_batch, reward_batch, dones = (torch.stack(x) for x in zip(*transitions))
                    # Compute the MSE loss between the target and policy
                    mse_loss = loss(policy_net, target_net, state_batch, action_batch, reward_batch, nextstate_batch, dones)
                    # Optimize the model
                    optimizer.zero_grad()
                    mse_loss.backward()
                    optimizer.step()

                # Count the step just taken
                t = t + 1

                # Decay exploration after every step, clamped so that epsilon
                # never drops below min_epsilon (not even for a single step)
                EPSILON = max(min_epsilon, EPSILON * epsilon_decay)

            # The episode is over: t is the number of steps it lasted
            episode_durations.append(t)

            # Update the target network, copying all weights and biases in DQN
            if i_episode % update_frequency == 0:
                update_target(target_net, policy_net)
        # Append episode durations to results of run
        runs_results.append(episode_durations)
    buffer_runs.append(runs_results)
print('Complete')

In [None]:
# ########## Save results of each buffer run
# import pickle
# with open ('buffer_1Update.pkl', 'wb') as f:
#     pickle.dump(buffer_runs, f) 

In [None]:

# Mean and standard deviation across runs for each buffer size, in the order
# the sizes were trained
legend_labels = (
    "Buffer size = 10000",
    "Buffer size = 30000",
    "Buffer size = 50000",
    "Buffer size = 60000",
    "Buffer size = 80000",
    "Buffer size = 100000",
)
episode_axis = np.arange(300)
summaries = []
for idx, legend_label in enumerate(legend_labels):
    per_run = torch.tensor(buffer_runs[idx]).float()
    summaries.append((legend_label, per_run.mean(0), per_run.std(0)))

# Mean learning curves first, then the +/- one standard deviation bands
for legend_label, mean_curve, std_curve in summaries:
    plt.plot(torch.arange(300), mean_curve, label=legend_label)
for legend_label, mean_curve, std_curve in summaries:
    plt.fill_between(episode_axis, mean_curve-std_curve, mean_curve+std_curve, alpha=0.2, color='dimgray')

plt.xlabel("Episode")
plt.ylabel("Return")
# Dashed reference line at a return of 100
plt.axhline(y=100, linestyle='--', color='r')
plt.legend(loc="upper left")
plt.title("Different Buffer Size Learning Curves")
# plt.savefig("hana_architectures_DiffBuffer_1Update.jpg")
plt.show()

#### **Hyperparameter 4:** Update Frequency

In [None]:
# Final architecture, as chosen from the architecture study above
architecture_final = [4, 64, 32, 2]
# Settings held fixed during this study
buffer_size = 60_000      # size passed to ReplayBuffer
batch_size = 128          # transitions sampled per optimisation step
# This is what we vary this time - the target network is synced whenever the
# episode index is a multiple of the value under test
update_frequency = [1, 10, 50, 100, 200]
learning_rate = 1e-3      # Adam learning rate
NUM_RUNS = 10             # independent training runs per setting
EPSILON = 1               # initial exploration rate
epsilon_decay = 0.8       # multiplicative decay applied to epsilon at each step
min_epsilon = 0.1         # floor for the exploration rate
# Filled by the training cell: one list of runs per update frequency
update_runs = []

In [None]:
# Train the final architecture once per target-update frequency, NUM_RUNS times
# each. `update_runs` receives one entry per frequency: a list holding, for
# every run, the list of episode durations.
num_episodes = 300  # episodes per run

env = gym.make('CartPole-v1')
for sync_every in update_frequency:
    print('Update Freq: ', sync_every)
    runs_results = []
    for run in range(NUM_RUNS):
        print(f"Starting run {run+1} of {NUM_RUNS}")
        # Create the policy and target networks and start them from the same weights
        policy_net = DQN(architecture_final)
        target_net = DQN(architecture_final)
        update_target(target_net, policy_net)
        # The optimizer below only touches the policy network; the target
        # network is kept in evaluation mode
        target_net.eval()

        # Adam optimizer on the policy network parameters
        optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
        # Replay memory of size buffer_size
        memory = ReplayBuffer(buffer_size)

        # Duration (number of steps) of every episode in this run
        episode_durations = []

        for i_episode in range(num_episodes):
            # Reset epsilon every episode
            EPSILON = 1
            # Progress message every 50 episodes
            if (i_episode+1) % 50 == 0:
                print("episode ", i_episode+1, "/", num_episodes)

            # Reset the environment and turn the first observation into a state tensor
            observation, info = env.reset()
            state = torch.tensor(observation).float()

            # env.step() returns (observation, reward, terminated, truncated, info)
            terminated = False
            truncated = False
            t = 0
            while not (terminated or truncated):
                # Select and perform an action
                action = epsilon_greedy(EPSILON, policy_net, state)
                observation, reward, terminated, truncated, info = env.step(action)
                # Get the reward and action as tensors
                reward = torch.tensor([reward])
                action = torch.tensor([action])
                # Define the next state as the observation but in tensor format
                next_state = torch.tensor(observation).reshape(-1).float()
                # Store the transition; the flag stored is the first boolean
                # returned by env.step(), exactly as before (only renamed)
                memory.push([state, action, next_state, reward, torch.tensor([terminated])])

                # Move to the next state
                state = next_state

                # Perform one optimization step on the policy network once the
                # buffer holds at least one full batch
                if len(memory.buffer) >= batch_size:
                    # Get a random sample from the buffer
                    transitions = memory.sample(batch_size)
                    # Stack each field of the sampled transitions into one batch tensor
                    state_batch, action_batch, nextstate_batch, reward_batch, dones = (torch.stack(x) for x in zip(*transitions))
                    # Compute the MSE loss between the target and policy
                    mse_loss = loss(policy_net, target_net, state_batch, action_batch, reward_batch, nextstate_batch, dones)
                    # Optimize the model
                    optimizer.zero_grad()
                    mse_loss.backward()
                    optimizer.step()

                # Count the step just taken
                t = t + 1

                # Decay exploration after every step, clamped so that epsilon
                # never drops below min_epsilon (not even for a single step)
                EPSILON = max(min_epsilon, EPSILON * epsilon_decay)

            # The episode is over: t is the number of steps it lasted
            episode_durations.append(t)

            # Update the target network whenever the episode index is a
            # multiple of the frequency under test (so always at episode 0)
            if i_episode % sync_every == 0:
                update_target(target_net, policy_net)
        # Append episode durations to results of run
        runs_results.append(episode_durations)
    update_runs.append(runs_results)
print('Complete')



In [None]:
# ######### Save results of different update frequencies
# import pickle
# with open ('updateFreq.pkl', 'wb') as f:
#     pickle.dump(update_runs, f) 

In [None]:
########## Plot update frequency results
# Mean and standard deviation across runs for each update frequency, in the
# order the frequencies were trained
legend_labels = (
    "Update Freq = 1",
    "Update Freq = 10",
    "Update Freq = 50",
    "Update Freq = 100",
    "Update Freq = 200",
)
episode_axis = np.arange(300)
summaries = []
for idx, legend_label in enumerate(legend_labels):
    per_run = torch.tensor(update_runs[idx]).float()
    summaries.append((legend_label, per_run.mean(0), per_run.std(0)))

# Mean learning curves first, then the +/- one standard deviation bands
for legend_label, mean_curve, std_curve in summaries:
    plt.plot(torch.arange(300), mean_curve, label=legend_label)
for legend_label, mean_curve, std_curve in summaries:
    plt.fill_between(episode_axis, mean_curve-std_curve, mean_curve+std_curve, alpha=0.2, color='dimgray')

plt.xlabel("Episode")
plt.ylabel("Return")
# Dashed reference line at a return of 100
plt.axhline(y=100, linestyle='--', color='r')
plt.legend(loc="upper left")
plt.title("Different Update Frequency Learning Curves")
# plt.savefig("hana_architectures_DiffUpdate_1Update.jpg")
plt.show()

#### **Hyperparameter 5:** Learning Rate

In [None]:
# Final architecture, as chosen from the architecture study above
architecture_final = [4, 64, 32, 2]
# Settings held fixed during this study
buffer_size = 60_000      # size passed to ReplayBuffer
batch_size = 128          # transitions sampled per optimisation step
update_frequency = 1      # target network is synced every this many episodes
# This is what we vary this time - the Adam learning rates
learning_rate = [1e-3, 2.5e-3, 1e-2, 1e-1]
NUM_RUNS = 10             # independent training runs per setting
EPSILON = 1               # initial exploration rate
epsilon_decay = 0.8       # multiplicative decay applied to epsilon at each step
min_epsilon = 0.1         # floor for the exploration rate
# Filled by the training cell: one list of runs per learning rate
learning_runs = []

In [None]:
# Learning-rate sweep: for every candidate learning rate, train NUM_RUNS fresh
# agents and record the duration of each episode. All other hyperparameters
# keep the values set in the configuration cell above.
num_episodes = 300
env = gym.make('CartPole-v1')
for lr in learning_rate:
    print('Learning Rate: ', lr)
    # Episode durations of every run trained with this learning rate
    runs_results = []
    for run in range(NUM_RUNS):
        print(f"Starting run {run+1} of {NUM_RUNS}")
        # Fresh policy network, and a target network synchronised with it
        policy_net = DQN(architecture_final)
        target_net = DQN(architecture_final)
        update_target(target_net, policy_net)
        # The optimizer below only receives policy_net's parameters; the target
        # network is simply switched to evaluation mode
        target_net.eval()

        # Adam optimizer using the learning rate under test
        optimizer = optim.Adam(policy_net.parameters(), lr=lr)
        # Replay buffer created with buffer_size as its size argument
        memory = ReplayBuffer(buffer_size)

        # Duration (number of steps) of each episode in this run
        episode_durations = []

        for i_episode in range(num_episodes):
            # Progress report every 50 episodes
            if (i_episode+1) % 50 == 0:
                print("episode ", i_episode+1, "/", num_episodes)

            # Exploration restarts at 1 every episode and decays per step below
            EPSILON = 1
            observation, info = env.reset()
            state = torch.tensor(observation).float()

            # env.step returns (obs, reward, terminated, truncated, info):
            # `terminated` is a true terminal state, `truncated` a time limit
            terminated = False
            truncated = False
            t = 0
            while not (terminated or truncated):
                # Select and perform an epsilon-greedy action
                action = epsilon_greedy(EPSILON, policy_net, state)
                observation, reward, terminated, truncated, info = env.step(action)
                reward = torch.tensor([reward])
                action = torch.tensor([action])
                next_state = torch.tensor(observation).reshape(-1).float()
                # Store the transition; only the true-termination flag is saved
                memory.push([state, action, next_state, reward, torch.tensor([terminated])])

                # Move to the next state
                state = next_state

                # One optimisation step once the buffer can fill a batch
                if len(memory.buffer) >= batch_size:
                    transitions = memory.sample(batch_size)
                    # Stack each field of the sampled transitions into one tensor
                    state_batch, action_batch, nextstate_batch, reward_batch, dones = (torch.stack(x) for x in zip(*transitions))
                    mse_loss = loss(policy_net, target_net, state_batch, action_batch, reward_batch, nextstate_batch, dones)
                    optimizer.zero_grad()
                    mse_loss.backward()
                    optimizer.step()

                t = t + 1

                # Decay epsilon but never let it drop below min_epsilon, so
                # some exploration always remains
                EPSILON = max(min_epsilon, EPSILON * epsilon_decay)

            # The loop only exits once the episode is over: t is its duration
            episode_durations.append(t)

            # Refresh the target network from the policy network
            if i_episode % update_frequency == 0:
                update_target(target_net, policy_net)
        # Store this run's episode durations
        runs_results.append(episode_durations)
    learning_runs.append(runs_results)
print('Complete')

In [None]:
# ########## Save results
# import pickle
# with open ('learingRate.pkl', 'wb') as f:
#     pickle.dump(learning_runs, f) 

In [None]:
########### Plot results
# One mean learning curve per learning rate, with a band of +/- one standard
# deviation across runs. Labels are listed in the order the sweep stored its
# results in learning_runs.
lr_labels = ["LR = 0.001", "LR = 0.0025", "LR = 0.01", "LR = 0.1"]

for sweep_results, sweep_label in zip(learning_runs, lr_labels):
    # Rows are runs, columns are episodes
    sweep_tensor = torch.tensor(sweep_results).float()
    sweep_mean = sweep_tensor.mean(0)
    sweep_std = sweep_tensor.std(0)
    # Size the x-axis from the data instead of assuming 300 episodes
    episodes = np.arange(sweep_mean.shape[0])
    plt.plot(episodes, sweep_mean, label=sweep_label)
    plt.fill_between(episodes, sweep_mean-sweep_std, sweep_mean+sweep_std, alpha=0.2, color='dimgray')

plt.ylabel("Return")
plt.xlabel("Episode")

# Reference line at a return of 100
plt.axhline(y=100, linestyle='--', color='r')
plt.legend(loc="upper left")
plt.title("Different Learning Rate Learning Curves")
# plt.savefig("hana_architectures_DiffLR_1Update.jpg")
plt.show()

#### **Hyperparameter 6:** Batch Size

In [None]:
# Configuration for the batch-size sweep.
# Network layout and optimisation settings are held fixed...
architecture_final = [4, 64, 32, 2]
learning_rate = 0.0025
buffer_size = 60_000
update_frequency = 1

# ...while the batch size is the hyperparameter being varied
batch_size = [32, 64, 128, 256, 512]

# Exploration schedule
EPSILON = 1
epsilon_decay = 0.8
min_epsilon = 0.1

# Number of independent runs per batch size, and the container for their results
NUM_RUNS = 10
batch_runs = []

In [None]:
# Batch-size sweep: for every candidate batch size, train NUM_RUNS fresh agents
# and record the duration of each episode. All other hyperparameters keep the
# values set in the configuration cell above.
num_episodes = 300
env = gym.make('CartPole-v1')
for current_batch_size in batch_size:
    print('Batch sizes: ', current_batch_size)
    # Episode durations of every run trained with this batch size
    runs_results = []
    for run in range(NUM_RUNS):
        print(f"Starting run {run+1} of {NUM_RUNS}")
        # Fresh policy network, and a target network synchronised with it
        policy_net = DQN(architecture_final)
        target_net = DQN(architecture_final)
        update_target(target_net, policy_net)
        # The optimizer below only receives policy_net's parameters; the target
        # network is simply switched to evaluation mode
        target_net.eval()

        # Adam optimizer over the policy network's parameters
        optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
        # Replay buffer created with buffer_size as its size argument
        memory = ReplayBuffer(buffer_size)

        # Duration (number of steps) of each episode in this run
        episode_durations = []

        for i_episode in range(num_episodes):
            # Progress report every 50 episodes
            if (i_episode+1) % 50 == 0:
                print("episode ", i_episode+1, "/", num_episodes)

            # Exploration restarts at 1 every episode and decays per step below
            EPSILON = 1
            observation, info = env.reset()
            state = torch.tensor(observation).float()

            # env.step returns (obs, reward, terminated, truncated, info):
            # `terminated` is a true terminal state, `truncated` a time limit
            terminated = False
            truncated = False
            t = 0
            while not (terminated or truncated):
                # Select and perform an epsilon-greedy action
                action = epsilon_greedy(EPSILON, policy_net, state)
                observation, reward, terminated, truncated, info = env.step(action)
                reward = torch.tensor([reward])
                action = torch.tensor([action])
                next_state = torch.tensor(observation).reshape(-1).float()
                # Store the transition; only the true-termination flag is saved
                memory.push([state, action, next_state, reward, torch.tensor([terminated])])

                # Move to the next state
                state = next_state

                # One optimisation step once the buffer can fill a batch
                if len(memory.buffer) >= current_batch_size:
                    transitions = memory.sample(current_batch_size)
                    # Stack each field of the sampled transitions into one tensor
                    state_batch, action_batch, nextstate_batch, reward_batch, dones = (torch.stack(x) for x in zip(*transitions))
                    mse_loss = loss(policy_net, target_net, state_batch, action_batch, reward_batch, nextstate_batch, dones)
                    optimizer.zero_grad()
                    mse_loss.backward()
                    optimizer.step()

                t = t + 1

                # Decay epsilon but never let it drop below min_epsilon, so
                # some exploration always remains
                EPSILON = max(min_epsilon, EPSILON * epsilon_decay)

            # The loop only exits once the episode is over: t is its duration
            episode_durations.append(t)

            # Refresh the target network from the policy network
            if i_episode % update_frequency == 0:
                update_target(target_net, policy_net)
        # Store this run's episode durations
        runs_results.append(episode_durations)
    batch_runs.append(runs_results)
print('Complete')

In [None]:
# ######### Save results of batch size changing
# import pickle
# with open ('batchSizes.pkl', 'wb') as f:
#     pickle.dump(batch_runs, f) 

In [None]:
########## Plot results of batch
# One mean learning curve per batch size, with a band of +/- one standard
# deviation across runs. Labels are listed in the order the sweep stored its
# results in batch_runs.
batch_labels = ["Batch Size = 32", "Batch Size = 64", "Batch Size = 128",
                "Batch Size = 256", "Batch Size = 512"]

for sweep_results, sweep_label in zip(batch_runs, batch_labels):
    # Rows are runs, columns are episodes
    sweep_tensor = torch.tensor(sweep_results).float()
    sweep_mean = sweep_tensor.mean(0)
    sweep_std = sweep_tensor.std(0)
    # Size the x-axis from the data instead of assuming 300 episodes
    episodes = np.arange(sweep_mean.shape[0])
    plt.plot(episodes, sweep_mean, label=sweep_label)
    plt.fill_between(episodes, sweep_mean-sweep_std, sweep_mean+sweep_std, alpha=0.2, color='dimgray')

plt.ylabel("Return")
plt.xlabel("Episode")

# Reference line at a return of 100
plt.axhline(y=100, linestyle='--', color='r')
plt.legend(loc="upper left")
plt.title("Different Batch Size Learning Curves")
# plt.savefig("hana_architectures_DiffBatch_1Update.jpg")
plt.show()

#### **Part 1.3: Final Parameter Learning Curve**

In [None]:
# Final hyperparameters, as selected by the sweeps above.
# Network and optimisation settings
architecture_final = [4, 64, 32, 2]
learning_rate = 0.0025
batch_size = 128
buffer_size = 60_000
update_frequency = 1

# Exploration schedule
EPSILON = 1
epsilon_decay = 0.8
min_epsilon = 0.1

# Number of independent runs, and the container for their results
NUM_RUNS = 10
runs_results = []

In [None]:
# Final learning curve: train NUM_RUNS fresh agents with the chosen
# hyperparameters and record the duration of each episode.
num_episodes = 300
env = gym.make('CartPole-v1')
# Episode durations of every run; consumed by the plotting cell below
runs_results = []
for run in range(NUM_RUNS):
    print(f"Starting run {run+1} of {NUM_RUNS}")
    # Fresh policy network, and a target network synchronised with it
    policy_net = DQN(architecture_final)
    target_net = DQN(architecture_final)
    update_target(target_net, policy_net)
    # The optimizer below only receives policy_net's parameters; the target
    # network is simply switched to evaluation mode
    target_net.eval()

    # Adam optimizer over the policy network's parameters
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    # Replay buffer created with buffer_size as its size argument
    memory = ReplayBuffer(buffer_size)

    # Duration (number of steps) of each episode in this run
    episode_durations = []

    for i_episode in range(num_episodes):
        # Progress report every 50 episodes
        if (i_episode+1) % 50 == 0:
            print("episode ", i_episode+1, "/", num_episodes)

        # Exploration restarts at 1 every episode and decays per step below
        EPSILON = 1
        observation, info = env.reset()
        state = torch.tensor(observation).float()

        # env.step returns (obs, reward, terminated, truncated, info):
        # `terminated` is a true terminal state, `truncated` a time limit
        terminated = False
        truncated = False
        t = 0
        while not (terminated or truncated):
            # Select and perform an epsilon-greedy action
            action = epsilon_greedy(EPSILON, policy_net, state)
            observation, reward, terminated, truncated, info = env.step(action)
            reward = torch.tensor([reward])
            action = torch.tensor([action])
            next_state = torch.tensor(observation).reshape(-1).float()
            # Store the transition; only the true-termination flag is saved
            memory.push([state, action, next_state, reward, torch.tensor([terminated])])

            # Move to the next state
            state = next_state

            # One optimisation step once the buffer can fill a batch
            if len(memory.buffer) >= batch_size:
                transitions = memory.sample(batch_size)
                # Stack each field of the sampled transitions into one tensor
                state_batch, action_batch, nextstate_batch, reward_batch, dones = (torch.stack(x) for x in zip(*transitions))
                mse_loss = loss(policy_net, target_net, state_batch, action_batch, reward_batch, nextstate_batch, dones)
                optimizer.zero_grad()
                mse_loss.backward()
                optimizer.step()

            t = t + 1

            # Decay epsilon but never let it drop below min_epsilon, so some
            # exploration always remains
            EPSILON = max(min_epsilon, EPSILON * epsilon_decay)

        # The loop only exits once the episode is over: t is its duration
        episode_durations.append(t)

        # Refresh the target network from the policy network
        if i_episode % update_frequency == 0:
            update_target(target_net, policy_net)
    # Store this run's episode durations
    runs_results.append(episode_durations)
print('Complete')

In [None]:
# ######## Save results in a pickle file
# import pickle
# with open ('finalRun2.pkl', 'wb') as f:
#     pickle.dump(runs_results, f) 

In [None]:
# ######## Open pickle file of results
# import pickle
# with open('finalRun2.pkl', 'rb') as f:
#     final_run = pickle.load(f)

In [None]:
######### Plot the final run results
# Aggregate across runs: rows of `results` are runs, columns are episodes
results = torch.tensor(runs_results)
means, stds = results.float().mean(0), results.float().std(0)

# Figure annotations
plt.title("Final Model Learning Curve")
plt.xlabel("Episode")
plt.ylabel("Return")

# Mean curve with a band of one standard deviation on either side
plt.plot(torch.arange(300), means, label="Mean of Return")
plt.fill_between(np.arange(300), means-stds, means+stds, color='dimgray', alpha=0.2)

# Reference line at a return of 100
plt.axhline(y=100, color='r', linestyle='--')
plt.legend(loc="upper left")
# plt.savefig("hana_architectures_finalModel2.jpg")
plt.show()

## **Part 2: Visualizing DQN**
In this segment, we visualize the policy the agent has learned (which action it takes, and when) as a function of the pole's angle and angular velocity. To do this, we perform *one* training run of the DQN policy and target networks, then use the provided plotting code to display the resulting greedy policy and Q-values for different cart velocities.

#### **Training the Network once more**

In [None]:
# Train ONE agent with the final hyperparameters; the visualisation cells below
# read the resulting network from `policy_net`.
learning_rate = 0.0025
batch_size = 128
buffer_size = 60000
update_frequency = 1
architecture_final = [4, 64, 32, 2]
epsilon_decay = 0.8
EPSILON = 1
# Not used by this single-run cell; kept so the notebook state is unchanged
NUM_RUNS = 10
min_epsilon = 0.1
# NOTE: runs_results is deliberately not reset here. This cell never fills it,
# and clearing it would discard the Part 1.3 results.
num_episodes = 300

env = gym.make('CartPole-v1')
# Fresh policy network, and a target network synchronised with it
policy_net = DQN(architecture_final)
target_net = DQN(architecture_final)
update_target(target_net, policy_net)
# The optimizer below only receives policy_net's parameters; the target network
# is simply switched to evaluation mode
target_net.eval()

# Adam optimizer over the policy network's parameters
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
# Replay buffer created with buffer_size as its size argument
memory = ReplayBuffer(buffer_size)

# Duration (number of steps) of each episode
episode_durations = []

for i_episode in range(num_episodes):
    # Progress report every 50 episodes
    if (i_episode+1) % 50 == 0:
        print("episode ", i_episode+1, "/", num_episodes)

    # Exploration restarts at 1 every episode and decays per step below
    EPSILON = 1
    observation, info = env.reset()
    state = torch.tensor(observation).float()

    # env.step returns (obs, reward, terminated, truncated, info):
    # `terminated` is a true terminal state, `truncated` a time limit
    terminated = False
    truncated = False
    t = 0
    while not (terminated or truncated):
        # Select and perform an epsilon-greedy action
        action = epsilon_greedy(EPSILON, policy_net, state)
        observation, reward, terminated, truncated, info = env.step(action)
        reward = torch.tensor([reward])
        action = torch.tensor([action])
        next_state = torch.tensor(observation).reshape(-1).float()
        # Store the transition; only the true-termination flag is saved
        memory.push([state, action, next_state, reward, torch.tensor([terminated])])

        # Move to the next state
        state = next_state

        # One optimisation step once the buffer can fill a batch
        if len(memory.buffer) >= batch_size:
            transitions = memory.sample(batch_size)
            # Stack each field of the sampled transitions into one tensor
            state_batch, action_batch, nextstate_batch, reward_batch, dones = (torch.stack(x) for x in zip(*transitions))
            mse_loss = loss(policy_net, target_net, state_batch, action_batch, reward_batch, nextstate_batch, dones)
            optimizer.zero_grad()
            mse_loss.backward()
            optimizer.step()

        t = t + 1

        # Decay epsilon but never let it drop below min_epsilon, so some
        # exploration always remains
        EPSILON = max(min_epsilon, EPSILON * epsilon_decay)

    # The loop only exits once the episode is over: t is its duration
    episode_durations.append(t)

    # Refresh the target network from the policy network
    if i_episode % update_frequency == 0:
        update_target(target_net, policy_net)
print('Complete')

In [None]:
# ######## Saving results of one run
# import pickle
# with open ('oneRun.pkl', 'wb') as f:
#     pickle.dump(episode_durations, f) 

In [None]:
# ######## Saving policy of one run
# with open ('oneRunPolicy.pkl', 'wb') as f:
#     pickle.dump(policy_net, f)

In [None]:
# ######## Loading up the policy of one run
# with open('oneRunPolicy.pkl', 'rb') as f:
#     policy_net = pickle.load(f)

#### **Part 2.1: Visualizing Slices of Greedy Policy**

In [None]:
# Visualising the greedy policy for a cart in the middle of the track, for
# several cart velocities.
# Each panel is a 2D plot of the policy as a function of pole angle and
# angular velocity (omega).

# This plots the policy and Q values according to the network currently
# stored in the variable "policy_net"

# All visualisations provided here are placeholders and you can modify these plots

# Make sure to include appropriate labels and/or legends when presenting your plot

q = False    # whether q values or greedy policy is visualised

angle_range = .2095 # you may modify this range
omega_range = 5     # you may modify this range

angle_samples = 100
omega_samples = 100
angles = torch.linspace(angle_range, -angle_range, angle_samples)
omegas = torch.linspace(-omega_range, omega_range, omega_samples)
fig = plt.figure(figsize=(10,10))
plt.subplots_adjust(hspace=0.5)
velocities = [0, 0.5, 1, 2]
plt.suptitle("Greedy Policy Visualization for Different Velocities, Yellow = Right, Blue = Left")

# `panel` and `best_action` are used instead of `iter` and `greedy_action` so
# that neither the builtin nor the function imported from utils is overwritten
for panel, velo in enumerate(velocities):
    # One subplot per cart velocity
    ax = plt.subplot(3, 2, panel + 1)
    # Greedy Q-value and greedy action for every (angle, omega) grid point
    greedy_q_array = torch.zeros((angle_samples, omega_samples))
    policy_array = torch.zeros((angle_samples, omega_samples))
    # Pure inference: no gradients are needed for any of the forward passes
    with torch.no_grad():
        for i, angle in enumerate(angles):
            for j, omega in enumerate(omegas):
                # State: cart position 0, cart velocity, pole angle, pole angular velocity
                state = torch.tensor([0., velo, angle, omega])
                # The greedy action is the index of the largest Q-value
                q_vals = policy_net(state)
                best_action = q_vals.argmax()
                greedy_q_array[i, j] = q_vals[best_action]
                policy_array[i, j] = best_action
    # If Q value is what's being visualized, visualize with color bar
    if q:
        qValueFig = ax.contourf(angles, omegas, greedy_q_array.T, cmap='cividis', levels=100)
        fig.colorbar(qValueFig)
    # If greedy policy is what's being visualized, visualize the contour without colorbar
    else:
        policyFig = ax.contourf(angles, omegas, policy_array.T, cmap='cividis')

    ax.set_title("Cart velocity of {}".format(velo))
    ax.set_xlabel("pole angle")
    ax.set_ylabel("pole angular velocity")

#### **Part 2.2: Visualizing Slices of Q Function**

In [None]:
# Visualising the greedy Q-values for a cart in the middle of the track, for
# several cart velocities.
# Each panel is a 2D plot of the greedy Q-value as a function of pole angle and
# angular velocity (omega).

# This plots the policy and Q values according to the network currently
# stored in the variable "policy_net"

# All visualisations provided here are placeholders and you can modify these plots

# Make sure to include appropriate labels and/or legends when presenting your plot

q = True    # whether q values or greedy policy is visualised

angle_range = .2095 # you may modify this range
omega_range = 4     # you may modify this range

angle_samples = 100
omega_samples = 100
angles = torch.linspace(angle_range, -angle_range, angle_samples)
omegas = torch.linspace(-omega_range, omega_range, omega_samples)
fig = plt.figure(figsize=(10,10))
plt.subplots_adjust(hspace=0.5)
velocities = [0, 0.5, 1, 2]
# Colours here encode Q-values (see each panel's colour bar), not actions, so
# the title carries no left/right colour key
plt.suptitle("Greedy Q-Value Visualization for Different Velocities")

# `panel` and `best_action` are used instead of `iter` and `greedy_action` so
# that neither the builtin nor the function imported from utils is overwritten
for panel, velo in enumerate(velocities):
    # One subplot per cart velocity
    ax = plt.subplot(3, 2, panel + 1)
    # Greedy Q-value and greedy action for every (angle, omega) grid point
    greedy_q_array = torch.zeros((angle_samples, omega_samples))
    policy_array = torch.zeros((angle_samples, omega_samples))
    # Pure inference: no gradients are needed for any of the forward passes
    with torch.no_grad():
        for i, angle in enumerate(angles):
            for j, omega in enumerate(omegas):
                # State: cart position 0, cart velocity, pole angle, pole angular velocity
                state = torch.tensor([0., velo, angle, omega])
                # The greedy action is the index of the largest Q-value
                q_vals = policy_net(state)
                best_action = q_vals.argmax()
                greedy_q_array[i, j] = q_vals[best_action]
                policy_array[i, j] = best_action
    # If Q value is what's being visualized, visualize with color bar
    if q:
        qValueFig = ax.contourf(angles, omegas, greedy_q_array.T, cmap='cividis', levels=100)
        fig.colorbar(qValueFig)
    # If greedy policy is what's being visualized, visualize the contour without colorbar
    else:
        policyFig = ax.contourf(angles, omegas, policy_array.T, cmap='cividis')

    ax.set_title("Cart velocity of {}".format(velo))
    ax.set_xlabel("pole angle")
    ax.set_ylabel("pole angular velocity")


## **Part 3: Transforming DQN into DDQN**
In the final segment, we convert the DQN into a DDQN. This can be done by changing the loss function appropriately. Ultimately, what we want to do is to be able to decouple the process of picking an action from evaluating or estimating the value of that action, in order to prevent overestimation inherent to DQNs.
(Note that in my code, I compare the DDQN with the DQN using the finalRun2.pkl file, which contains the list of run results from the 10-run learning curve used in Part 1.3. Thus, to run the code without that file, please make sure to run Part 1.3 first, and change the variable in the last plotting cell from <i>dqn_runs</i> to <i>runs_results</i>.)

In [None]:
# DDQN experiment: same hyperparameters and training loop as the final DQN
# runs; the only difference is the extra flag passed to the loss function.
learning_rate = 0.0025
batch_size = 128
buffer_size = 60000
update_frequency = 1
architecture_final = [4, 64, 32, 2]
epsilon_decay = 0.8
EPSILON = 1
NUM_RUNS = 10
min_epsilon = 0.1
num_episodes = 300
# Flag passed to the loss function to request the DDQN variant; it never
# changes, so it is set once rather than on every environment step
ddqn = True
# New array to store DDQN run results
ddqn_runs = []

env = gym.make('CartPole-v1')
# For each run
for run in range(NUM_RUNS):
    print(f"Starting run {run+1} of {NUM_RUNS}")
    # Fresh policy network, and a target network synchronised with it
    policy_net = DQN(architecture_final)
    target_net = DQN(architecture_final)
    update_target(target_net, policy_net)
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    memory = ReplayBuffer(buffer_size)

    # Duration (number of steps) of each episode in this run
    episode_durations = []

    for i_episode in range(num_episodes):

        if (i_episode+1) % 50 == 0:
            print("episode ", i_episode+1, "/", num_episodes)

        # Reset epsilon at each episode, and observe the state from reset
        EPSILON = 1
        observation, info = env.reset()
        state = torch.tensor(observation).float()

        # env.step returns (obs, reward, terminated, truncated, info):
        # `terminated` is a true terminal state, `truncated` a time limit
        terminated = False
        truncated = False
        t = 0

        while not (terminated or truncated):
            # Select and perform an action from the policy network
            action = epsilon_greedy(EPSILON, policy_net, state)
            observation, reward, terminated, truncated, info = env.step(action)
            reward = torch.tensor([reward])
            action = torch.tensor([action])
            next_state = torch.tensor(observation).reshape(-1).float()
            # Store the transition; only the true-termination flag is saved
            memory.push([state, action, next_state, reward, torch.tensor([terminated])])

            # Move to the next state
            state = next_state

            # One optimisation step once the buffer can fill a batch
            if len(memory.buffer) >= batch_size:
                transitions = memory.sample(batch_size)
                state_batch, action_batch, nextstate_batch, reward_batch, dones = (torch.stack(x) for x in zip(*transitions))
                # Compute loss, this time with the ddqn flag
                mse_loss = loss(policy_net, target_net, state_batch, action_batch, reward_batch, nextstate_batch, dones, ddqn)
                # Optimize the model
                optimizer.zero_grad()
                mse_loss.backward()
                optimizer.step()

            t = t + 1

            # Decay epsilon but never let it drop below min_epsilon, so some
            # exploration always remains
            EPSILON = max(min_epsilon, EPSILON * epsilon_decay)

        # The loop only exits once the episode is over: t is its duration
        episode_durations.append(t)

        # Refresh the target network from the policy network
        if i_episode % update_frequency == 0:
            update_target(target_net, policy_net)

    # Append this run's episode durations to the DDQN runs
    ddqn_runs.append(episode_durations)

print('Completed runs')

In [None]:
# ######## Save results of the DDQN run
# import pickle
# with open ('ddqn2.pkl', 'wb') as f:
#     pickle.dump(ddqn_runs, f)

In [None]:
# ####### Load results of DQN final runs
# import pickle
# with open('finalRun2.pkl', 'rb') as f:
#     dqn_runs = pickle.load(f)

In [None]:
######### Plotting results of DQN vs DDQN
# NOTE: dqn_runs must exist before this cell runs. Either load it from
# finalRun2.pkl with the cell above, or assign the Part 1.3 results to it.
# Rows of each results tensor are runs, columns are episodes.
ddqn_results = torch.tensor(ddqn_runs)
ddqn_mean = ddqn_results.float().mean(0)
ddqn_std = ddqn_results.float().std(0)

results_DQN = torch.tensor(dqn_runs)
means_DQN = results_DQN.float().mean(0)
stds_DQN = results_DQN.float().std(0)

# Size each x-axis from its data instead of assuming 300 episodes
ddqn_episodes = np.arange(ddqn_mean.shape[0])
dqn_episodes = np.arange(means_DQN.shape[0])

plt.plot(ddqn_episodes, ddqn_mean, label="DDQN")
plt.plot(dqn_episodes, means_DQN, label="DQN")
plt.ylabel("Return")
plt.xlabel("Episode")
# One band per agent, spanning one standard deviation either side of the mean
plt.fill_between(ddqn_episodes, ddqn_mean-ddqn_std, ddqn_mean+ddqn_std, alpha=0.2, color='dimgray')
plt.fill_between(dqn_episodes, means_DQN-stds_DQN, means_DQN+stds_DQN, alpha=0.2, color='dimgray')
# Reference line at a return of 100
plt.axhline(y=100, linestyle='--', color='r')
plt.legend(loc="upper left")

plt.title("Learning curves for the DQN and DDQN")
# plt.savefig("ddqn_lc.jpg")
plt.show()
