In [None]:
from utils import DQN, ReplayBuffer, greedy_action, epsilon_greedy, update_target, loss

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import math
import numpy as np

import gym
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

In [None]:
# Experiment size: number of independent training runs, and episodes per run.
NUM_RUNS = 10
NUM_EPISODES = 300

# Exploration schedule: epsilon is reset to `epsilon_start` at the beginning of
# every run and multiplied by `epsilon_decay` once at the end of every episode.
epsilon_start = 1
epsilon_decay = 0.992

# Learning rate passed to the Adam optimizer.
alpha = 0.001

# Argument handed to DQN(...); presumably the layer widths
# [input, hidden, hidden, output] -- confirm against utils.DQN.
architecture = [4, 64, 64, 2]
# Number of transitions sampled from the replay buffer per optimisation step;
# no optimisation happens until the buffer holds at least this many.
batch_size = 32
# Argument handed to ReplayBuffer(...); presumably its capacity -- confirm against utils.
buffer_size = 20000
# The target network is synchronised every this many *episodes*.
target_net_update_freq = 4
# An optimizer step is attempted every this many *environment steps*.
optimizer_update_freq = 4

In [None]:
# Train a fresh DQN agent NUM_RUNS times and collect, for every run, the list
# of episode durations (number of environment steps per episode).
# After this cell, `runs_results` holds one list per run and `policy_net`
# refers to the network trained in the *last* run.
runs_results = []
env = gym.make('CartPole-v1')

# Running the algorithm for NUM_RUNS times
for run in range(NUM_RUNS):

    print(f"Starting run {run+1} of {NUM_RUNS}")

    # Initialize the policy and target network; the target starts in sync with
    # the policy network and is only ever used for inference.
    policy_net = DQN(architecture)
    target_net = DQN(architecture)
    update_target(target_net, policy_net)
    target_net.eval()

    # Initialize epsilon, optimizer and memory
    epsilon = epsilon_start
    optimizer = optim.Adam(policy_net.parameters(), lr=alpha)
    memory = ReplayBuffer(buffer_size)

    # Initialize the counters
    steps_done = 0          # environment steps taken so far in this run
    episode_durations = []  # one entry per finished episode

    # Running the episodes
    for i_episode in range(NUM_EPISODES):

        # Progress report every 50 episodes
        if (i_episode+1) % 50 == 0:
            print("episode ", i_episode+1, "/", NUM_EPISODES)

        # Reset the environment
        observation, info = env.reset()
        state = torch.tensor(observation, dtype=torch.float32).reshape(-1)
        terminated = False
        truncated = False
        t = 0

        # Running the steps until the episode ends for either reason
        while not (terminated or truncated):

            # Select an action
            action = epsilon_greedy(epsilon, policy_net, state)

            # Perform the action and observe new state and reward.
            # env.step returns (obs, reward, terminated, truncated, info):
            # `terminated` is a true terminal state, `truncated` a time-limit cut-off.
            observation, reward, terminated, truncated, info = env.step(action)
            reward = torch.tensor([reward])
            action = torch.tensor([action])
            next_state = torch.tensor(observation, dtype=torch.float32).reshape(-1)

            # Store the transition in memory. Only `terminated` is stored as the
            # terminal flag passed on to `loss`; a truncated episode is not
            # marked terminal.
            memory.push([state, action, next_state, reward, torch.tensor([terminated])])

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            if len(memory.buffer) >= batch_size and steps_done % optimizer_update_freq == 0:
                transitions = memory.sample(batch_size)
                # Transpose the list of transitions into one stacked tensor per field
                state_batch, action_batch, nextstate_batch, reward_batch, dones = (
                    torch.stack(x) for x in zip(*transitions)
                )
                # Compute loss
                mse_loss = loss(policy_net, target_net, state_batch, action_batch,
                                reward_batch, nextstate_batch, dones)
                # Optimize the model
                optimizer.zero_grad()
                mse_loss.backward()
                optimizer.step()

            # Update the step counters
            t += 1
            steps_done += 1

        # The episode lasted exactly `t` steps
        episode_durations.append(t)

        # Update the target network, copying all weights and biases in DQN
        if i_episode % target_net_update_freq == 0:
            update_target(target_net, policy_net)

        # Update epsilon
        epsilon *= epsilon_decay

    # Append the results of the run
    runs_results.append(episode_durations)

print('Complete')

In [None]:
# Plotting the learning curve
# Mean episode duration across runs, with a +/- one standard deviation band,
# plus the mean over the final episodes as a horizontal reference line.

# One row per run, one column per episode (every run records one duration per episode).
results = torch.tensor(runs_results).float()
means = results.mean(0)
stds = results.std(0)

# Average of the per-episode means over the final `final_window` episodes.
# The slice [-final_window:] includes the very last episode.
final_window = 100
average = means[-final_window:].mean().item()
print(f'Average reward over the last {final_window} episodes: {average}')

# Derive the x-axis from the data so the plot stays valid even if
# NUM_EPISODES was edited after training.
episodes = np.arange(results.shape[1])
means_np = means.numpy()
stds_np = stds.numpy()

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(episodes, means_np, linewidth=0.5, label='Mean')
ax.fill_between(episodes, means_np - stds_np, means_np + stds_np,
                alpha=0.3, color='b', label="Standard Deviation")
ax.axhline(y=100, color='r', linestyle='--', label="Return threshold", linewidth=0.8)
ax.axhline(y=average, color='g', linestyle='--', label="Mean final returns", linewidth=0.8)
ax.set_title("Learning Curve of the DQN agent", fontsize=14)
ax.set_ylabel("Return", fontsize=12)
ax.set_xlabel("Episode", fontsize=12)
ax.legend(loc="upper left", fontsize=10)
ax.grid(alpha=0.3)
plt.show()

In [None]:
# Visualising the greedy policy and greedy Q-values for a cart at position 0,
# for several cart velocities (see `velocities`).
# Each figure is a 2D map over pole angle (x-axis) and pole angular velocity,
# omega (y-axis).

# This plots the policy and Q values according to the network currently
# stored in the variable "policy_net" (the last one trained above).

# All visualisations provided here are placeholders and you can modify these plots

# Make sure to include appropriate labels and/or legends when presenting your plot

# policy_net = DQN([4,2])   # randomly initialised, replace with your trained DQN

angle_range = .2095 # you may modify this range
omega_range = 1     # you may modify this range

angle_samples = 100
omega_samples = 100
angles = torch.linspace(angle_range, -angle_range, angle_samples)
omegas = torch.linspace(-omega_range, omega_range, omega_samples)
velocities = [0, 0.5, 1, 2]

# Indexed [angle index, omega index]; fully overwritten for every velocity.
greedy_q_array = torch.zeros((angle_samples, omega_samples))
policy_array = torch.zeros((angle_samples, omega_samples))

for velocity in velocities:
    # Evaluate the network on the whole (angle, omega) grid
    for i, angle in enumerate(angles):
        for j, omega in enumerate(omegas):
            state = torch.tensor([0., velocity, float(angle), float(omega)])
            with torch.no_grad():
                q_vals = policy_net(state)
            # Named `best_action` so the `greedy_action` function imported
            # from utils is not overwritten.
            best_action = q_vals.argmax()
            greedy_q_array[i, j] = q_vals[best_action]
            policy_array[i, j] = best_action

    # Plotting the greedy policy
    # (arrays are transposed because contourf expects Z with shape (len(y), len(x)))
    plt.contourf(angles, omegas, policy_array.T, cmap='cividis')
    # Creating custom legend patches
    left_patch = mpatches.Patch(color='blue', label='Action 0')
    right_patch = mpatches.Patch(color='yellow', label='Action 1')
    plt.title(f"Policy for cart velocity = {velocity}")
    plt.xlabel("Pole angle")
    plt.ylabel("Pole angular velocity")
    plt.legend(handles=[left_patch, right_patch], loc='upper right')
    plt.show()

    # Plotting the greedy Q-values
    contour_q = plt.contourf(angles, omegas, greedy_q_array.T, cmap='cividis', levels=100)
    # Adding a colorbar as a legend
    colorbar = plt.colorbar(contour_q)
    colorbar.set_label('Magnitude')
    plt.title(f"Q-values for cart velocity = {velocity}")
    plt.xlabel("Pole angle")
    plt.ylabel("Pole angular velocity")
    plt.show()