# Environment setup

## Install necessary tools, libraries, etc.

In [1]:
# Refresh the package index, then install ImageMagick.
# `-y` answers the confirmation prompt up front: a notebook cell has no interactive
# stdin, so an unanswered prompt would stall or abort the install. `apt-get` is used
# for both steps because the `apt` front-end is not intended for scripted use.
!sudo apt-get update
!sudo apt-get install -y imagemagick

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [2,389 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [2,462 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-securi

In [37]:
# `%pip` installs into the environment of the running kernel. The extra is quoted so
# the shell cannot treat `[mujoco]` as a glob pattern; `gymnasium[mujoco]` already
# pulls in gymnasium itself, so it is not listed a second time.
%pip install -q pygame "gymnasium[mujoco]"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/211.8 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25h

## Import important libraries

In [49]:
from __future__ import annotations

import os
from collections import deque
from IPython.display import Image

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
from tqdm.notebook import tqdm
import pandas as pd
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.distributions.normal import Normal

In [39]:
# Seed NumPy's and PyTorch's global random number generators with the same value.
seed = 777
np.random.seed(seed)
torch.manual_seed(seed)

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cpu' if not torch.cuda.is_available() else 'cuda'
device

'cuda'

## Config

In [4]:
# Render mode handed to `gym.make` when an environment is created.
render_mode = 'rgb_array'

# Human-readable environment name -> environment id passed to `gym.make`.
env_id = {'Cart Pole': 'CartPole-v1', 'Frozen Lake': 'FrozenLake-v1', 'Taxi': 'Taxi-v3'}

In [5]:
# --- Episode budget ---
n_training_episodes = 30_000
n_eval_episodes = 100
max_steps = 99  # upper bound on the number of steps taken in one episode
eval_seed = range(n_eval_episodes)

# --- Q-learning update ---
lr = 0.05     # step size applied to the temporal-difference error
gamma = 0.95  # discount factor applied to the bootstrapped next-state value

# --- Epsilon schedule (exponential decay from max_epsilon towards min_epsilon) ---
min_epsilon = 0.05
max_epsilon = 1.0
decay_rate = 0.0005

# Experiment Setup

## Util

In [None]:
def save_frames_as_gif(frames, path='/content', filename='example.gif', fps=1):
    """Render a sequence of frames as an animation and save it to disk.

    The animation is difficult to observe on Colab, so each environment step is
    treated as one frame; the frames are combined and written to a .gif file that
    can be viewed easily afterwards.

    Note from the original author: using this is discouraged, because Colab
    collapses when the environment is asked to render a frame after several calls.

    Args:
        frames: Indexable sequence of images accepted by `plt.imshow`. The first
            frame's `.shape[0]` / `.shape[1]` size the figure (presumably the RGB
            arrays returned by `env.render()` -- confirm against the caller).
        path: Directory the file is written to.
        filename: Name of the output file inside `path`.
        fps: Frames per second passed to `Animation.save`.
    """
    first_frame = frames[0]
    # At 72 dpi one figure "inch" is 72 pixels, so the figure matches the frame size.
    fig = plt.figure(
        figsize=(first_frame.shape[1] / 72.0, first_frame.shape[0] / 72.0), dpi=72
    )
    patch = plt.imshow(first_frame)
    plt.axis('off')

    def animate(i):
        # Update the image artist. The original called `path.set_data`, i.e. a
        # method on the output-directory string, which raised AttributeError.
        patch.set_data(frames[i])

    anm = animation.FuncAnimation(fig, animate, frames=len(frames), interval=50)
    try:
        anm.save(os.path.join(path, filename), fps=fps)
    finally:
        # Release the figure even when saving fails, so figures do not pile up.
        plt.close(fig)


## Model

In [22]:
# Build the Frozen Lake environment using the ids and render mode from the config cells.
env = gym.make(env_id['Frozen Lake'], render_mode=render_mode)
# env = gym.wrappers.vector.ClipReward(env, min_reward=0.2, max_reward=0.8)

# Report the number of discrete states and actions, one per line.
print(
    f"The environment's observation space: {env.observation_space.n}",
    f"The environment's action space: {env.action_space.n}",
    sep="\n",
)

The environment's observation space: 16
The environment's action space: 4


In [34]:
# Show one randomly sampled action, then the (observation, info) pair returned by a
# seeded reset. Arguments are evaluated left to right, so the sample is drawn first.
print(env.action_space.sample(), env.reset(seed=10), sep='\n')

3
(0, {'prob': 1})


## Q-learning

Q-learning is a lookup-table-based approach: it keeps a Q-table with one entry (Q-value) per state-action pair, each estimating the expected discounted return of taking that action in that state. The Q-values are initialized to zero and iteratively improved as the agent interacts with the environment and receives feedback. After every step the algorithm applies the Bellman-based update $Q(s, a) \leftarrow Q(s, a) + \alpha\,[r + \gamma \max_{a'} Q(s', a') - Q(s, a)]$, where $\alpha$ is the learning rate and $\gamma$ is the discount factor, so that the Q-values converge toward the optimal ones. This approach is particularly interesting because it is the basis of Deep Q-Learning.

In [9]:
from typing import Union

def greedy_policy(q_table, state, epsilon:Union[float, None]=None):
    """Pick an action for `state` greedily or epsilon-greedily from a Q-table.

    Args:
        q_table: 2-D array indexed as `q_table[state, action]`.
        state: Row index of the current state.
        epsilon: Exploration threshold. `None` selects the purely greedy action.
            Otherwise one uniform draw from [0, 1) is made: if it is greater than
            `epsilon` the greedy action is taken, else an action index is drawn
            uniformly at random.

    Returns:
        The index of the chosen action.
    """
    # `is None` is an identity check; `== None` defers to the operand's `__eq__`
    # (for example, it is evaluated element-wise on NumPy arrays).
    # The uniform draw only happens when an epsilon is supplied (short-circuit).
    if epsilon is None or float(np.random.uniform(0, 1)) > epsilon:
        return np.argmax(q_table[state, :])
    return np.random.choice(q_table.shape[1])

In [None]:
def init_q_table(state_space, action_space):
    """Return a zero-filled Q-table of shape `(state_space, action_space)`."""
    return np.zeros(shape=(state_space, action_space))

In [None]:
# Tabular Q-learning on `env` with an exponentially decaying epsilon-greedy policy.
state_space = env.observation_space.n
action_space = env.action_space.n
q_table = init_q_table(state_space, action_space)
# Keep an independent snapshot of the untrained table. Plain assignment would only
# alias `q_table`, so the "before" table would silently follow every training update.
before_q_table = q_table.copy()
# NOTE(review): this initial value is never used -- the decay schedule below assigns
# epsilon at the start of every episode. Confirm whether a fixed epsilon was intended.
epsilon = None
q_learning_images_epsilon_none = []

for episode in tqdm(range(n_training_episodes)):
    # Decays from max_epsilon at episode 0 towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    state, info = env.reset()

    for step in range(max_steps):
        action = greedy_policy(q_table, state, epsilon=epsilon)
        new_state, reward, terminated, truncated, info = env.step(action)
        # Move Q(s, a) towards the target r + gamma * max_a' Q(s', a') by a step of lr.
        q_table[state, action] = q_table[state, action] + lr*(reward + gamma*np.max(q_table[new_state]) - q_table[state, action])
        # q_learning_images_epsilon_none.append(env.render())
        if terminated or truncated:
            break

        state = new_state


# NOTE(review): at this point `epsilon` holds the value from the last episode, not
# the starting value the message suggests.
print(f'Training with epsilon greedy initiates at {epsilon}')
print(f'Q-Table before:\n{before_q_table}')
print(f'Q-Table After:\n{q_table}')
# save_filename = 'q_learning_images_epsilon_none.gif'
# save_frames_as_gif (q_learning_images_epsilon_none,
#                     path='/content',
#                     filename=save_filename,
#                     fps =10)
# Image(open('demo.gif','rb').read ())

  0%|          | 0/30000 [00:00<?, ?it/s]

Training with epsilon greedy initiates at 0.05000029075254441
Q-Table before:
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Q-Table After:
[[  0.           0.           0.           0.           0.
    0.        ]
 [  0.74722768   2.27947784   0.75848367   1.95097893   5.20997639
   -7.13174657]
 [  6.02885382   7.32180602   2.93608228   6.41087599  10.9512375
   -1.30628473]
 ...
 [ -1.10432583  10.29976753  -1.08195157   0.26693797  -7.15287836
   -5.83983927]
 [ -2.66472386  -2.63862362  -2.63310423   4.544548   -10.14954683
   -8.97813663]
 [  1.36098806   3.31400494   1.87789431  17.96576465   0.59063024
   -1.86249964]]


In [None]:
# Tabular Q-learning on `env` with an exponentially decaying epsilon-greedy policy.
state_space = env.observation_space.n
action_space = env.action_space.n
q_table = init_q_table(state_space, action_space)
# Keep an independent snapshot of the untrained table. Plain assignment would only
# alias `q_table`, so the "before" table would silently follow every training update.
before_q_table = q_table.copy()
# NOTE(review): this initial value is never used -- the decay schedule below assigns
# epsilon at the start of every episode. Confirm whether a fixed epsilon was intended.
epsilon = 0.5
q_learning_images_epsilon_05 = []

for episode in tqdm(range(n_training_episodes)):
    # Decays from max_epsilon at episode 0 towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    state, info = env.reset()

    for step in range(max_steps):
        action = greedy_policy(q_table, state, epsilon=epsilon)
        new_state, reward, terminated, truncated, info = env.step(action)
        # Move Q(s, a) towards the target r + gamma * max_a' Q(s', a') by a step of lr.
        q_table[state, action] = q_table[state, action] + lr*(reward + gamma*np.max(q_table[new_state]) - q_table[state, action])
        # q_learning_images_epsilon_05.append(env.render())
        if terminated or truncated:
            break

        state = new_state


# NOTE(review): at this point `epsilon` holds the value from the last episode, not
# the starting value the message suggests.
print(f'Training with epsilon greedy initiates at {epsilon}')
print(f'Q-Table before:\n{before_q_table}')
print(f'Q-Table After:\n{q_table}')
# save_filename = 'q_learning_images_epsilon_05.gif'
# save_frames_as_gif (q_learning_images_epsilon_05,
#                     path='/content',
#                     filename=save_filename,
#                     fps =10)
# Image(open('demo.gif','rb').read ())

  0%|          | 0/30000 [00:00<?, ?it/s]

Training with epsilon greedy initiates at 0.05000029075254441
Q-Table before:
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Q-Table After:
[[  0.           0.           0.           0.           0.
    0.        ]
 [  0.66938787   2.15679811   0.57003801   1.01800549   5.20997639
   -6.50479762]
 [  6.38044456   8.04145112   5.14868489   7.25307631  10.9512375
   -1.76737692]
 ...
 [ -0.91842877  11.7060938   -0.92952945  -0.52418072  -5.26929502
   -5.21902859]
 [ -2.57920903  -2.7005574   -2.42568169   4.26247069 -10.11836076
  -10.01174279]
 [  2.83790166   2.04170501   1.92067735  17.94750897  -1.38158241
   -0.15347007]]


## REINFORCE

In [44]:
class Policy_Network(nn.Module):
    """Parametrized Gaussian policy network.

    A shared two-layer tanh MLP feeds two linear heads: one outputs the mean and the
    other the (softplus-transformed) standard deviation for each action dimension.
    """

    def __init__(self, obs_space_dims: int, action_space_dims: int):
        """Initializes a neural network that estimates the mean and standard deviation
        of a normal distribution from which an action is sampled.

        Args:
            obs_space_dims: Dimension of the observation space
            action_space_dims: Dimension of the action space
        """
        super().__init__()

        hidden_space1 = 16  # Nothing special with 16, feel free to change
        hidden_space2 = 32  # Nothing special with 32, feel free to change

        # Shared Network
        self.shared_net = nn.Sequential(
            nn.Linear(obs_space_dims, hidden_space1),
            nn.Tanh(),
            nn.Linear(hidden_space1, hidden_space2),
            nn.Tanh(),
        )

        # Policy Mean specific Linear Layer
        self.policy_mean_net = nn.Sequential(
            nn.Linear(hidden_space2, action_space_dims)
        )

        # Policy Std Dev specific Linear Layer
        self.policy_stddev_net = nn.Sequential(
            nn.Linear(hidden_space2, action_space_dims)
        )

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Conditioned on the observation, returns the mean and standard deviation
        of a normal distribution from which an action is sampled.

        Args:
            x: Observation from the environment (cast to float before use)

        Returns:
            action_means: predicted mean of the normal distribution
            action_stddevs: predicted standard deviation of the normal distribution
        """
        shared_features = self.shared_net(x.float())

        action_means = self.policy_mean_net(shared_features)
        # softplus(z) = log(1 + exp(z)) keeps the standard deviation non-negative.
        # The library implementation is used instead of the literal formula because
        # exp(z) overflows to inf for large z, which would yield an infinite stddev.
        action_stddevs = nn.functional.softplus(
            self.policy_stddev_net(shared_features)
        )

        return action_means, action_stddevs

In [45]:
class REINFORCE:
    """REINFORCE (Monte-Carlo policy-gradient) agent with a Gaussian policy."""

    def __init__(self, obs_space_dims: int, action_space_dims: int):
        """Initializes an agent that learns a policy via the REINFORCE algorithm
        to solve the task at hand (Inverted Pendulum v4).

        Args:
            obs_space_dims: Dimension of the observation space
            action_space_dims: Dimension of the action space
        """

        # Hyperparameters
        self.learning_rate = 1e-4  # Learning rate for policy optimization
        self.gamma = 0.99  # Discount factor
        self.eps = 1e-6  # small number for mathematical stability

        self.probs = []  # Stores log-probabilities of the sampled actions
        self.rewards = []  # Stores the corresponding rewards

        self.net = Policy_Network(obs_space_dims, action_space_dims)
        self.optimizer = torch.optim.AdamW(self.net.parameters(), lr=self.learning_rate)

    def sample_action(self, state: np.ndarray) -> np.ndarray:
        """Returns an action, conditioned on the policy and observation.

        The log-probability of the sampled action is appended to `self.probs` so
        that `update` can use it later.

        Args:
            state: Observation from the environment

        Returns:
            action: Action to be performed, as a NumPy array
        """
        # Wrap the observation in a leading batch dimension of size 1.
        state = torch.tensor(np.array([state]))
        action_means, action_stddevs = self.net(state)

        # create a normal distribution from the predicted
        #   mean and standard deviation and sample an action
        distrib = Normal(action_means[0] + self.eps, action_stddevs[0] + self.eps)
        action = distrib.sample()
        prob = distrib.log_prob(action)

        action = action.numpy()

        self.probs.append(prob)

        return action

    def update(self):
        """Updates the policy network's weights from the stored episode.

        Computes the discounted return of every step, minimizes the sum of
        `-log_prob * return`, and clears the episode buffers. When nothing was
        recorded the buffers are cleared and no optimizer step is taken.
        """
        # With no recorded step the loss would stay the int 0 and `loss.backward()`
        # would raise, so skip the optimizer step entirely.
        if not self.probs or not self.rewards:
            self.probs = []
            self.rewards = []
            return

        # Discounted returns: accumulate from the last step backwards, then flip
        # once. This is linear time, unlike inserting at the front of the list.
        running_g = 0
        gs = []
        for R in reversed(self.rewards):
            running_g = R + self.gamma * running_g
            gs.append(running_g)
        gs.reverse()

        deltas = torch.tensor(gs)

        loss = 0
        # minimize -1 * log_prob * return obtained
        for log_prob, delta in zip(self.probs, deltas):
            loss += log_prob.mean() * delta * (-1)

        # Update the policy network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Empty / zero out all episode-centric/related variables
        self.probs = []
        self.rewards = []


In [None]:
# Create and wrap the environment
# Create and wrap the environment
env = gym.make("InvertedPendulum-v4")
wrapped_env = gym.wrappers.RecordEpisodeStatistics(env, 50)  # Records episode-reward

total_num_episodes = int(5e3)  # Total number of episodes
# Observation-space of InvertedPendulum-v4 (4)
obs_space_dims = env.observation_space.shape[0]
# Action-space of InvertedPendulum-v4 (1)
action_space_dims = env.action_space.shape[0]
rewards_over_seeds = []


agent = REINFORCE(obs_space_dims, action_space_dims)
reward_over_episodes = []

for episode in range(total_num_episodes):
    # Seed the environment on the first reset only. Passing the same seed on every
    # reset re-initializes the environment's RNG each time, so every episode would
    # start from the identical initial state.
    obs, info = wrapped_env.reset(seed=seed if episode == 0 else None)
    done = False
    while not done:
        action = agent.sample_action(obs)

        # Step return type - `tuple[ObsType, SupportsFloat, bool, bool, dict[str, Any]]`
        # These represent the next observation, the reward from the step,
        # if the episode is terminated, if the episode is truncated and
        # additional info from the step
        obs, reward, terminated, truncated, info = wrapped_env.step(action)
        agent.rewards.append(reward)

        # End the episode when either truncated or terminated is true
        #  - truncated: The episode duration reaches max number of timesteps
        #  - terminated: Any of the state space values is no longer finite.
        done = terminated or truncated

    # The wrapper appends the finished episode's return to `return_queue`.
    reward_over_episodes.append(wrapped_env.return_queue[-1])
    agent.update()

    if episode % 1000 == 0:
        # Mean over the returns currently held in the wrapper's queue.
        avg_reward = int(np.mean(wrapped_env.return_queue))
        print("Episode:", episode, "Average Reward:", avg_reward)

rewards_over_seeds.append(reward_over_episodes)

Episode: 0 Average Reward: 6
Episode: 1000 Average Reward: 17
Episode: 2000 Average Reward: 78
Episode: 3000 Average Reward: 190


In [None]:
# Each recorded return may be a scalar or a length-1 array depending on the installed
# Gymnasium version; `np.ravel(...)[0]` reads the value in both cases, whereas plain
# `reward[0]` fails on scalars.
rewards_to_plot = [
    [float(np.ravel(reward)[0]) for reward in rewards]
    for rewards in rewards_over_seeds
]
# One row per (run, episode): `melt` turns the episode columns into long format.
df1 = (
    pd.DataFrame(rewards_to_plot)
    .melt()
    .rename(columns={"variable": "episodes", "value": "reward"})
)
sns.set(style="darkgrid", context="talk", palette="rainbow")
sns.lineplot(x="episodes", y="reward", data=df1).set(
    title="REINFORCE for InvertedPendulum-v4"
)
plt.show()