In [12]:
!pip3 install gymnasium[classic-control]
!pip3 install renderlab
!pip3 install opencv-python
!pip install gymnasium[box2d]


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [2]:
from __future__ import annotations

from collections import defaultdict
import pickle
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.patches import Patch
from tqdm import tqdm

import gymnasium as gym
import renderlab as rl

error: XDG_RUNTIME_DIR not set in the environment.


In [3]:
def zerofactory():
    """Default factory for the Q-table: a zero vector with one slot per action.

    Kept as a module-level function (rather than a lambda) so that a
    ``defaultdict`` using it can be pickled. Reads the action count from the
    module-level ``env``, which therefore has to exist before the first call.
    """
    n_actions = env.action_space.n
    return np.zeros(n_actions)

class MoonlanderAgent:
    """Tabular, epsilon-greedy Q-learning agent.

    Q-values live in a ``defaultdict`` keyed by hashable observations; each
    entry is an array holding one value per action and is created on first
    access by ``zerofactory``.

    Note: exploratory actions are sampled from the module-level ``env`` (and
    ``zerofactory`` reads it too), so ``env`` must be defined before the agent
    is used.
    """

    def __init__(
        self,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ):
        """Initialize a Reinforcement Learning agent with an empty dictionary
        of state-action values (q_values), a learning rate and an epsilon.

        Args:
            learning_rate: The learning rate
            initial_epsilon: The initial epsilon value
            epsilon_decay: The amount subtracted from epsilon by each call to
                ``decay_epsilon``
            final_epsilon: The final (minimum) epsilon value
            discount_factor: The discount factor for computing the Q-value
        """
        self.q_values = defaultdict(zerofactory)

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        # One temporal-difference error per call to `update`.
        self.training_error = []

    def get_action(self, obs: tuple[float, ...]) -> int:
        """Choose an action for ``obs`` with an epsilon-greedy policy.

        Args:
            obs: Hashable (already discretized) observation used as Q-table key.

        Returns:
            A random action with probability ``epsilon`` (exploration),
            otherwise the action with the highest stored Q-value (exploitation).
        """
        # with probability epsilon return a random action to explore the environment
        if np.random.random() < self.epsilon:
            return env.action_space.sample()

        # with probability (1 - epsilon) act greedily (exploit)
        return int(np.argmax(self.q_values[obs]))

    def update(
        self,
        obs: tuple[float, ...],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[float, ...],
    ):
        """Apply one Q-learning update for the transition (obs, action) -> next_obs.

        Args:
            obs: Observation the action was taken in.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            terminated: Whether ``next_obs`` is terminal; if so, no future
                value is bootstrapped from it.
            next_obs: Observation reached after the action.
        """
        # A terminal state has no future return, so its bootstrap value is 0.
        future_q_value = (not terminated) * np.max(self.q_values[next_obs])
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[obs][action]
        )

        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_difference
        )
        # Store a plain Python float: this list gets one entry per environment
        # step, and NumPy scalars are larger in memory and far more expensive
        # to pickle when the agent is checkpointed.
        self.training_error.append(float(temporal_difference))

    def decay_epsilon(self):
        """Lower epsilon by ``self.epsilon_decay``, never going below ``final_epsilon``."""
        # Use the agent's own decay value; relying on a module-level
        # `epsilon_decay` breaks as soon as that global is missing or differs.
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)

def load(path):
    """Read the file at ``path`` and return the object pickled inside it.

    Unpickling can execute arbitrary code, so only use this on files that
    were written by a trusted source (e.g. this notebook's ``save``).
    """
    with open(path, "br") as handle:
        restored = pickle.load(handle)
    return restored

def discretize(obs, decimal):
    """Round every component of ``obs`` to ``decimal`` decimal places.

    Used to map continuous observations onto a coarse grid so they can serve
    as keys of a tabular Q-function.
    """
    rounded = np.around(obs, decimals=decimal)
    return rounded

def save(obj, path):
    """Pickle ``obj`` to ``path`` without ever leaving a half-written file there.

    The data is first written to a sibling ``<path>.tmp`` file and then moved
    over ``path`` with ``os.replace``. If pickling fails part-way (for example
    with a ``MemoryError`` on a very large agent), the previous contents of
    ``path`` are left untouched and the partial temp file is removed.

    Args:
        obj: Any picklable object.
        path: Destination file (``str``, ``bytes`` or ``os.PathLike``).

    Raises:
        Whatever ``open`` / ``pickle.dump`` / ``os.replace`` raise; the error
        is re-raised after cleanup.
    """
    target = os.fspath(path)
    tmp_path = target + (b".tmp" if isinstance(target, bytes) else ".tmp")
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(obj, f)
        # Same-directory rename: swaps in the complete file in one step.
        os.replace(tmp_path, target)
    except BaseException:
        # Drop the incomplete temp file, then let the original error surface.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass
        raise


In [4]:
# --- Environment -------------------------------------------------------------
env = gym.make(
    "LunarLander-v2",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)

# --- Hyperparameters ---------------------------------------------------------
learning_rate = 0.01
n_episodes = 100000
start_epsilon = 1.0
epsilon_decay = start_epsilon / (n_episodes / 2)  # reduce the exploration over time
final_epsilon = 0.1
num_chunks = 10
chunk_length = n_episodes // num_chunks
rewards = np.zeros(chunk_length)  # ring buffer with the returns of the current chunk

CHECKPOINT_PATH = "./checkpoint-Moonlander"
OBS_DECIMALS = 1  # resolution of the discretization grid used for Q-table keys


def to_state(obs):
    """Discretize ``obs`` and return it as a hashable tuple of plain Python floats.

    ``.tolist()`` converts the NumPy scalars to built-in floats. The resulting
    tuples compare and hash like tuples of the original NumPy scalars, but
    built-in floats add no per-element pickle-memo entries when the Q-table
    is checkpointed.
    """
    return tuple(discretize(obs, OBS_DECIMALS).tolist())


def run_episode(env, agent, learn):
    """Play one episode with ``agent`` and return the final ``info`` dict.

    Args:
        env: Environment (possibly wrapped) to interact with.
        agent: Agent providing ``get_action`` and ``update``.
        learn: If true, the agent's Q-values are updated after every step.
    """
    obs, info = env.reset()
    state = to_state(obs)
    done = False
    while not done:
        action = agent.get_action(state)
        next_obs, reward, terminated, truncated, info = env.step(action)
        next_state = to_state(next_obs)
        if learn:
            agent.update(state, action, reward, terminated, next_state)
        # The episode ends on a terminal state or when the time limit truncates it.
        done = terminated or truncated
        state = next_state
    return info


agent = MoonlanderAgent(
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)
#agent = load("./checkpoint-Moonlander")

# RecordEpisodeStatistics adds info["episode"]["r"] (the episode return) on the last step.
env = gym.wrappers.RecordEpisodeStatistics(env)
try:
    # --- Training ------------------------------------------------------------
    for episode in tqdm(range(n_episodes)):
        info = run_episode(env, agent, learn=True)
        agent.decay_epsilon()
        rewards[episode % chunk_length] = info["episode"]["r"]

        # Report once a chunk is *complete*, so each window holds exactly the
        # episodes of that chunk and the final chunk is reported as well.
        if (episode + 1) % chunk_length == 0:
            print(f"average reward over the last {chunk_length}: {np.average(rewards)}")
            print(f"Max reward over the last {chunk_length}: {np.max(rewards)}")
            # The final state is saved right after the loop; skip the duplicate dump.
            if episode + 1 < n_episodes:
                save(agent, CHECKPOINT_PATH)

    save(agent, CHECKPOINT_PATH)

    # --- Recorded evaluation -------------------------------------------------
    # NOTE: the agent still acts epsilon-greedily here (epsilon has decayed to
    # final_epsilon), so a fraction of the recorded actions is random.
    env = rl.RenderFrame(env, "./out")
    for _ in range(5):
        run_episode(env, agent, learn=False)
finally:
    # Release the environment even if training aborts (e.g. MemoryError).
    env.close()


 10%|███▍                               | 9999/100000 [14:51<2:17:01, 10.95it/s]

average reward over the last 10000: -163.83796710338592
Max reward over the last 10000: 105.265380859375


 20%|██████▊                           | 19999/100000 [33:07<2:25:26,  9.17it/s]

average reward over the last 10000: -159.49313065757752
Max reward over the last 10000: 124.91085815429688


 30%|██████████▏                       | 30000/100000 [55:54<4:05:54,  4.74it/s]

average reward over the last 10000: -174.69609139289855
Max reward over the last 10000: 119.8389892578125


 40%|████████████▊                   | 40000/100000 [1:24:09<2:25:56,  6.85it/s]

average reward over the last 10000: -185.63490905685424
Max reward over the last 10000: 110.0551528930664


 40%|████████████▊                   | 40000/100000 [1:27:39<2:11:29,  7.60it/s]


MemoryError: 