## Initialization

In [1]:
 # Requires restart
!pip install "numpy<1.24"

Collecting numpy<1.24
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.25.2
    Uninstalling numpy-1.25.2:
      Successfully uninstalled numpy-1.25.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chex 0.1.86 requires numpy>=1.24.1, but you have numpy 1.23.5 which is incompatible.
pandas-stubs 2.0.3.230814 requires numpy>=1.25.0; python_version >= "3.9", but you have numpy 1.23.5 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.23.5


In [2]:
!pip install swig
!pip install gymnasium[box2d]
!pip install stable-baselines3

Collecting stable-baselines3
  Downloading stable_baselines3-2.3.0-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.1/182.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13->stable-baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13->stable-baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13->stable-baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━

In [3]:
import math
from math import atan2, cos, sin, sqrt
from typing import Optional, Union

import gymnasium as gym
from gymnasium import spaces
from gymnasium.envs.box2d.car_dynamics import Car
from gymnasium.error import DependencyNotInstalled, InvalidAction
from gymnasium.utils import EzPickle

import csv
import time
import os

SEED = 0

import numpy as np
import torch
import torch.nn as nn
from torch.nn.functional import relu

from stable_baselines3 import PPO, SAC, A2C, DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.logger import configure
from stable_baselines3.common.monitor import Monitor

import matplotlib.pyplot as plt
import pandas as pd


torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


##Environment

In [4]:
__credits__ = ["Andrea PIERRÉ"]


try:
    import Box2D
    from Box2D.b2 import contactListener, fixtureDef, polygonShape
except ImportError as e:
    raise DependencyNotInstalled(
        "Box2D is not installed, run `pip install gymnasium[box2d]`"
    ) from e

try:
    # As pygame is necessary for using the environment (reset and step) even without a render mode
    #   therefore, pygame is a necessary import for the environment.
    import pygame
    from pygame import gfxdraw
except ImportError as e:
    raise DependencyNotInstalled(
        "pygame is not installed, run `pip install gymnasium[box2d]`"
    ) from e


STATE_W = 96  # less than Atari 160x192
STATE_H = 96
VIDEO_W = 600
VIDEO_H = 400
WINDOW_W = 800
WINDOW_H = 600

SCALE = 6.0  # Track scale
TRACK_RAD = 900 / SCALE  # Track is heavily morphed circle with this radius
PLAYFIELD = 2000 / SCALE  # Game over boundary
FPS = 50  # Frames per second
ZOOM = 2.7  # Camera zoom
ZOOM_FOLLOW = True  # Set to False for fixed view (don't use zoom)


TRACK_DETAIL_STEP = 21 / SCALE
TRACK_TURN_RATE = 0.31
TRACK_WIDTH = 40 / SCALE
BORDER = 8 / SCALE
BORDER_MIN_COUNT = 4
GRASS_DIM = PLAYFIELD / 20.0
MAX_SHAPE_DIM = (
    max(GRASS_DIM, TRACK_WIDTH, TRACK_DETAIL_STEP) * math.sqrt(2) * ZOOM * SCALE
)

#Prev errors for CTE variance calc
NUM_PREV_ERRORS = 20

# Reward constants
CTE_RESCALE = 200
REWARD_VSHIFT = 10
OFF_ROAD_PENALTY = -1000

VARIABLE_SPEED ={"On" : True, "min_speed": 30, "max_speed": 70}

'''https://www.desmos.com/calculator/dtotkkusih'''

class FrictionDetector(contactListener):
    def __init__(self, env, lap_complete_percent):
        contactListener.__init__(self)
        self.env = env
        self.lap_complete_percent = lap_complete_percent

    def BeginContact(self, contact):
        self._contact(contact, True)

    def EndContact(self, contact):
        self._contact(contact, False)

    def _contact(self, contact, begin):
        tile = None
        obj = None
        u1 = contact.fixtureA.body.userData
        u2 = contact.fixtureB.body.userData
        if u1 and "road_friction" in u1.__dict__:
            tile = u1
            obj = u2
        if u2 and "road_friction" in u2.__dict__:
            tile = u2
            obj = u1
        if not tile:
            return

        # inherit tile color from env
        tile.color[:] = self.env.road_color
        if not obj or "tiles" not in obj.__dict__:
            return
        if begin:
            obj.tiles.add(tile)
            if not tile.road_visited:
                tile.road_visited = True
                self.env.reward += 0 #/ len(self.env.track)
                self.env.tile_visited_count += 1

                # Lap is considered completed if enough % of the track was covered
                if (#tile.idx == 0 and
                    (self.env.tile_visited_count / len(self.env.track)) > self.lap_complete_percent):
                    self.env.new_lap = True
        else:
            obj.tiles.remove(tile)


class CarRacing(gym.Env, EzPickle):
    """
    ## Description
    The easiest control task to learn from pixels - a top-down
    racing environment. The generated track is random every episode.

    Some indicators are shown at the bottom of the window along with the
    state RGB buffer. From left to right: true speed, four ABS sensors,
    steering wheel position, and gyroscope.
    To play yourself (it's rather fast for humans), type:
    ```shell
    python gymnasium/envs/box2d/car_racing.py
    ```
    Remember: it's a powerful rear-wheel drive car - don't press the accelerator
    and turn at the same time.

    ## Action Space
    If continuous there are 3 actions :
    - 0: steering, -1 is full left, +1 is full right
    - 1: gas
    - 2: breaking

    If discrete there are 5 actions:
    - 0: do nothing
    - 1: steer left
    - 2: steer right
    - 3: gas
    - 4: brake

    ## Observation Space

    A top-down 96x96 RGB image of the car and race track.

    ## Rewards
    The reward is -0.1 every frame and +1000/N for every track tile visited, where N is the total number of tiles
     visited in the track. For example, if you have finished in 732 frames, your reward is 1000 - 0.1*732 = 926.8 points.

    ## Starting State
    The car starts at rest in the center of the road.

    ## Episode Termination
    The episode finishes when all the tiles are visited. The car can also go outside the playfield -
     that is, far off the track, in which case it will receive -100 reward and die.

    ## Arguments

    ```python
    >>> import gymnasium as gym
    >>> env = gym.make("CarRacing-v2", render_mode="rgb_array", lap_complete_percent=0.95, domain_randomize=False, continuous=False)
    >>> env
    <TimeLimit<OrderEnforcing<PassiveEnvChecker<CarRacing<CarRacing-v2>>>>>

    ```

    * `lap_complete_percent=0.95` dictates the percentage of tiles that must be visited by
     the agent before a lap is considered complete.

    * `domain_randomize=False` enables the domain randomized variant of the environment.
     In this scenario, the background and track colours are different on every reset.

    * `continuous=True` converts the environment to use discrete action space.
     The discrete action space has 5 actions: [do nothing, left, right, gas, brake].

    ## Reset Arguments

    Passing the option `options["randomize"] = True` will change the current colour of the environment on demand.
    Correspondingly, passing the option `options["randomize"] = False` will not change the current colour of the environment.
    `domain_randomize` must be `True` on init for this argument to work.

    ```python
    >>> import gymnasium as gym
    >>> env = gym.make("CarRacing-v2", domain_randomize=True)

    # normal reset, this changes the colour scheme by default
    >>> obs, _ = env.reset()

    # reset with colour scheme change
    >>> randomize_obs, _ = env.reset(options={"randomize": True})

    # reset with no colour scheme change
    >>> non_random_obs, _ = env.reset(options={"randomize": False})

    ```

    ## Version History
    - v1: Change track completion logic and add domain randomization (0.24.0)
    - v0: Original version

    ## References
    - Chris Campbell (2014), http://www.iforce2d.net/b2dtut/top-down-car.

    ## Credits
    Created by Oleg Klimov
    """

    metadata = {
        "render_modes": [
            "human",
            "rgb_array",
            "state_pixels",
        ],
        "render_fps": FPS,
    }

    def __init__(
        self,
        render_mode: Optional[str] = None,
        verbose: bool = False,
        lap_complete_percent: float = 1.0,
        domain_randomize: bool = False,
        continuous: bool = True,
        constant_speed = 50,
        num_prev_errors = NUM_PREV_ERRORS
    ):
        EzPickle.__init__(
            self,
            render_mode,
            verbose,
            lap_complete_percent,
            domain_randomize,
            continuous,
        )
        self.continuous = continuous
        self.domain_randomize = domain_randomize
        self.lap_complete_percent = lap_complete_percent
        self._init_colors()


        self.center_line = []
        self._max_episode_steps = 2500
        self.episode_steps = 0
        self.constant_speed = constant_speed
        self.prev_errors = [0 for _ in range(num_prev_errors)]
        self.road_half_width = 7
        self.placeholder = 0


        self.contactListener_keepref = FrictionDetector(self, self.lap_complete_percent)
        self.world = Box2D.b2World((0, 0), contactListener=self.contactListener_keepref)
        self.screen: Optional[pygame.Surface] = None
        self.surf = None
        self.clock = None
        self.isopen = True
        self.invisible_state_window = None
        self.invisible_video_window = None
        self.road = None
        self.car: Optional[Car] = None
        self.reward = 0.0
        self.prev_reward = 0.0
        self.verbose = verbose
        self.new_lap = False
        self.fd_tile = fixtureDef(
            shape=polygonShape(vertices=[(0, 0), (1, 0), (1, -1), (0, -1)])
        )

        # This will throw a warning in tests/envs/test_envs in utils/env_checker.py as the space is not symmetric
        #   or normalised however this is not possible here so ignore
        if self.continuous:
            self.action_space = spaces.Box(
                np.array([-1]).astype(np.float64),
                np.array([+1]).astype(np.float64),
                seed = SEED

                #np.array([-1, 0, 0]).astype(np.float32),
                #np.array([+1, +1, +1]).astype(np.float32),
            )  # steer, gas, brake
        else:
            self.action_space = spaces.Discrete(2)
            # only steer left and right

        # obseravtion: [error_heading, CTE, Speed]
        if not VARIABLE_SPEED['On']:
            self.observation_space = spaces.Box(
                np.array([ -1, -1]).astype(np.float64),
                np.array([+1, +1]).astype(np.float64), seed = SEED)
        else:
            self.observation_space = spaces.Box(
                np.array([ -1, -1, 0]).astype(np.float64),
                np.array([+1, +1, 1]).astype(np.float64), seed = SEED)


        '''
            np.array([ -math.pi, -WINDOW_W, -2*self.road_half_width]).astype(np.float64),
            np.array([+math.pi, +WINDOW_W, +2*self.road_half_width]).astype(np.float64), seed = SEED)'''

        self.render_mode = render_mode

    def _destroy(self):
        if not self.road:
            return
        for t in self.road:
            self.world.DestroyBody(t)
        self.road = []
        assert self.car is not None
        self.car.destroy()

    def _init_colors(self):
        if self.domain_randomize:
            # domain randomize the bg and grass colour
            self.road_color = self.np_random.uniform(0, 210, size=3)

            self.bg_color = self.np_random.uniform(0, 210, size=3)

            self.grass_color = np.copy(self.bg_color)
            idx = self.np_random.integers(3)
            self.grass_color[idx] += 20
        else:
            # default colours
            self.road_color = np.array([57, 64, 83])
            self.bg_color = np.array([110, 99, 98])
            self.grass_color = np.array([131, 144, 115])

    def _reinit_colors(self, randomize):
        assert (
            self.domain_randomize
        ), "domain_randomize must be True to use this function."

        if randomize:
            # domain randomize the bg and grass colour
            self.road_color = self.np_random.uniform(0, 210, size=3)

            self.bg_color = self.np_random.uniform(0, 210, size=3)

            self.grass_color = np.copy(self.bg_color)
            idx = self.np_random.integers(3)
            self.grass_color[idx] += 20

    def _create_track(self):
        CHECKPOINTS = 12

        # Create checkpoints
        checkpoints = []
        for c in range(CHECKPOINTS):
            noise = self.np_random.uniform(0, 2 * math.pi * 1 / CHECKPOINTS)
            alpha = 2 * math.pi * c / CHECKPOINTS + noise
            rad = self.np_random.uniform(TRACK_RAD / 3, TRACK_RAD)

            if c == 0:
                alpha = 0
                rad = 1.5 * TRACK_RAD
            if c == CHECKPOINTS - 1:
                alpha = 2 * math.pi * c / CHECKPOINTS
                self.start_alpha = 2 * math.pi * (-0.5) / CHECKPOINTS
                rad = 1.5 * TRACK_RAD

            checkpoints.append((alpha, rad * math.cos(alpha), rad * math.sin(alpha)))
        self.road = []

        # Go from one checkpoint to another to create track
        x, y, beta = 1.5 * TRACK_RAD, 0, 0

        dest_i = 0
        laps = 0
        track = []
        no_freeze = 2500
        visited_other_side = False
        while True:
            alpha = math.atan2(y, x)
            if visited_other_side and alpha > 0:
                laps += 1
                visited_other_side = False
            if alpha < 0:
                visited_other_side = True
                alpha += 2 * math.pi

            while True:  # Find destination from checkpoints
                failed = True

                while True:
                    dest_alpha, dest_x, dest_y = checkpoints[dest_i % len(checkpoints)]
                    if alpha <= dest_alpha:
                        failed = False
                        break
                    dest_i += 1
                    if dest_i % len(checkpoints) == 0:
                        break

                if not failed:
                    break

                alpha -= 2 * math.pi
                continue

            r1x = math.cos(beta)
            r1y = math.sin(beta)
            p1x = -r1y
            p1y = r1x
            dest_dx = dest_x - x  # vector towards destination
            dest_dy = dest_y - y
            # destination vector projected on rad:
            proj = r1x * dest_dx + r1y * dest_dy
            while beta - alpha > 1.5 * math.pi:
                beta -= 2 * math.pi
            while beta - alpha < -1.5 * math.pi:
                beta += 2 * math.pi
            prev_beta = beta
            proj *= SCALE
            if proj > 0.3:
                beta -= min(TRACK_TURN_RATE, abs(0.001 * proj))
            if proj < -0.3:
                beta += min(TRACK_TURN_RATE, abs(0.001 * proj))
            x += p1x * TRACK_DETAIL_STEP
            y += p1y * TRACK_DETAIL_STEP
            track.append((alpha, prev_beta * 0.5 + beta * 0.5, x, y))

            self.center_line.append((x,y))

            if laps > 4:
                break
            no_freeze -= 1
            if no_freeze == 0:
                break

        # Find closed loop range i1..i2, first loop should be ignored, second is OK
        i1, i2 = -1, -1
        i = len(track)
        while True:
            i -= 1
            if i == 0:
                return False  # Failed
            pass_through_start = (
                track[i][0] > self.start_alpha and track[i - 1][0] <= self.start_alpha
            )
            if pass_through_start and i2 == -1:
                i2 = i
            elif pass_through_start and i1 == -1:
                i1 = i
                break
        if self.verbose:
            print("Track generation: %i..%i -> %i-tiles track" % (i1, i2, i2 - i1))
        assert i1 != -1
        assert i2 != -1

        track = track[i1 : i2 - 1]

        first_beta = track[0][1]
        first_perp_x = math.cos(first_beta)
        first_perp_y = math.sin(first_beta)
        # Length of perpendicular jump to put together head and tail
        well_glued_together = np.sqrt(
            np.square(first_perp_x * (track[0][2] - track[-1][2]))
            + np.square(first_perp_y * (track[0][3] - track[-1][3]))
        )
        if well_glued_together > TRACK_DETAIL_STEP:
            return False

        # Red-white border on hard turns
        border = [False] * len(track)
        for i in range(len(track)):
            good = True
            oneside = 0
            for neg in range(BORDER_MIN_COUNT):
                beta1 = track[i - neg - 0][1]
                beta2 = track[i - neg - 1][1]
                good &= abs(beta1 - beta2) > TRACK_TURN_RATE * 0.2
                oneside += np.sign(beta1 - beta2)
            good &= abs(oneside) == BORDER_MIN_COUNT
            border[i] = good
        for i in range(len(track)):
            for neg in range(BORDER_MIN_COUNT):
                border[i - neg] |= border[i]

        # Create tiles
        for i in range(len(track)):
            alpha1, beta1, x1, y1 = track[i]
            alpha2, beta2, x2, y2 = track[i - 1]
            road1_l = (
                x1 - TRACK_WIDTH * math.cos(beta1),
                y1 - TRACK_WIDTH * math.sin(beta1),
            )
            road1_r = (
                x1 + TRACK_WIDTH * math.cos(beta1),
                y1 + TRACK_WIDTH * math.sin(beta1),
            )
            road2_l = (
                x2 - TRACK_WIDTH * math.cos(beta2),
                y2 - TRACK_WIDTH * math.sin(beta2),
            )
            road2_r = (
                x2 + TRACK_WIDTH * math.cos(beta2),
                y2 + TRACK_WIDTH * math.sin(beta2),
            )
            vertices = [road1_l, road1_r, road2_r, road2_l]
            self.fd_tile.shape.vertices = vertices
            t = self.world.CreateStaticBody(fixtures=self.fd_tile)
            t.userData = t
            c = 0.01 * (i % 3) * 255
            t.color = self.road_color + c
            t.road_visited = False
            t.road_friction = 1.0
            t.idx = i
            t.fixtures[0].sensor = True
            self.road_poly.append(([road1_l, road1_r, road2_r, road2_l], t.color))
            self.road.append(t)
            if border[i]:
                side = np.sign(beta2 - beta1)
                b1_l = (
                    x1 + side * TRACK_WIDTH * math.cos(beta1),
                    y1 + side * TRACK_WIDTH * math.sin(beta1),
                )
                b1_r = (
                    x1 + side * (TRACK_WIDTH + BORDER) * math.cos(beta1),
                    y1 + side * (TRACK_WIDTH + BORDER) * math.sin(beta1),
                )
                b2_l = (
                    x2 + side * TRACK_WIDTH * math.cos(beta2),
                    y2 + side * TRACK_WIDTH * math.sin(beta2),
                )
                b2_r = (
                    x2 + side * (TRACK_WIDTH + BORDER) * math.cos(beta2),
                    y2 + side * (TRACK_WIDTH + BORDER) * math.sin(beta2),
                )
                self.road_poly.append(
                    (
                        [b1_l, b1_r, b2_r, b2_l],
                        (255, 255, 255) if i % 2 == 0 else (255, 0, 0),
                    )
                )
        self.track = track
        return True

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        options: Optional[dict] = None,
    ):
        super().reset(seed=seed)
        self._destroy()
        self.world.contactListener_bug_workaround = FrictionDetector(
            self, self.lap_complete_percent
        )
        self.world.contactListener = self.world.contactListener_bug_workaround
        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.new_lap = False
        self.road_poly = []


        ################
        if VARIABLE_SPEED['On']:
            self.constant_speed = np.random.uniform(VARIABLE_SPEED['min_speed'], VARIABLE_SPEED["max_speed"])

        self.episode_steps = 0
        ##############

        if self.domain_randomize:
            randomize = True
            if isinstance(options, dict):
                if "randomize" in options:
                    randomize = options["randomize"]

            self._reinit_colors(randomize)

        while True:
            success = self._create_track()
            if success:
                break
            if self.verbose:
                print(
                    "retry to generate track (normal if there are not many"
                    "instances of this message)"
                )
        self.car = Car(self.world, *self.track[0][1:4])


        if self.render_mode == "human":
            self.render()
        return self.step(None)[0], {}

    def step(self, action: Union[np.ndarray, int]):
        assert self.car is not None

        #############
        # constant speed
        if self.constant_speed != 0:
            for w in self.car.wheels[0:4]:
                    w.omega = self.constant_speed
        #############

        if action is not None:
            if self.continuous:
                self.car.steer(-action[0])
                #self.car.gas(action[1])
                #self.car.brake(action[2])
            else:
                if not self.action_space.contains(action):
                    raise InvalidAction(
                        f"you passed the invalid action `{action}`. "
                        f"The supported action_space is `{self.action_space}`"
                    )
                self.car.steer(-0.6 * (action == 1) + 0.6 * (action == 2))
                self.car.gas(0.2 * (action == 3))
                self.car.brake(0.8 * (action == 4))

        self.car.step(1.0 / FPS)
        self.world.Step(1.0 / FPS, 6 * 30, 2 * 30)
        self.t += 1.0 / FPS


        # Updating state
        #if variable speed is on, add speed as normalized state
        if VARIABLE_SPEED["On"]:
            self.state = self.getState()
        else:
            self.state = self.getState()[0:2]
        #print(self.state)
        self.update_prev_errors(self.state[1])

        step_reward = 0
        terminated = False
        truncated = False

        if action is not None:  # First step without action, called from reset()

            # Penalize oscilations using CTE's variance
            if(self.episode_steps>=NUM_PREV_ERRORS):
                self.reward-= self.get_CTE_variance()*CTE_RESCALE

            # Reward low CTE
            if abs(self.state[1]) <= self.road_half_width:
              self.reward += REWARD_VSHIFT - self.state[1]**2

            self.car.fuel_spent = 0.0
            step_reward = self.reward - self.prev_reward
            self.placeholder = step_reward

            self.prev_reward = self.reward

            if self.tile_visited_count == len(self.track) or self.new_lap:
                truncated = True

            if abs(self.get_cross_track_error(self.car, self.track)[1]) > self.road_half_width:
                step_reward = OFF_ROAD_PENALTY
                terminated = True

            # End episode when car goes off road
            self.episode_steps+=1
            if self.episode_steps > 2000 or self.new_lap:
                terminated = True

        if self.render_mode == "human":
            self.render()
        return self.state, step_reward, terminated, truncated, {}

    def render(self):
        if self.render_mode is None:
            assert self.spec is not None
            gym.logger.warn(
                "You are calling render method without specifying any render mode. "
                "You can specify the render_mode at initialization, "
                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
            )
            return
        else:
            return self._render(self.render_mode)

    def _render(self, mode: str):
        assert mode in self.metadata["render_modes"]

        pygame.font.init()
        if self.screen is None and mode == "human":
            pygame.init()
            pygame.display.init()
            self.screen = pygame.display.set_mode((WINDOW_W, WINDOW_H))
        if self.clock is None:
            self.clock = pygame.time.Clock()

        if "t" not in self.__dict__:
            return  # reset() not called yet

        self.surf = pygame.Surface((WINDOW_W, WINDOW_H))

        assert self.car is not None
        # computing transformations
        angle = -self.car.hull.angle
        # Animating first second zoom.
        zoom = 0.1 * SCALE * max(1 - self.t, 0) + ZOOM * SCALE * min(self.t, 1)
        scroll_x = -(self.car.hull.position[0]) * zoom
        scroll_y = -(self.car.hull.position[1]) * zoom
        trans = pygame.math.Vector2((scroll_x, scroll_y)).rotate_rad(angle)
        trans = (WINDOW_W / 2 + trans[0], WINDOW_H / 4 + trans[1])

        self._render_road(zoom, trans, angle)
        self.car.draw(
            self.surf,
            zoom,
            trans,
            angle,
            mode not in ["state_pixels_list", "state_pixels"],
        )

        self.surf = pygame.transform.flip(self.surf, False, True)

        # showing stats
        self._render_indicators(WINDOW_W, WINDOW_H)

        font = pygame.font.Font(pygame.font.get_default_font(), 42)

        ########################################


        text = font.render("%.2f | %.2f" %  (self.placeholder,  self.get_CTE_variance() * 200), True, (255, 255, 255), (0, 0, 0))


        ########################################

        text_rect = text.get_rect()
        text_rect.center = (120, WINDOW_H - WINDOW_H * 2.5 / 40.0)
        self.surf.blit(text, text_rect)

        if mode == "human":
            pygame.event.pump()
            self.clock.tick(self.metadata["render_fps"])
            assert self.screen is not None
            self.screen.fill(0)
            self.screen.blit(self.surf, (0, 0))
            pygame.display.flip()
        elif mode == "rgb_array":
            return self._create_image_array(self.surf, (VIDEO_W, VIDEO_H))
        elif mode == "state_pixels":
            return self._create_image_array(self.surf, (STATE_W, STATE_H))
        else:
            return self.isopen

    def _render_road(self, zoom, translation, angle):
        bounds = PLAYFIELD
        field = [
            (bounds, bounds),
            (bounds, -bounds),
            (-bounds, -bounds),
            (-bounds, bounds),
        ]

        # draw background
        self._draw_colored_polygon(
            self.surf, field, self.bg_color, zoom, translation, angle, clip=False
        )

        # draw grass patches
        grass = []
        for x in range(-20, 20, 2):
            for y in range(-20, 20, 2):
                grass.append(
                    [
                        (GRASS_DIM * x + GRASS_DIM, GRASS_DIM * y + 0),
                        (GRASS_DIM * x + 0, GRASS_DIM * y + 0),
                        (GRASS_DIM * x + 0, GRASS_DIM * y + GRASS_DIM),
                        (GRASS_DIM * x + GRASS_DIM, GRASS_DIM * y + GRASS_DIM),
                    ]
                )
        for poly in grass:
            self._draw_colored_polygon(
                self.surf, poly, self.grass_color, zoom, translation, angle
            )

        # draw road
        i= 0
        for poly, color in self.road_poly:
            # converting to pixel coordinates
            poly = [(p[0], p[1]) for p in poly]
            color = [int(c) for c in color]
            self._draw_colored_polygon(self.surf, poly, color, zoom, translation, angle)

            i+=1


    def _render_indicators(self, W, H):
        s = W / 40.0
        h = H / 40.0
        color = (0, 0, 0)
        polygon = [(W, H), (W, H - 5 * h), (0, H - 5 * h), (0, H)]
        pygame.draw.polygon(self.surf, color=color, points=polygon)

        def vertical_ind(place, val):
            return [
                (place * s, H - (h + h * val)),
                ((place + 1) * s, H - (h + h * val)),
                ((place + 1) * s, H - h),
                ((place + 0) * s, H - h),
            ]

        def horiz_ind(place, val):
            return [
                ((place + 0) * s, H - 4 * h),
                ((place + val) * s, H - 4 * h),
                ((place + val) * s, H - 2 * h),
                ((place + 0) * s, H - 2 * h),
            ]

        assert self.car is not None
        true_speed = np.sqrt(
            np.square(self.car.hull.linearVelocity[0])
            + np.square(self.car.hull.linearVelocity[1])
        )

        # simple wrapper to render if the indicator value is above a threshold
        def render_if_min(value, points, color):
            if abs(value) > 1e-4:
                pygame.draw.polygon(self.surf, points=points, color=color)

        render_if_min(true_speed, vertical_ind(5, 0.02 * true_speed), (255, 255, 255))
        # ABS sensors
        render_if_min(
            self.car.wheels[0].omega,
            vertical_ind(7, 0.01 * self.car.wheels[0].omega),
            (0, 0, 255),
        )
        render_if_min(
            self.car.wheels[1].omega,
            vertical_ind(8, 0.01 * self.car.wheels[1].omega),
            (0, 0, 255),
        )
        render_if_min(
            self.car.wheels[2].omega,
            vertical_ind(9, 0.01 * self.car.wheels[2].omega),
            (51, 0, 255),
        )
        render_if_min(
            self.car.wheels[3].omega,
            vertical_ind(10, 0.01 * self.car.wheels[3].omega),
            (51, 0, 255),
        )

        render_if_min(
            self.car.wheels[0].joint.angle,
            horiz_ind(20, -10.0 * self.car.wheels[0].joint.angle),
            (0, 255, 0),
        )
        render_if_min(
            self.car.hull.angularVelocity,
            horiz_ind(30, -0.8 * self.car.hull.angularVelocity),
            (255, 0, 0),
        )

    def _draw_colored_polygon(
        self, surface, poly, color, zoom, translation, angle, clip=True
    ):
        poly = [pygame.math.Vector2(c).rotate_rad(angle) for c in poly]
        poly = [
            (c[0] * zoom + translation[0], c[1] * zoom + translation[1]) for c in poly
        ]
        # This checks if the polygon is out of bounds of the screen, and we skip drawing if so.
        # Instead of calculating exactly if the polygon and screen overlap,
        # we simply check if the polygon is in a larger bounding box whose dimension
        # is greater than the screen by MAX_SHAPE_DIM, which is the maximum
        # diagonal length of an environment object
        if not clip or any(
            (-MAX_SHAPE_DIM <= coord[0] <= WINDOW_W + MAX_SHAPE_DIM)
            and (-MAX_SHAPE_DIM <= coord[1] <= WINDOW_H + MAX_SHAPE_DIM)
            for coord in poly
        ):
            gfxdraw.aapolygon(self.surf, poly, color)
            gfxdraw.filled_polygon(self.surf, poly, color)

    def _create_image_array(self, screen, size):
        scaled_screen = pygame.transform.smoothscale(screen, size)
        return np.transpose(
            np.array(pygame.surfarray.pixels3d(scaled_screen)), axes=(1, 0, 2)
        )

    def close(self):
        if self.screen is not None:
            pygame.display.quit()
            self.isopen = False
            pygame.quit()



    ##########################################################

    ##########################################################

    # Added methods

    def point_segment_dist(self, p, a, b):
        n = b - a
        norm_n = np.linalg.norm(n)
        if norm_n < 1e-10:
            return np.linalg.norm(p - a)

        n = n / norm_n
        ap = a - p
        proj_on_line = ap.dot(n) * n

        if np.linalg.norm(proj_on_line) > norm_n:
            return min(np.linalg.norm(p - a), np.linalg.norm(p - b))

        return np.linalg.norm(ap - proj_on_line)

    def get_cross_track_error(self, car, track):
        # steer in [-1, 1], gas in [0, 1], break in [0 ,1]
        pld_min = np.finfo(float).max
        dest_min = 0

        p = car.hull.position
        p = np.array([p[0], p[1]])

        for i in range(1, len(track)):
            ai = np.array([track[i-1][2], track[i-1][3]])
            bi = np.array([track[i][2], track[i][3]])
            pld = self.point_segment_dist(p, ai, bi)
            if pld < pld_min:
                pld_min = pld
                dest_min = i

        target_heading = track[dest_min][1]
        error_heading = target_heading - car.hull.angle
        error_heading =  atan2(sin(error_heading), cos(error_heading))

        R_world_trackframe = np.array([ [cos(target_heading), sin(target_heading)],
                                        [-sin(target_heading), cos(target_heading)] ])

        p_trackframe_world = np.array( track[dest_min][2:4] ).reshape((2,1))
        p_car_world = np.array( [car.hull.position[0], car.hull.position[1]] ).reshape((2,1))

        p_car_trackframe = R_world_trackframe.dot(p_car_world - p_trackframe_world)
        error_dist = -1 * p_car_trackframe[0][0]

        #print (error_heading * 180.0 / 3.14, error_dist, p_car_trackframe[1][0])
        return error_heading, error_dist, dest_min

    def get_CTE_variance(self):
        # Calculate the mean
        mean = sum(self.prev_errors) / len(self.prev_errors)

        # Calculate the squared differences from the mean
        squared_diff = [(x - mean) ** 2 for x in self.prev_errors]

        # Calculate the variance
        return sum(squared_diff) / len(self.prev_errors)

    def getState(self):

        CTE = self.get_cross_track_error(self.car, self.track)[0:2]

        normalized_error_heading = 2 * CTE[0] / math.pi

        normalized_CTE = CTE[1] / self.road_half_width

        normalized_speed = ((self.constant_speed - VARIABLE_SPEED["min_speed"]) / (
            VARIABLE_SPEED["max_speed"]-VARIABLE_SPEED["min_speed"])) * (0.99) + 0.01

        return np.array([normalized_error_heading, normalized_CTE, normalized_speed], dtype=np.float64)

    def update_prev_errors(self, cur_error):
        self.prev_errors.insert(0, cur_error)
        self.prev_errors.pop()


  and should_run_async(code)


###Register Environment

In [5]:
# Registering custom enviroment

def registerEnv(ID):
    gym.envs.register(
        id=ID,
        entry_point=lambda:CarRacing,  # Specify the module and class name
        max_episode_steps=2000,
        kwargs={}
    )

env_id  = 'center_aligning'
# Register environment
registerEnv(env_id)

##Policy: TD3

### Actor network

In [9]:

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action, l1_dim=100, l2_dim=100):
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, l1_dim)
        self.l2 = nn.Linear(l1_dim, l2_dim)
        self.l3 = nn.Linear(l2_dim, action_dim)

        self.max_action = max_action
        self.tanh = nn.Tanh()

    def forward(self, x):

        x = relu(self.l1(x))
        x = relu(self.l2(x))
        x = self.max_action * self.tanh(self.l3(x))

        return x

###Critic Netwrok

In [10]:

class Q_function(nn.Module):
    def __init__(self, state_dim, action_dim, l1_dim=100, l2_dim=100):
        super(Q_function, self).__init__()
        self.l1 = nn.Linear(state_dim + action_dim, l1_dim)
        self.l2 = nn.Linear(l1_dim, l2_dim)
        self.l3 = nn.Linear(l2_dim, 1)

    def forward(self, x, a):
        x = torch.cat([x, a], 1)

        x = relu(self.l1(x))
        x = relu(self.l2(x))
        x = self.l3(x)

        return x

class Critic(nn.Module):
    def __init__(self, state_dim, action_dim, q1_l1_dim=100, q1_l2_dim=100,
                 q2_l1_dim=100, q2_l2_dim=100):
        super(Critic, self).__init__()

        # Q1
        self.q1 = Q_function(state_dim, action_dim, q1_l1_dim, q1_l2_dim)
        # Q2
        self.q2 = Q_function(state_dim, action_dim, q2_l1_dim, q2_l2_dim)

    def forward(self, x, a):
        x1 = self.q1(x, a)
        x2 = self.q2(x, a)

        return x1, x2

### Twin Delayed Deep Deterministic Policy (TD3)

In [11]:

class TD3(object):
    def __init__(
		self,
		state_dim,
		action_dim,
		max_action,
		discount=0.99,
		policy_noise=0.2,
		noise_clip=0.5,
		policy_freq=2
	    ):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        self.max_action = max_action
        self.discount = discount
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0

    def select_action(self, state):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def select_vectorized_action(self, states):
        states_tensor = torch.FloatTensor(states).to(device)
        actions = self.actor(states_tensor)
        return actions.cpu().data.numpy()

    def train(self, iterations, replay_buffer, tau, batch_size=256):
        for i in range(iterations):
            self.total_it += 1

            s, ns, ac, r, d = replay_buffer.sample(batch_size)

            state = torch.FloatTensor(s).to(device)
            next_state = torch.FloatTensor(ns).to(device)
            action = torch.FloatTensor(ac).to(device)
            reward = torch.FloatTensor(r).to(device)
            done = torch.FloatTensor(1 - d).to(device)

            with torch.no_grad():
			    # For next action,  consider the policy and add noise
                noise = (
                    torch.randn_like(action) * self.policy_noise
                    ).clamp(-self.noise_clip, self.noise_clip)

                next_action = (
				    self.actor_target(next_state) + noise
			    ).clamp(-self.max_action, self.max_action)

			    # Compute the target Q value
                target_Q1, target_Q2 = self.critic_target(next_state, next_action)
                min_target = torch.min(target_Q1, target_Q2)
                target_Q = reward + (done * self.discount * min_target)

            # Get current Q estimates
            curr_Q1, curr_Q2 = self.critic(state, action)

            # Calculate loss for crtic
            mse_loss_1 = nn.MSELoss()
            mse_loss_2 = nn.MSELoss()
            loss_for_critic =  mse_loss_1(curr_Q1, target_Q) + mse_loss_2(curr_Q2, target_Q)

            # Optimization for the critic
            self.critic_optimizer.zero_grad()
            loss_for_critic.backward()
            self.critic_optimizer.step()

            # update the policy
            if i % self.policy_freq == 0:
                loss_for_actor = -self.critic.q2(state, self.actor(state)).mean()

                # Optimize the actor
                self.actor_optimizer.zero_grad()
                loss_for_actor.backward()
                self.actor_optimizer.step()

                # Update the frozen target models
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename, directory):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename), map_location=torch.device('cpu')))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename), map_location=torch.device('cpu')))

##Training

###Utilities

In [12]:

class ReplayBuffer(object):
    def __init__(self, max_size=1e4):
        self.storage = []
        self.max_size = max_size
        self.ind = 0

    def add(self, state, next_state, action, reward, done):
        data = [state, next_state, action, reward, done]

        # if there is still space in storage, add data
        if len(self.storage) < self.max_size:
            self.storage.append(data)
            # space met, reset index back to 0
        else:
            self.storage[self.ind] = data
            self.ind += 1
            if self.ind == self.max_size:
                self.ind = 0

    def sample(self, batch_size):
        # randomly sample batch size number of past events
        indices = np.random.randint(0, len(self.storage), size=batch_size)
        states, next_states, actions, rewards, done = [], [], [], [], []

        for i in indices:

            s, ns, ac, r, d = self.storage[i]
            states.append(np.array(s, copy=False))
            next_states.append(np.array(ns, copy=False))
            actions.append(np.array(ac, copy=False))
            rewards.append(np.array(r, copy=False))
            done.append(np.array(d, copy=False))

        return np.array(states), np.array(next_states), np.array(actions), \
            np.array(rewards).reshape(-1, 1), np.array(done).reshape(-1, 1)



### Training Loop

In [13]:


MAX_TIME_STEPS = 1000000
max_episode_steps = 2000

NUM_PARALLEL_ENVS = 3
FIN_EPISODES_BEFORE_TRAIN = 4

#Options to change expl noise and tau
LOWER_EXPL_NOISE = {"On" : True, "Reward_Threshold":14000, 'Value': 0.001}
LOWER_TAU = {"On" : True, "Reward_Threshold":18000, 'Value': 0.0005}

#load already trained policy
LOAD_POLICY = {"On": False, 'init_time_steps': 1e4}

#Avg reward termination condition
AVG_REWARD_TERMIN_THRESHOLD = 19000
# Time steps below which a standard training iteration param is passed
MIN_EPS_TIMESTEPS = 500

# Specify the file name
LOGS_FILEPATH = './benchmarks/logs/TD3_log.csv'

with open(LOGS_FILEPATH, 'w', newline='') as file:
	log_writer = csv.writer(file)

	# Write headings
	log_writer.writerow(['r', 'l'])

# Runs policy for X episodes and returns average reward
def evaluate_policy(policy, eval_episodes=5):
	avg_reward = 0
	num_fin_episodes = 0
	obs, info = envs.reset()
	avg = 0
	while num_fin_episodes < eval_episodes:
		action = policy.select_vectorized_action(np.array(obs))
		obs, reward, done, _, info = envs.step(action)
		avg_reward += reward

        # when an episode ends in any environment
		if info.keys():

			finished = info['_final_observation']
			num_fin = np.count_nonzero(finished)

			num_fin_episodes += num_fin

			avg += np.sum(avg_reward[finished])

	avg /= num_fin_episodes
	print("---------------------------------------")
	print("Evaluation over %d episodes: %f" % (num_fin_episodes, avg))
	print("---------------------------------------")
	return avg

if __name__ == "__main__":

	start_timesteps = 1e3           	# How many time steps purely random policy is run for
	eval_freq = 1e4			             # How often (time steps) we evaluate
	max_timesteps = MAX_TIME_STEPS 		# Max time steps to run environment for
	save_models = True			    	# Whether or not models are saved

	expl_noise=0.01		                # Std of Gaussian exploration noise
	batch_size=256		                # Batch size for both actor and critic
	tau=0.005		                    # Target network update rate
	policy_noise=0.1		              # Noise added to target policy during critic update
	noise_clip=0.25	                  # Range to clip target policy noise


	file_name = "TD3_%s" % ( str(SEED))
	print("---------------------------------------")
	print ("Settings: %s" % (file_name))
	print("---------------------------------------")

	if not os.path.exists("./results"):
		os.makedirs("./results")
	if save_models and not os.path.exists("./pytorch_models"):
		os.makedirs("./pytorch_models")

	# Register environment
	env_id  = 'center_maintaining'
	env.registerEnv(env_id)

	# Initialise vectorized environment
	num_envs= NUM_PARALLEL_ENVS
	envs = gym.make_vec(env_id, num_envs=num_envs, render_mode='human')

	#Counter to track finished episode within one iteration of parallel runs
	num_fin_episodes = 0

	# Set seeds
	torch.manual_seed(SEED)
	np.random.seed(SEED)

	state_dim = envs.single_observation_space.shape[0]
	action_dim = envs.single_action_space.shape[0]
	max_action = float(envs.single_action_space.high[0])

	# Initialize policy
	policy = TD3(state_dim, action_dim, max_action, policy_noise=policy_noise, noise_clip=noise_clip)

	# Load already trained policy
	if LOAD_POLICY["On"]:
		filename = "Policy_19(1)"
		directory = "./policies"
		policy.load(filename, directory)
		start_timesteps = 0

	# Init replay buffer
	replay_buffer = ReplayBuffer()

	# Evaluate untrained policy
	evaluations = []#evaluations = [evaluate_policy(policy)]

	total_timesteps = 0
	timesteps_since_eval = 0
	train_iteration = 0

	# array to track if the frist episode in all parallel env have ended
	all_done = np.full(num_envs, True, dtype=bool)

	episode_count = 0
	avg_reward = 0

	t0 = time.time()

	while total_timesteps < max_timesteps:

		if all_done.all():

			# calculate average reward over episodes
			if num_fin_episodes!=0: avg_reward /= num_fin_episodes

			if total_timesteps != 0 and (not LOAD_POLICY['On'] or total_timesteps>=LOAD_POLICY["init_time_steps"]):

				print("\nData Stats:\nTotal T: %d   Train itr: %d   Episodes T: %d   Best Reward: %f   Avg Reward: %f   --  Wallclk T: %d sec" % \
					(total_timesteps, train_iteration, episode_timesteps, max_reward, avg_reward, int(time.time() - t0)))

				# Store metrics
				with open(LOGS_FILEPATH, 'a', newline='') as file:
					log_writer = csv.writer(file)
					log_writer.writerow([avg_reward, episode_timesteps/num_fin_episodes])

				if avg_reward >= AVG_REWARD_TERMIN_THRESHOLD:
					print("\n\nAvg Reward Threshold Met -- Training Terminated\n")
					break

				# Lower learning rate
				if LOWER_TAU["On"] and avg_reward >= LOWER_TAU["Reward_Threshold"]:
					print("\n-------Lowered Tau to %f \n" % LOWER_TAU["Value"])
					LOWER_TAU["On"] = False

                # Lower exploration noise
				if LOWER_EXPL_NOISE["On"] and avg_reward >= LOWER_EXPL_NOISE["Reward_Threshold"]:
					expl_noise = expl_noise / 2
					print("\n-------Lowered expl noise to %f \n" % LOWER_EXPL_NOISE["Value"])
					LOWER_EXPL_NOISE["On"] = False

				# save each policy with above stats before training
				policy.save("Policy_%d" % (train_iteration), directory="./policies")

				print("\nTraining: ", end=" ")
				if episode_timesteps < MIN_EPS_TIMESTEPS:
					print("STANDARDIZED TRAINING ITERATIONS")
					policy.train(MIN_EPS_TIMESTEPS, replay_buffer, tau, batch_size)
				else:
					policy.train(episode_timesteps, replay_buffer, tau, batch_size)

				print("-Finished ")
				print("\n-----------------------")

			# Evaluate episode
			if timesteps_since_eval >= eval_freq:
				timesteps_since_eval %= eval_freq
				eval_score = evaluate_policy(policy)
				evaluations.append(eval_score)

				if save_models: policy.save(file_name, directory="./pytorch_models")
				np.save("./results/%s" % (file_name), evaluations)

			# Reset environment
			print("\nCollecting data:")

			obs, info = envs.reset(seed=[SEED + i for i in range(num_envs)])
			SEED+=num_envs

			all_done = np.full(num_envs, False, dtype=bool)
			episode_reward = np.zeros(num_envs, dtype=float)
			episode_timesteps = 0
			train_iteration += 1

			max_reward = None
			avg_reward = 0
			num_fin_episodes = 0

		# Select action randomly or according to policy
		if total_timesteps == start_timesteps:
			print("\n\n\nPolicy actions started\n\n\n")

		if total_timesteps < start_timesteps:
			# Random actions for each environment
			action = envs.action_space.sample()
		else:
			action = policy.select_vectorized_action(obs)

			if expl_noise != 0:
				action = (action + np.random.normal(0, expl_noise, size=envs.single_action_space.shape[0])).clip(envs.single_action_space.low, envs.single_action_space.high)

		# Perform action
		new_obs, reward, done, truncated, info = envs.step(action)
		episode_reward += reward

		# when an episode ends in any environment
		if info.keys():

			finished = info['_final_observation']
			num_fin = np.count_nonzero(finished)

			num_fin_episodes += num_fin
			episode_count += num_fin

            # all_done marks the environments whose episodes ended
			all_done = np.logical_or(all_done, finished)

			print("Episode%d reward for finished enviroments:" % episode_count, episode_reward[finished])

            #Set min reward among finished episodes
			if max_reward is not None:
				max_reward = max(max_reward, max(episode_reward[finished]))
			else:
				max_reward = max(episode_reward[finished])

			avg_reward += sum(episode_reward[finished])

			#set episode reward for respective environments 0
			episode_reward[finished] = 0

		done_bool = np.full(num_envs, False, dtype=bool) if episode_timesteps + 1 == max_episode_steps else all_done

		# Store data in replay buffer
		for i in range(num_envs):
			if info.keys() and info['_final_observation'][i] == True:
				replay_buffer.add(obs[i], info['final_observation'][i], action[i], reward[i], float(all_done[i]))
			else:
				replay_buffer.add(obs[i], new_obs[i], action[i], reward[i], float(all_done[i]))

		obs = new_obs

        #   Episode time_steps for all episodes in each environment
		episode_timesteps += num_envs
		total_timesteps += num_envs
		timesteps_since_eval += num_envs

	# Final evaluation
	evaluations.append(evaluate_policy(policy))
	if save_models: policy.save("%s" % (file_name), directory="./pytorch_models")
	np.save("./results/%s" % (file_name), evaluations)

	envs.close()



FileNotFoundError: [Errno 2] No such file or directory: './benchmarks/logs/TD3_log.csv'

## Benchmarks

In [None]:


PPO_TRAIN_TIME_STEPS = 2000
DDPG_TRAIN_TIME_STEPS = 10000
SAC_TRAIN_TIME_STEPS = 2000

LEARNING_RATE = 0.001

def evaluate_policy(policy, env, num_episodes=1):
    total_reward = 0
    for i in range(num_episodes):
        state, info = env.reset()
        next_state = state
        terminated, truncated = False, False
        while not terminated and not truncated:
            action = policy.predict(np.array(next_state), deterministic=True)
            next_state, reward, terminated, truncated, info = env.step(action[0])
            total_reward += reward

    avg_reward = total_reward / num_episodes

    return avg_reward

# logging directories for each algorithm

#Monitor
ppo_log_dir = "./benchmarks/logs/ppo_logs/"
sac_log_dir = "./benchmarks/logs/sac_logs/"
ddpg_log_dir = "./benchmarks/logs/ddpg_logs/"


######## Training ########
train = 1
if train:

    # set up loggers
    ppo_logger = configure(ppo_log_dir, ["stdout", "csv"])
    sac_logger = configure(sac_log_dir, ["stdout", "csv"])
    ddpg_logger = configure(sac_log_dir, ["stdout", "csv"])

    # Instantiate the env
    ppo_env = CarRacing(render_mode = 'human')
    sac_env = CarRacing(render_mode = 'human')
    ddpg_env = CarRacing(render_mode = 'human')


    # Create monitor wrappers for each algorithm with unique logging directories
    ppo_env = Monitor(ppo_env, ppo_log_dir)
    sac_env = Monitor(sac_env, sac_log_dir)
    ddpg_env = Monitor(ddpg_env, ddpg_log_dir)

    # PPO model
    PPO_model = PPO("MlpPolicy", ppo_env, verbose = 1, learning_rate= LEARNING_RATE)
    PPO_model.set_logger(ppo_logger)

    print("Training PPO")
    PPO_model.learn(total_timesteps=PPO_TRAIN_TIME_STEPS)
    PPO_model.save("./benchmarks/ppo_policy")

    # SAC
    SAC_model = SAC("MlpPolicy", sac_env, verbose=1, learning_rate=LEARNING_RATE)
    SAC_model.set_logger(sac_logger)

    print("Training SAC")
    SAC_model.learn(total_timesteps=SAC_TRAIN_TIME_STEPS)
    SAC_model.save("./benchmarks/sac_policy")

    # DDPG
    # The noise objects for DDPG
    n_actions = ddpg_env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.01 * np.ones(n_actions))

    DDPG_model = DDPG("MlpPolicy", ddpg_env, action_noise=action_noise, verbose=1, tau=0.001)
    DDPG_model.set_logger(ddpg_logger)

    print("Training DDPG")
    DDPG_model.learn(total_timesteps=DDPG_TRAIN_TIME_STEPS)
    DDPG_model.save("./benchmarks/ddpg_policy")

    print("Training complete")

    ppo_env.close()
    sac_env.close()
    ddpg_env.close()



######## Evaluation ########
env = CarRacing(render_mode = 'human')

PPO_model = PPO.load("./benchmarks/ppo_policy")
SAC_model = SAC.load("./benchmarks/sac_policy")
DDPG_model = DDPG.load("./benchmarks/ddpg_policy")

print("Evaluating PPO")
#PPO_avg_reward = evaluate_policy(PPO_model, env)

print("Evaluating SAC")
SAC_avg_reward = evaluate_policy(SAC_model, env)

print("Evaluating DDPG")
#DDPG_avg_reward = evaluate_policy(DDPG_model, env)

env.close()

#print("\nPPO Average Reward:", PPO_avg_reward)
#print("\nSAC Average Reward:", SAC_avg_reward)
#print("\nDDPG Average Reward:", DDPG_avg_reward)

# Load monitoring data for each algorithm
ppo_monitor_df = pd.read_csv(os.path.join(ppo_log_dir, 'monitor.csv'),skiprows=[0],  index_col=None)
sac_monitor_df = pd.read_csv(os.path.join(sac_log_dir, 'monitor.csv'), skiprows=[0], index_col=None)
ddpg_monitor_df = pd.read_csv(os.path.join(ddpg_log_dir, 'monitor.csv'), skiprows=[0], index_col=None)
td3_df = pd.read_csv('./benchmarks/logs/TD3_log.csv', index_col=None)

# Plot learning curves

# Rewards vs Episodes
plt.figure(figsize=(10, 5))
plt.plot(ppo_monitor_df['r'], label='PPO')
plt.plot(sac_monitor_df['r'], label='SAC')
plt.plot(ddpg_monitor_df['r'], label='DDPG')
plt.plot(td3_df['r'], label='TD3')
plt.xlabel('Episodes')
plt.ylabel('Episode Reward')
plt.title('Learning Curves: Rewards vs Episodes')
plt.xticks(rotation=90)
plt.legend()
plt.show()

# Episode length vs Episodes
plt.figure(figsize=(10, 5))
plt.plot(ppo_monitor_df['l'], label='PPO')
plt.plot(sac_monitor_df['l'], label='SAC')
plt.plot(ddpg_monitor_df['l'], label='DDPG')
plt.plot(td3_df['l'], label='TD3')
plt.xlabel('Episodes')
plt.ylabel('Episode length')
plt.title('Learning Curves: Episode len vs Episodes')
plt.xticks(rotation=90)
plt.legend()
plt.show()

# Rewards + Episode len vs Episodes
plt.figure(figsize=(10, 5))
plt.plot(ppo_monitor_df['l'] + ppo_monitor_df['r'], label='PPO')
plt.plot(sac_monitor_df['l'] + sac_monitor_df['r'], label='SAC')
plt.plot(ddpg_monitor_df['l'] + ddpg_monitor_df['r'], label='DDPG')
plt.plot(td3_df['l'] + td3_df['r'], label='TD3')
plt.xlabel('Episodes')
plt.ylabel('Episode len + Rewards')
plt.title('Learning Curves: Episode len + Rewards vs Episodes')
plt.xticks(rotation=90)  # Rotate the y-axis labels by 45 degrees
plt.legend()
plt.show()