<a href="https://colab.research.google.com/github/KevinArmbruster/ReinforcementLearning/blob/main/Atari.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install --upgrade gym[all]


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ale-py~=0.7.5
  Downloading ale_py-0.7.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 14.5 MB/s 
[?25hCollecting imageio>=2.14.1
  Downloading imageio-2.22.4-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 64.9 MB/s 
[?25hCollecting box2d-py==2.3.5
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[K     |████████████████████████████████| 374 kB 75.5 MB/s 
[?25hCollecting pygame==2.1.0
  Downloading pygame-2.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[K     |████████████████████████████████| 18.3 MB 59.3 MB/s 
[?25hCollecting swig==4.*
  Downloading swig-4.1.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 67.4 MB/s 
[?25hCollecting mujoco-py<2.2,>=2.1
  Downloading mujoco_py-2.1.2.14

In [1]:
from collections import deque, namedtuple

import numpy as np
import torch
import torch.nn as nn
import itertools
import gym
import matplotlib.pyplot as plt
from tqdm import trange

import gym
import ale_py

# Print the installed gym and ale_py versions so the environment setup can be checked.
print('gym:', gym.__version__)
print('ale_py:', ale_py.__version__)

# Create the environment registered under the id 'Breakout-v0' and keep it in `env`.
# NOTE(review): the saved output of this cell is a ModuleNotFoundError; presumably one of
# the imported packages was not installed in the kernel that ran it -- confirm after a
# fresh "Restart & Run All".
env = gym.make('Breakout-v0')

ModuleNotFoundError: ignored

In [2]:
class StateActionValueNetwork(nn.Module):
    """Convolutional state-action value network with its own Adam optimizer.

    The network is a fixed convolutional feature extractor
    (Conv 3x3/32 -> activation -> Conv 3x3/64 -> activation -> MaxPool 2x2 -> Flatten)
    followed by a fully connected head whose hidden widths come from
    ``hidden_layer_sizes`` and whose output width is ``n_actions``.

    Args:
        dim_states: Stored on the instance for callers that read it. The
            convolutional stack takes its geometry from ``input_shape`` instead.
        n_actions: Width of the output layer (one value per action).
        hidden_layer_sizes: Widths of the fully connected hidden layers, in order.
            An empty list (or ``None``) connects the flattened features directly
            to the output layer.
        lr: Learning rate passed to Adam.
        weight_decay: Weight decay passed to Adam.
        input_shape: ``(channels, height, width)`` of a single observation. The
            default ``(3, 72, 72)`` yields 64 * 36 * 36 = 82944 flattened features.
    """

    def __init__(self, dim_states: int, n_actions: int, hidden_layer_sizes: list, lr: float, weight_decay: float = 0,
                 input_shape: tuple = (3, 72, 72)):
        super().__init__()
        self.dim_states = dim_states
        self.n_actions = n_actions
        self.hidden_layer_sizes = hidden_layer_sizes
        self.input_shape = tuple(input_shape)
        self.layers = None
        self.activation_functions = None
        self.optimizer = None
        self.lr = lr
        self.max_grad_norm = 1  # advised range [0.5, 2]
        self.weight_decay = weight_decay

        # The network must be built before the optimizer: Adam rejects an empty parameter list.
        self.setup_NN(dim_states, hidden_layer_sizes, n_actions, input_shape=self.input_shape)
        self.setup_optimizer()

    def setup_NN(self, dim_states, hidden_layer_sizes, n_actions, intermediate_af=nn.ReLU(), last_af=None,
                 input_shape=(3, 72, 72)):
        """Build the network and register it on ``self.layers``.

        Args:
            dim_states: Unused here; kept so the existing call signature still works.
            hidden_layer_sizes: Widths of the fully connected hidden layers.
            n_actions: Width of the output layer.
            intermediate_af: Activation module inserted after every convolution and
                every hidden linear layer; ``None`` inserts nothing.
            last_af: Optional activation module applied to the output layer.
            input_shape: ``(channels, height, width)`` of one observation.

        Raises:
            ValueError: If ``input_shape`` does not have three entries or is too
                small to survive the 2x2 max-pooling.
        """
        if len(input_shape) != 3:
            raise ValueError('input_shape must be (channels, height, width)')
        channels, height, width = (int(v) for v in input_shape)

        # Both convolutions use kernel 3 / stride 1 / padding 1 and so keep height and width;
        # the 2x2 max-pool then halves each of them (rounding down).
        flat_features = 64 * (height // 2) * (width // 2)
        if channels <= 0 or flat_features <= 0:
            raise ValueError('input_shape is too small for the convolutional stack')

        def with_activation(layer, af):
            # A missing activation simply contributes nothing to the stack.
            return [layer] if af is None else [layer, af]

        modules = []
        modules += with_activation(nn.Conv2d(channels, 32, kernel_size=3, padding=1), intermediate_af)
        modules += with_activation(nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1), intermediate_af)
        modules += [nn.MaxPool2d(2, 2), nn.Flatten()]

        widths = [flat_features] + [int(size) for size in (hidden_layer_sizes or [])]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += with_activation(nn.Linear(fan_in, fan_out), intermediate_af)
        modules += with_activation(nn.Linear(widths[-1], int(n_actions)), last_af)

        # Assigning the Sequential to an attribute is what registers its parameters on this module.
        self.layers = nn.Sequential(*modules)
        # Activations now live inside ``self.layers``; the attribute is kept (empty) so code
        # that reads it still finds a ModuleList.
        self.activation_functions = nn.ModuleList()

    def setup_optimizer(self):
        """Create the Adam optimizer over all parameters registered on this module."""
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)

    def forward(self, state: torch.Tensor):
        """Run ``state`` through the network and return one value per action.

        ``state`` is passed straight to the first ``nn.Conv2d``; it is expected to be
        a float tensor shaped ``(batch, channels, height, width)`` matching ``input_shape``.
        """
        return self.layers(state)

    def backward(self, current, targets):
        """Perform one optimization step on the MSE between ``current`` and ``targets``.

        Args:
            current: Predictions that are still attached to this network's graph.
            targets: Tensor of target values with a shape compatible with ``current``.

        Returns:
            The MSE loss divided by ``len(current)``, as a detached NumPy value.
        """
        # Reset gradients accumulated by earlier steps
        self.optimizer.zero_grad()

        # Compute loss function
        loss = nn.functional.mse_loss(current, targets)

        # Backpropagate to compute the gradients
        loss.backward()

        # Clip the gradient norm to max_grad_norm
        nn.utils.clip_grad_norm_(self.parameters(), max_norm=self.max_grad_norm)

        # Apply the parameter update
        self.optimizer.step()

        # NOTE(review): mse_loss already averages over all elements, so this extra division by
        # the batch length rescales the reported value; kept because callers may rely on it.
        # .cpu() makes the conversion work when the loss lives on an accelerator.
        return (loss / len(current)).detach().cpu().numpy()


# Named tuple with the fields state, action, reward, next_state and done.
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])


class ExperienceReplayBuffer(object):
    """Bounded FIFO store of experiences with random batch sampling.

    Backed by a ``deque`` whose ``maxlen`` is ``int(maximum_length)``, so once the
    buffer is full every append discards the oldest entry.
    """

    def __init__(self, maximum_length):
        capacity = int(maximum_length)
        self.buffer = deque(maxlen=capacity)

    def append(self, experience):
        """Add ``experience`` at the newest end of the buffer."""
        self.buffer.append(experience)

    def __len__(self):
        """Return how many experiences are currently stored."""
        return len(self.buffer)

    def sample_batch(self, n, combined=True):
        """Draw ``n`` distinct experiences and return them transposed field by field.

        With ``combined`` set, ``n - 1`` entries are drawn without replacement from
        everything except the newest entry and the newest entry is appended last;
        otherwise all ``n`` entries are drawn without replacement from the whole buffer.

        Raises:
            IndexError: If ``n`` exceeds the number of stored experiences.
        """
        batch_size = int(n)
        stored = len(self.buffer)
        if batch_size > stored:
            raise IndexError('Tried to sample too many elements from the buffer!')

        if not combined:
            picks = np.random.choice(stored, size=batch_size, replace=False)
            chosen = [self.buffer[idx] for idx in picks]
        else:
            # One draw fewer from the older entries, then the newest entry goes last.
            picks = np.random.choice(stored - 1, size=batch_size - 1, replace=False)
            chosen = [self.buffer[idx] for idx in picks] + [self.buffer[-1]]

        # zip(*rows) turns a list of tuples into one tuple per field.
        return zip(*chosen)

In [7]:
# Create the environment registered under the id "MountainCar-v0" and reset it.
# NOTE(review): this assigns to `env` again, so any environment bound to that name by an
# earlier cell is replaced. The saved output of this cell is NameNotFound -- confirm the
# id is registered in the installed gym version.
env = gym.make("MountainCar-v0")
env.reset()

NameNotFound: ignored