In [1]:
# Install gym pinned to 0.26.2 together with its Atari extra (the log below shows this pulling in ale-py).
!pip install gym==0.26.2 gym[atari]==0.26.2

Collecting ale-py~=0.8.0 (from gym[atari]==0.26.2)
  Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: ale-py
Successfully installed ale-py-0.8.1


In [2]:
# AutoROM with the accept-rom-license extra -- presumably what provides the Atari ROM files; verify.
!pip install autorom[accept-rom-license]
# Same pinned gym version as above, now with both the atari and accept-rom-license extras.
!pip install gym[atari,accept-rom-license]==0.26.2

Collecting autorom[accept-rom-license]
  Downloading AutoROM-0.6.1-py3-none-any.whl.metadata (2.4 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license])
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Downloading AutoROM-0.6.1-py3-none-any.whl (9.4 kB)
Building wheels for collected packages: AutoROM.accept-rom-license
  Building wheel for AutoROM.accept-rom-license (pyproject.toml) ... [?25ldone
[?25h  Created wheel for AutoROM.accept-rom-license: filename=AutoROM.accept_rom_license-0.6.1-py3-none-any.whl size=446659 sha256=13993d50d51495367b70c62401dad5630bd559aa4e933e4b152d0f66826bf06a
  Stored in directory: /root/.cache/pip/wheels/6b/1b/ef/a43ff1a2f

## Installing the environment (resolving dependency problems takes roughly 10% of the total time)

In [3]:
import gym
import matplotlib.pyplot as plt
import os
import random
import torch
import copy
import torch.nn as nn
import numpy as np
from torch.optim import Adam
from itertools import count
from collections import deque
os.environ["SDL_VIDEODRIVER"] = "dummy"
from gym.wrappers.monitoring import video_recorder

In [4]:
# Build the raw Breakout environment; render_mode is fixed to 'rgb_array' at construction.
env = gym.make("BreakoutNoFrameskip-v4", render_mode='rgb_array')

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [5]:
# Global device string read by Preprocessing, compute_q_loss and train below.
device = 'cpu'

## Start with preprocessing states

In [6]:
from gym.core import ObservationWrapper
import cv2
from torchvision.transforms.functional import resize
from torchvision.transforms.functional import crop
# ObservationWrapper (built into gym) is the base class for the preprocessing wrapper below

In [7]:
class Preprocessing(ObservationWrapper):
    """Crop, downscale and gray-scale raw frames into normalized torch tensors.

    Every observation coming out of the wrapped environment is:

    1. cropped to rows 90:200 and columns 3:157,
    2. resized to ``image_size`` with OpenCV (``INTER_AREA``),
    3. collapsed to one channel with fixed per-channel weights,
    4. divided by 255 and given a leading channel axis.

    The result is a ``torch.float32`` tensor of shape ``(1, height, width)``
    placed on the module-level ``device``.
    """

    def __init__(self, env):
        super().__init__(env)

        # cv2.resize takes its target size as (width, height).
        self.image_size = (64, 64)
        # Column vector of per-channel weights, shape (3, 1). The weights are
        # this notebook's own choice (0.8 / 0.1 / 0.1 on the three colour
        # channels, presumably in RGB order), not the usual luminance weights.
        self._gray_scale_rule = torch.tensor(
            [0.8, 0.1, 0.1], dtype=torch.float32, device=device
        ).reshape(-1, 1)

        # Advertise the layout this wrapper actually emits; otherwise wrappers
        # stacked on top (e.g. FrameStack) derive their spaces from the raw
        # frame shape and dtype of the wrapped environment.
        width, height = self.image_size
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=(1, height, width), dtype=np.float32
        )

    def _gray_scale(self, img):
        """Weight the last (channel) axis of ``img`` down to a single channel.

        ``img`` has shape ``(..., 3)``; the result has shape ``(..., 1)``.
        """
        return torch.matmul(img, self._gray_scale_rule.reshape(-1, 1))

    def observation(self, img):
        """Transform one raw frame (an ``(H, W, 3)`` array) as described on the class."""
        img = img[90:200, 3:157]
        img = cv2.resize(img, self.image_size, interpolation=cv2.INTER_AREA)
        img = torch.as_tensor(img, dtype=torch.float32, device=device)
        img = self._gray_scale(img).squeeze(-1)  # (H, W, 1) -> (H, W)
        img = img.unsqueeze(0) / 255             # add channel axis, scale by 1/255
        return img

# Crop window used in Preprocessing.observation: rows 90:200, columns 3:157

## Now incorporate some rules and preprocessing:

In [8]:
# Code adapted from openai/baselines and reworked for the installed gym version

class FireResetEnv(gym.Wrapper):
    """Wrapper that presses FIRE (action 1) and then action 2 right after each reset.

    Intended for games that stay frozen until FIRE is pressed. The constructor
    asserts that action 1 is named 'FIRE' and that at least three actions exist.
    """

    def __init__(self, env):
        """Take action on reset for environments that are fixed until firing."""
        gym.Wrapper.__init__(self, env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def reset(self, **kwargs):
        """Reset the wrapped env, perform the two start-up actions, return ``(obs, info)``.

        The wrapped ``step`` follows the gym 0.26 convention
        ``(obs, reward, terminated, truncated, info)``; the tuple is unpacked in
        that order so that the value returned as ``info`` really is the info
        dict rather than the ``truncated`` flag.
        """
        self.env.reset(**kwargs)
        obs, _, terminated, truncated, info = self.env.step(1)
        if terminated or truncated:
            self.env.reset(**kwargs)
        obs, _, terminated, truncated, info = self.env.step(2)
        if terminated or truncated:
            # The episode ended during start-up: hand back the freshly reset
            # state instead of the observation of the finished episode.
            obs, info = self.env.reset(**kwargs)
        return obs, info

class EpisodicLifeEnv(gym.Wrapper):
    """Report the loss of a life as the end of an episode.

    The underlying game is only truly reset once the wrapped environment itself
    reports the episode as over; after a mere life loss, ``reset`` advances the
    game with a single no-op step instead.
    """

    def __init__(self, env):
        """Make end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        # True when the wrapped env really finished (or before the first reset).
        self.was_real_done  = True

    def step(self, action):
        """Step the wrapped env and flag ``terminated`` when a life was lost.

        Returns the gym 0.26 tuple ``(obs, reward, terminated, truncated, info)``.
        """
        obs, reward, terminated, truncated, info = self.env.step(action)
        # A truncated episode also requires a real reset before stepping again.
        self.was_real_done = terminated or truncated
        # check current lives, make loss of life terminal,
        # then update lives to handle bonus lives
        lives = self.env.unwrapped.ale.lives()
        if lives < self.lives and lives > 0:
            # for Qbert sometimes we stay in lives == 0 condition for a few frames
            # so it's important to keep lives > 0, so that we only reset once
            # the environment advertises done.
            terminated = True
        self.lives = lives
        return obs, reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.

        Returns ``(obs, info)``.
        """
        if self.was_real_done:
            obs, info = self.env.reset(**kwargs)
        else:
            # no-op step to advance from terminal/lost life state; info is the
            # last element of the step tuple.
            obs, _, _, _, info = self.env.step(0)
        self.lives = self.env.unwrapped.ale.lives()
        return obs, info

class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action ``skip`` times and return the pixel-wise max of the last two frames."""

    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        gym.Wrapper.__init__(self, env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
        self._skip       = skip

    def step(self, action):
        """Repeat action, sum reward, and max over last observations.

        Returns the gym 0.26 tuple ``(obs, reward, terminated, truncated, info)``
        where ``reward`` is the sum over the repeated steps and the flags/info
        come from the last step actually executed. The repetition stops early
        as soon as the wrapped env reports termination or truncation.
        """
        total_reward = 0.0
        terminated = truncated = False
        info = {}
        for i in range(self._skip):
            obs, reward, terminated, truncated, info = self.env.step(action)
            if i == self._skip - 2: self._obs_buffer[0] = obs
            if i == self._skip - 1: self._obs_buffer[1] = obs
            total_reward += reward
            if terminated or truncated:
                break
        # Note that the observation on the done=True frame
        # doesn't matter (after an early break the buffer may still hold
        # frames written by a previous call).
        max_frame = self._obs_buffer.max(axis=0)

        return max_frame, total_reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Delegate to the wrapped environment's reset unchanged."""
        return self.env.reset(**kwargs)

class ClipRewardEnv(gym.RewardWrapper):
    """Reward wrapper that keeps only the sign of every reward."""

    def __init__(self, env):
        super().__init__(env)

    def reward(self, reward):
        """Map the incoming reward to -1, 0 or +1 according to its sign."""
        signed = np.sign(reward)
        return signed


In [9]:
from gym.wrappers import FrameStack

def AtariWrap(env):
    """Wrap ``env`` with Preprocessing followed by a 2-frame FrameStack."""
    # Currently disabled wrappers, kept for reference (they were listed ahead
    # of Preprocessing, in this order):
    #   MaxAndSkipEnv(env, skip=2)
    #   FireResetEnv(env)
    #   ClipRewardEnv(env)
    preprocessed = Preprocessing(env)
    return FrameStack(preprocessed, 2)


# NOTE: rebinds ``env`` to its wrapped version, so re-running this cell wraps it again.
env = AtariWrap(env)

In [10]:
def process_observation(obs):
    """Turn a stacked-frame observation into one batched float32 tensor on the CPU.

    Args:
        obs: Object exposing ``_frames``, a sequence of equally shaped torch
            tensors (one per stacked frame).

    Returns:
        A new ``torch.float32`` tensor of shape ``(1, n_frames, *frame_shape)``.
    """
    # Stack directly in torch; routing a list of tensors through np.array is
    # slow and only to convert straight back into a tensor.
    frames = [frame.detach().to('cpu') for frame in obs._frames]
    return torch.stack(frames).to(dtype=torch.float32).unsqueeze(0)

## Create a deep-learning model that takes a stack of 2 frames as input

In [11]:
class Agent(nn.Module):
    """Small convolutional Q-network: stacked frames in, one Q-value per action out.

    ``forward`` expects a 5-D tensor ``(batch, 2, 1, H, W)``; the two stacked
    frames are treated as the input channels of the 3-D convolutions, whose
    kernels have depth 1. ``linear1`` takes 200 features, which is what the
    convolution stack produces for 64x64 frames (2 channels x 1 x 10 x 10).
    """

    def __init__(self, n_actions):
        super().__init__()
        self.n = n_actions  # number of discrete actions

        self.conv1 = nn.Conv3d(2, 4, kernel_size=(1, 8, 8), stride=2)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv3d(4, 8, kernel_size=(1, 4, 4), stride=2)
        self.relu2 = nn.ReLU()
        self.conv3 = nn.Conv3d(8, 4, kernel_size=(1, 3, 3))
        self.relu3 = nn.ReLU()
        self.conv4 = nn.Conv3d(4, 2, kernel_size=(1, 2, 2))
        self.relu4 = nn.ReLU()

        self.bottleneck = nn.Flatten()

        self.linear1 = nn.Linear(200, 10)
        self.relu5 = nn.ReLU()
        self.linear2 = nn.Linear(10, n_actions)

    def forward(self, x):
        """Return Q-values of shape ``(batch, n_actions)`` for a batch of stacked frames."""
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.conv3(x)
        x = self.relu3(x)
        x = self.conv4(x)
        x = self.relu4(x)

        x = self.bottleneck(x)

        x = self.linear1(x)
        x = self.relu5(x)
        x = self.linear2(x)

        return x

    def select_action(self, observation, eps_greedy=True, eps=0.2):
        """Choose an action index for a single observation.

        Args:
            observation: Input accepted by ``forward`` with batch size 1.
            eps_greedy: When true, a uniformly random action is taken with
                probability ``eps``; when false the choice is always greedy.
            eps: Exploration probability used when ``eps_greedy`` is true.

        Returns:
            The action as a plain Python ``int`` on every path, so it can be
            handed straight to ``env.step``.
        """
        if eps_greedy and random.random() <= eps:
            return random.randint(0, self.n - 1)
        # Action selection never needs gradients.
        with torch.no_grad():
            q = self(observation).squeeze(0)
        return int(torch.argmax(q).item())

    def get_q_value(self, state, action):
        """Return the Q-value(s) of ``action`` in ``state``.

        With a 0-dim ``action`` the scalar ``Q(state[0], action)`` is returned.
        Otherwise ``action`` is used as a ``(batch, 1)`` index and the result
        has shape ``(batch, 1)``.
        """
        if len(action.shape) == 0:
            return self(state)[0][action.item()]
        else:
            return self(state).gather(dim=1, index=action)

## Custom loss function creation

In [12]:
def compute_q_loss(model, model_target, states, actions, rewards, next_states, dones, gamma, alpha):
    """One-step TD loss between the online Q-values and a target-network estimate.

    Args:
        model: Online network; must provide ``get_q_value(states, actions)``.
        model_target: Target network, called on ``next_states``.
        states, next_states: Batched inputs ``(batch, ...)`` with 5 dimensions;
            a 4-D single sample gets a batch axis added.
        actions: Indices of the actions taken, ``(batch, 1)`` (or 0-dim for a
            single sample).
        rewards: Rewards, one per sample.
        dones: Despite the name, a mask that is 1/True where the transition is
            NOT terminal and 0/False where it is (this is how the training loop
            fills it). It multiplies the bootstrap term.
        gamma: Discount factor.
        alpha: Unused; kept for interface compatibility.

    Returns:
        Scalar loss produced by the module-level ``loss_fn``.
    """
    if len(states.shape) != 5:
        states = states.unsqueeze(0)
        next_states = next_states.unsqueeze(0)

    # The bootstrap value is a fixed regression target: no gradients needed.
    # Take the max over actions separately for every sample (dim=1) rather
    # than one global maximum over the whole batch.
    with torch.no_grad():
        next_q = model_target(next_states.to(device)).max(dim=1, keepdim=True).values

    # Force every term to (batch, 1) so nothing broadcasts to (batch, batch).
    q_taken = model.get_q_value(states.to(device), actions.to(device)).reshape(-1, 1)
    not_terminal = dones.to(device).reshape(-1, 1)
    td_target = rewards.to(device).reshape(-1, 1) + gamma * next_q * not_terminal

    # The loss value itself is not clipped: clamping it to [-1, 1] would give
    # a zero gradient whenever the loss exceeds 1. Gradient clipping is
    # applied by the caller instead.
    return loss_fn(q_taken, td_target)

## Load model to continue training

In [13]:
def load_checkpoint(filepath):
    """Restore a model from a checkpoint file written with ``torch.save``.

    The checkpoint is a dict with two entries: ``'model'`` (a pickled module
    instance used as the architecture) and ``'state_dict'`` (the weights loaded
    into it).

    Args:
        filepath: Path of the checkpoint file.

    Returns:
        The restored module, in eval mode, with ``requires_grad`` enabled on
        all of its parameters.
    """
    # SECURITY: the checkpoint embeds a pickled module object, so loading it
    # runs the pickle machinery and can execute arbitrary code. Only load
    # files you created yourself or otherwise trust.
    # weights_only=False is spelled out because newer PyTorch releases default
    # to weights_only=True, which refuses to unpickle the stored module.
    checkpoint = torch.load(filepath, weights_only=False)
    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])
    model.requires_grad_(True)

    model.eval()

    return model
# model = load_checkpoint('/kaggle/input/newmodel18k/pytorch/ver1/1/checkpoint (31).pth')

## Training process

In [14]:
# Online Q-network with one output per environment action, placed on `device`.
model = Agent(env.action_space.n).to(device)
# Module-level loss; compute_q_loss looks this name up as a global when it is called.
loss_fn = nn.SmoothL1Loss()

In [15]:
def _sample_batch(replay_buffer, batch_size):
    """Draw a uniform random minibatch and collate it into batched tensors."""
    batch = random.sample(replay_buffer, batch_size)
    # Stored states already carry a leading batch axis of size 1, so
    # concatenating along dim 0 yields (batch, ...). The remaining fields are
    # 1-element tensors and stack to (batch, 1).
    states = torch.cat([tr[0] for tr in batch]).to(torch.float32)
    actions = torch.stack([tr[1] for tr in batch]).to(torch.int64)
    rewards = torch.stack([tr[2] for tr in batch]).to(torch.float32)
    next_states = torch.cat([tr[3] for tr in batch]).to(torch.float32)
    not_terminal = torch.stack([tr[4] for tr in batch]).to(torch.int64)
    return states, actions, rewards, next_states, not_terminal


def train(env, model, episodes, eps_start=0.9, alpha=1e-3, gamma=0.99, buffer_size=100000, batch_size=32, target_model_update=10000):
    """Train ``model`` with epsilon-greedy exploration, a replay buffer and a target network.

    Args:
        env: Environment following the gym 0.26 API
            (``step`` -> obs, reward, terminated, truncated, info).
        model: Online Q-network; trained in place.
        episodes: Number of episodes to run.
        eps_start: Initial exploration probability. It is lowered by
            ``1/episodes`` after each episode and never drops below 0.1.
        alpha: Only forwarded to ``compute_q_loss``; the optimizer's learning
            rate is fixed at 5e-5 and does not use it.
        gamma: Discount factor.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        batch_size: Minibatch size for each optimization step.
        target_model_update: Minimum number of environment steps between two
            target-network syncs (the check runs at the end of an episode).

    Returns:
        ``(model, history)`` where ``model`` is in eval mode and ``history``
        holds the total reward of every finished episode.
    """
    n_done = 0
    print("Started training")
    history = []

    model_target = copy.deepcopy(model).to(device)
    model.train()
    model.to(device)
    optim = Adam(params=model.parameters(), lr=5e-5)

    steps_since_sync = 0  # environment steps since the last target-network sync
    total_reward = 0
    eps = eps_start

    replay_buffer = deque(maxlen=buffer_size)

    while n_done < episodes:
        state, info = env.reset()
        state = process_observation(state).to(device)

        with torch.no_grad():
            # int() accepts both a Python int and a 0-dim tensor.
            action = int(model.select_action(state, eps=eps))

        for t in count(0, 1):
            steps_since_sync += 1
            next_state, reward, terminated, truncated, _info = env.step(action)
            next_state = process_observation(next_state).to(device)
            total_reward += reward

            # The last field is a "not terminal" mask. Truncation does not
            # clear it, so truncated transitions still bootstrap.
            replay_buffer.append((
                state.to('cpu'),
                torch.tensor([action], dtype=torch.int64),
                torch.tensor([reward], dtype=torch.float32),
                next_state.to('cpu'),
                torch.tensor([not terminated], dtype=torch.bool),
            ))

            # A truncated episode must end the loop too; the environment may
            # not be stepped again before the next reset.
            if terminated or truncated:
                history += [total_reward]
                if n_done % 10 == 0:
                    print("episode:", n_done, 'completed by:', t, 'avg. reward:', np.mean(history))

                if steps_since_sync > target_model_update:
                    with torch.no_grad():
                        model_target.load_state_dict(model.state_dict())
                    steps_since_sync = 0
                total_reward = 0
                eps = np.clip(eps - 1/episodes, 0.1, 1)
                n_done += 1
                break

            with torch.no_grad():
                next_action = int(model.select_action(next_state, eps=eps))
            state = next_state
            action = next_action

            # Periodic debug print of the current Q-values.
            if (n_done % 10 == 0) and t==100:
                with torch.no_grad():
                    print('q_values:', model(next_state))

            # Start learning only once the buffer holds more than 1000 transitions.
            if len(replay_buffer) > 1000:
                states, actions, rewards, next_states, dones = _sample_batch(replay_buffer, batch_size)

                loss = compute_q_loss(model, model_target, states, actions, rewards, next_states, dones, gamma, alpha)
                optim.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_value_(model.parameters(), 2)
                optim.step()
    model.eval()
    return model, history



In [16]:
# Run 250 training episodes. The recorded output below ends in a KeyboardInterrupt,
# i.e. this particular run was stopped by hand before it finished.
model, history = train(env=env, model=model, episodes=250, eps_start=0.7, alpha=1e-3, gamma=0.99, buffer_size=100000, batch_size=32, target_model_update=2000)

Started training


  if not isinstance(terminated, (bool, np.bool8)):


q_values: tensor([[-0.1928,  0.1244,  0.3228,  0.2973]])
episode: 0 completed by: 790 avg. reward: 2.0



KeyboardInterrupt



In [None]:
# Move the trained network to the CPU and switch it to eval mode before saving.
model.to('cpu')
model.eval()
# 'model' holds a fresh Agent used only as the architecture template that
# load_checkpoint() fills from 'state_dict'. Its size is taken from the trained
# model rather than hard-coded, so the two always match.
checkpoint = {'model': Agent(model.n), 'state_dict': model.state_dict()}

torch.save(checkpoint, 'checkpoint2.pth')