## Environment Setup

In [None]:
!pip install tensorboardX
!pip install pyglet==1.5.1
!pip install torchsummary
!pip install optuna
!pip install optuna-dashboard

In [None]:
!pip install torchrl
!pip install setuptools==65.5.1
!pip install gym==0.21.0
!pip install stable-baselines3[extra]
!pip install lz4

In [None]:
# !sudo apt-get install -y xvfb
!pip install pyvirtualdisplay

In [None]:
# Show the GPUs visible to this machine (sanity check before training on CUDA).
!nvidia-smi

## Imports

In [1]:
from pyvirtualdisplay import Display

# Start an invisible (visible=0) 1024x768 virtual display. This runs before any
# environment is created; presumably the env/video rendering needs a display
# on a headless machine - confirm.
virtual_display = Display(visible=0, size=(1024, 768))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7f9ff45ab610>

In [2]:
import os
import sys

# Absolute path of the `gym-tetris` directory that sits one level above the
# current working directory; adding it to sys.path makes the `gym_tetris`
# package inside it importable.
gym_tetris_parent_path = os.path.abspath(os.path.join(os.pardir, 'gym-tetris'))
sys.path.append(gym_tetris_parent_path)

In [3]:
import random
import time
from distutils.util import strtobool

# import gym
from gym import Wrapper, ObservationWrapper
from gym.wrappers import RecordEpisodeStatistics, RecordVideo, FrameStack
from gym.spaces import Box, Discrete

import numpy as np
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

# from stable_baselines3.common.buffers import ReplayBuffer

from tensordict import TensorDict
# from torchrl.data import PrioritizedReplayBuffer, ListStorage
from torchrl.data import TensorDictPrioritizedReplayBuffer, LazyMemmapStorage, LazyTensorStorage

from nes_py.wrappers import JoypadSpace
import gym_tetris
from gym_tetris.actions import SIMPLE_MOVEMENT

from torchsummary import summary
from collections import deque



## Model

In [4]:
class QNetwork(nn.Module):
    """Fully connected Q-value estimator over a stack of binary board frames.

    Args:
        actions_num: Number of discrete actions; size of the output layer.
        frame_stack: Number of stacked frames per observation. Each frame
            contributes 200 input features (the 20x10 board noted in the
            original comments).

    The layers live in ``self.network`` in a fixed order so that state-dict
    keys (``network.1.*``, ``network.3.*``, ``network.5.*``) stay stable for
    saved checkpoints.
    """

    def __init__(self, actions_num, frame_stack=1):
        super().__init__()
        input_features = 200 * frame_stack
        layers = [
            # (batch, frame_stack, 20, 10) -> (batch, 200 * frame_stack)
            nn.Flatten(),
            nn.Linear(input_features, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            # One Q-value per action.
            nn.Linear(512, actions_num),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values produced by the sequential stack for ``x``."""
        q_values = self.network(x)
        return q_values


def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
    """Linearly anneal a value from ``start_e`` towards ``end_e``.

    Args:
        start_e: Value at ``t == 0``.
        end_e: Floor of the schedule; the result is never below it.
        duration: Number of steps over which the value moves from
            ``start_e`` to ``end_e``. A non-positive duration means there is
            nothing to anneal, so ``end_e`` is returned directly instead of
            dividing by zero.
        t: Current step.

    Returns:
        ``max(start_e + slope * t, end_e)`` with
        ``slope = (end_e - start_e) / duration``.
    """
    if duration <= 0:
        return end_e
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)

In [5]:
# Convolutional on 20x10
# class QNetwork(nn.Module):
#     def __init__(self, actions_num, frame_stack=1):
#         super().__init__()
#         self.network =  nn.Sequential(
#             # (frame_stack, 20, 10)
#             nn.Conv2d(1, 32, 4, stride=2),
#             nn.ReLU(),
#             # (32, 9, 4)
#             nn.Conv2d(32, 64, 2, stride=1),
#             nn.ReLU(),
#             # (32, 8, 3)
#             nn.Flatten(),
#             # 768
#             nn.Linear(768, 512),
#             nn.ReLU(),
#             nn.Linear(512, actions_num)
#         )

#     def forward(self, x):
#         return self.network(x)


# def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
#     slope = (end_e - start_e) / duration
#     return max(slope * t + start_e, end_e)

## Training

In [6]:
class FrameSkipEnv(Wrapper):
    """Repeat each agent step over ``skip`` underlying env steps.

    The chosen action is sent only on the first of the ``skip`` steps; the
    remaining steps send action 0 (NOOP according to the original comment).
    Rewards are summed over the executed steps and stepping stops early as
    soon as the wrapped env reports ``done``.

    Args:
        env: Environment to wrap.
        skip: Number of underlying steps per call to ``step``; must be >= 1.

    Raises:
        ValueError: If ``skip`` is smaller than 1 (``step`` would otherwise
            return without ever stepping the wrapped env).
    """

    def __init__(self, env=None, skip=4):
        super(FrameSkipEnv, self).__init__(env)
        if skip < 1:
            raise ValueError(f"skip must be >= 1, got {skip}")
        self._skip = skip

    def step(self, action):
        """Run up to ``skip`` underlying steps and return the last
        observation, the summed reward, the last ``done`` flag and the last
        ``info`` dict."""
        total_reward = 0.0
        for i in range(self._skip):
            # Only do the action on the first frame (action 0 is always NOOP)
            real_action = action if i == 0 else 0
            obs, reward, done, info = self.env.step(real_action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info

    def reset(self, **kwargs):
        """Reset the wrapped env, forwarding any keyword arguments."""
        return self.env.reset(**kwargs)

In [7]:
# Pixel bounds of the playfield inside a rendered frame, laid out as
# (row_start, col_start, row_end, col_end): entries 0/2 are used on the first
# image axis and entries 1/3 on the second one by BinaryBoard.
GAME_BOX = (47, 95, 209, 176)
# Board size in cells: (rows, columns).
BOARD_SHAPE = (20, 10)
# Pixel stride of one board cell along the row and column axes.
y_step, x_step = (
    (GAME_BOX[2] - GAME_BOX[0]) // BOARD_SHAPE[0],
    (GAME_BOX[3] - GAME_BOX[1]) // BOARD_SHAPE[1],
)

# Given an image of the current board, obtain a binary (20x10) representation
class BinaryBoard(ObservationWrapper):
    """Observation wrapper that reduces a rendered frame to a board grid.

    The frame is averaged over its last axis, then sampled once per board
    cell (at the cell centre, using the module-level GAME_BOX / y_step /
    x_step geometry). Every sampled value above 1 is set to 1; values that
    are already <= 1 are left unchanged.
    """

    def __init__(self, env):
        super().__init__(env)
        self.observation_space = Box(low=0, high=1, shape=BOARD_SHAPE)

    def observation(self, obs):
        """Return the BOARD_SHAPE-sized grid sampled from ``obs``."""
        grayscale = np.mean(obs, axis=-1)
        # Start half a cell in so each sample falls on a cell centre.
        row_sel = slice(GAME_BOX[0] + (y_step // 2), GAME_BOX[2], y_step)
        col_sel = slice(GAME_BOX[1] + (x_step // 2), GAME_BOX[3], x_step)
        board = grayscale[row_sel, col_sel]
        assert board.shape == BOARD_SHAPE
        filled = board > 1
        board[filled] = 1
        return board

In [8]:
class TensorWrapper(ObservationWrapper):
    """Observation wrapper that returns observations as batched torch tensors.

    Args:
        env: Environment to wrap.
        device: Device the resulting tensor is moved to.
    """

    def __init__(self, env=None, device='cpu'):
        super().__init__(env)
        self._device = device

    def observation(self, obs):
        """Convert ``obs`` to an array, prepend a batch axis of size 1 and
        return it as a ``torch.Tensor`` on the configured device."""
        batched = np.array(obs)[np.newaxis, ...]
        return torch.Tensor(batched).to(self._device)

In [9]:
# Frame-skip factors used by get_env: INNER_SKIP is applied below the frame
# stack, OUTER_SKIP above it, so one agent step spans
# INNER_SKIP * OUTER_SKIP underlying env steps.
INNER_SKIP: int = 16
OUTER_SKIP: int = 3
# Making an environment
def get_env(env_id, seed, capture_video, run_name, video_freq=100, frame_stack=4, device='cpu'):
    """Build the fully wrapped, seeded Tetris environment.

    Args:
        env_id: Environment id passed to ``gym_tetris.make``.
        seed: Seed applied to the env, its action space and its observation space.
        capture_video: When true, record gameplay videos under ``videos/<run_name>``.
        run_name: Sub-directory name for recorded videos.
        video_freq: Record every ``video_freq``-th episode.
        frame_stack: Number of board frames stacked per observation.
        device: Device on which observation tensors are placed.

    Returns:
        The wrapped environment. Wrapper order (inner to outer): joypad action
        space, episode statistics, optional video recording, binary board,
        inner frame skip, frame stack, outer frame skip, tensor conversion.
    """
    def should_record(ep_num):
        return ep_num % video_freq == 0

    base_env = JoypadSpace(gym_tetris.make(env_id), SIMPLE_MOVEMENT)
    env = RecordEpisodeStatistics(base_env)
    if capture_video:
        env = RecordVideo(env, f"videos/{run_name}", episode_trigger=should_record)

    # Observation pipeline: raw frame -> board grid -> stacked, skipped tensors.
    env = BinaryBoard(env)
    env = FrameSkipEnv(env, skip=INNER_SKIP)
    env = FrameStack(env, frame_stack)
    env = FrameSkipEnv(env, skip=OUTER_SKIP)
    env = TensorWrapper(env, device)

    # Seed everything that samples random numbers.
    env.seed(seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    return env

In [10]:
# Each board cell is drawn as a SCALE_UP x SCALE_UP pixel square in the
# debugging videos.
SCALE_UP = 10
# Playback rate of those videos: one written frame per agent step, i.e. per
# INNER_SKIP * OUTER_SKIP underlying steps (60 is presumably the emulator's
# frame rate - confirm).
FPS = 60 / (INNER_SKIP * OUTER_SKIP)

In [11]:
# Evaluation
def evaluate(
    model: torch.nn.Module,
    env_id: str,
    eval_episodes: int,
    run_name: str,
    seed: int,
    device: torch.device = torch.device("cpu"),
    capture_video: bool = True,
    video_frequency: int = 1,
    frame_stack: int = 1
):
    """Play ``eval_episodes`` greedy episodes and return their final scores.

    Args:
        model: Q-network; it is switched to eval mode and left in that mode.
        env_id: Environment id forwarded to ``get_env``.
        eval_episodes: Number of episodes to play.
        run_name: Name forwarded to ``get_env`` for recorded gameplay videos.
        seed: Seed forwarded to ``get_env``.
        device: Device on which observations are placed.
        capture_video: When true, gameplay is recorded by the env and the
            model inputs are additionally written to
            ``eval_episode<N>.mp4``.
        video_frequency: Forwarded to ``get_env`` as the recording frequency.
        frame_stack: Number of stacked frames per observation.

    Returns:
        List with the ``score`` entry of the final ``info`` dict of each episode.
    """
    env = get_env(env_id, seed, capture_video, run_name, video_frequency, frame_stack, device=device)

    model.eval()

    # cv2 expects (width, height).
    frame_size = (BOARD_SHAPE[1]*SCALE_UP, BOARD_SHAPE[0]*SCALE_UP)

    scores = []
    try:
        for episode in range(eval_episodes):
            out = None
            if capture_video:
                out = cv2.VideoWriter(f'eval_episode{episode}.mp4', cv2.VideoWriter_fourcc(*'mp4v'), FPS, frame_size, False)

            try:
                obs = env.reset()
                done = False
                while not done:
                    if out is not None:
                        # Same conversion as during training: moves the tensor
                        # to the CPU, keeps the newest frame, scales it to
                        # 0/255 and enlarges it to the writer's frame size.
                        write_obs(out, obs, frame_stack, SCALE_UP)

                    # Pure inference: no autograd graph needed.
                    with torch.no_grad():
                        q_values = model(obs)
                    action = int(torch.argmax(q_values))
                    obs, _, done, info = env.step(action)
            finally:
                # Finalise the video file even if the episode raised.
                if out is not None:
                    out.release()

            print(f"eval_episode={len(scores)}, score={info.get('score')}, episodic_return={info.get('episode')['r']}")
            scores.append(info.get("score"))
    finally:
        env.close()
    return scores

In [12]:
def write_obs(out, obs, frame_stack : int = 1, scale_up : int = 1):
    """Write the newest frame of a batched observation to a video writer.

    Args:
        out: Object with a ``write(image)`` method (e.g. ``cv2.VideoWriter``).
        obs: Tensor whose first axis is a batch axis of size 1, followed
            either by a single 2-D frame or by a stack of 2-D frames.
        frame_stack: Kept for backward compatibility. The frame axis is now
            detected from the data, so a stack of size 1 is handled too
            (previously the singleton frame axis was kept and a 3-D array was
            written).
        scale_up: Integer factor by which each pixel is repeated along both
            image axes.
    """
    img = obs.detach().cpu().numpy()[0].astype('uint8')*255
    # A remaining leading axis is the frame stack: keep only the latest frame.
    if img.ndim == 3:
        img = img[-1]
    if scale_up > 1:
        img = np.repeat(np.repeat(img, scale_up, axis=0), scale_up, axis=1)
    out.write(img)

In [13]:
def write_scalars(writer, global_step, info, epsilon):
    """Log the end-of-episode metrics to ``writer`` at ``global_step``.

    Args:
        writer: Object with an ``add_scalar(tag, value, step)`` method.
        global_step: Step value attached to every scalar.
        info: Final ``info`` dict of the episode; must contain an ``episode``
            entry with keys ``"r"`` and ``"l"``; ``score`` is read with ``get``.
        epsilon: Current exploration rate.
    """
    episode_stats = info.get("episode")
    metrics = [
        ("charts/episodic_return", episode_stats["r"]),
        ("charts/episodic_length", episode_stats["l"]),
        ("charts/epsilon", epsilon),
        ("charts/score", info.get("score")),
    ]
    for tag, value in metrics:
        writer.add_scalar(tag, value, global_step)

In [14]:
# Single env training without optuna - for simplicity
def train(args, start_model_path=None):
    """Train a DQN agent with a prioritized replay buffer on a single env.

    Exploration is decided once per piece: whenever the ``piece_count``
    reported by the env changes, a new epsilon is computed and the agent
    either explores (random actions) or exploits (greedy actions) until the
    next piece appears.

    Args:
        args: Settings/hyper-parameter object (see ``Args``).
        start_model_path: Optional path of a saved state dict used to
            initialise the Q-network.

    Side effects:
        Writes TensorBoard logs, ``.best`` / ``.backup`` / final model files
        under ``runs/<run_name>``, optional ``episode<N>.mp4`` model-input
        videos in the working directory, and (if ``args.save_model``) runs
        ``evaluate`` on the trained network.
    """
    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{args.run_id}"
    prefix = ""

    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # TRY NOT TO MODIFY: seeding
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    device_name = "cuda" if torch.cuda.is_available() and args.cuda else "cpu"
    device_name = "mps" if torch.backends.mps.is_available() and args.mps else device_name
    device = torch.device(device_name)

    print("device_name:", device_name)

    # env setup
    env = get_env(args.env_id, args.seed, args.capture_video, run_name, args.video_frequency, args.frame_stack, device=device)
    assert isinstance(env.action_space, Discrete), "only discrete action space is supported"

    q_network = QNetwork(env.action_space.n, args.frame_stack).to(device)
    if start_model_path is not None:
        # map_location lets a checkpoint saved on another device be loaded here.
        state_dict = torch.load(start_model_path, map_location=device)
        q_network.load_state_dict(state_dict)

    optimizer = optim.Adam(q_network.parameters(), lr=args.learning_rate)
    target_network = QNetwork(env.action_space.n, args.frame_stack).to(device)
    target_network.load_state_dict(q_network.state_dict())

    summary(q_network, input_size=(args.frame_stack, *BOARD_SHAPE), batch_size=args.batch_size, device=device_name)

    rb = TensorDictPrioritizedReplayBuffer(
        alpha=args.alpha,
        beta=args.beta,
        storage=LazyTensorStorage(args.buffer_size, device=device),
        batch_size=args.batch_size,
        prefetch=args.prefetch
    )

    def weighted_mse_loss(input, target, weight):
        # Squared TD errors weighted by the (normalised) importance weights.
        return torch.sum(weight * (input - target) ** 2)

    def open_input_video(episode_idx):
        # cv2 expects (width, height); the final False selects grayscale frames.
        frame_size = (BOARD_SHAPE[1]*SCALE_UP, BOARD_SHAPE[0]*SCALE_UP)
        return cv2.VideoWriter(f'episode{episode_idx}.mp4', cv2.VideoWriter_fourcc(*'mp4v'), FPS, frame_size, False)

    obs = env.reset()

    # Tracks number of episodes simulated
    episode_cnt = 0
    # Tracks the number of pieces we have played
    piece_count = 0
    # Whether we explore (play random moves) or exploit (play according to the model)
    explore = True
    # Defined up front so logging works even if an episode ends before the
    # first piece change assigns it.
    epsilon = args.start_e
    info = None

    # Track the best scoring models
    scores = deque(maxlen=args.mean_score_count)
    best_mean_score = -1.0

    # Writer for the model-input video of the current episode, or None when
    # the current episode is not being captured.
    out = open_input_video(0) if args.capture_inputs_video else None

    prev_time = time.time()

    for global_step in range(args.total_timesteps):

        if global_step > 0 and global_step % 100 == 0:
            curr_time = time.time()
            writer.add_scalar("charts/SPS", 100 / (curr_time - prev_time), global_step)
            prev_time = curr_time

        if out is not None:
            write_obs(out, obs, args.frame_stack, SCALE_UP)

        # If a new piece has been generated, decide whether we will explore or exploit for this piece
        if (info is not None) and (piece_count != info.get("piece_count")):
            piece_count = info.get("piece_count")
            if global_step < args.learning_starts:
                epsilon = args.start_e
            else:
                duration = args.exploration_fraction * (args.total_timesteps - args.learning_starts)
                epsilon = linear_schedule(args.start_e, args.end_e, duration, global_step - args.learning_starts)
            explore = (random.random() < epsilon)

        # Find the next action to play
        if explore:
            action = env.action_space.sample()
        else:
            # Action selection is pure inference; skip building an autograd graph.
            with torch.no_grad():
                q_values = q_network(obs)
            action = int(torch.argmax(q_values))

        # Play a step with the given action
        next_obs, reward, done, info = env.step(action)

        # Store every transition, including the terminal one: otherwise the
        # buffer never contains done=1 and the reward of the final step is
        # never learned from.
        data = TensorDict({"obs" : obs,
                           "next_obs" : next_obs,
                           "action" : [action],
                           "reward" : [reward],
                           "done" : [int(done)]},
                           batch_size=1, device=device)
        rb.add(data)

        if not done:
            obs = next_obs
        else:
            print(f"Episode {episode_cnt} completed: {prefix}global_step={global_step},\tepisodic_return={info.get('episode')['r']:.1f},\tscore={info.get('score')}")
            write_scalars(writer, global_step, info, epsilon)

            episode_cnt += 1

            scores.append(info.get("score"))
            if episode_cnt > args.mean_score_count:
                mean_score = sum(scores) / args.mean_score_count
                if mean_score > best_mean_score:
                    best_mean_score = mean_score
                    if global_step > args.learning_starts:
                        print(f"New best mean score: {mean_score}")
                        # Keep a backup of the best scoring model
                        best_model_path = f"runs/{run_name}/{args.exp_name}.best"
                        torch.save(q_network.state_dict(), best_model_path)

            if args.capture_inputs_video:
                # Finalise the finished episode's video before switching files.
                if out is not None:
                    out.release()
                if episode_cnt % args.video_frequency == 0:
                    out = open_input_video(episode_cnt)
                else:
                    out = None

            if episode_cnt % args.reload_env_frequency == 0:
                num_reloads = episode_cnt // args.reload_env_frequency
                # Close the old env explicitly so its resources are released
                # before the replacement is created.
                env.close()
                del env
                env = get_env(args.env_id, args.seed, args.capture_video, f"{run_name}_{num_reloads}", args.video_frequency, args.frame_stack, device=device)

            obs = env.reset()

        # Training Logic
        if global_step > args.learning_starts:
            if global_step % args.train_frequency == 0:
                data = rb.sample()

                with torch.no_grad():
                    target_max, _ = target_network(data.get("next_obs")).max(dim=1)
                    td_target = data.get("reward").flatten() + args.gamma * target_max * (1 - data.get("done").flatten())
                old_val = q_network(data.get("obs")).gather(1, data.get("action")).squeeze()
                # loss = F.mse_loss(old_val, td_target)

                # Importance-sampling weights, flattened like reward/done so
                # they line up element-wise with the per-sample TD errors,
                # then normalised to sum to 1.
                weights = data.get("_weight").flatten()
                weights = weights / torch.sum(weights)
                loss = weighted_mse_loss(old_val, td_target, weights)

                # optimize the model
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Update data priority (detached: priorities need no gradient)
                td_error = torch.abs(old_val - td_target).detach().unsqueeze(1)
                data.set("td_error", td_error)
                rb.update_tensordict_priority(data)

                # Log training statistics
                if global_step % 100 == 0:
                    writer.add_scalar("losses/td_loss", loss, global_step)
                    writer.add_scalar("losses/q_values", old_val.mean().item(), global_step)

            # update target network
            if global_step % args.target_network_frequency == 0:
                for target_network_param, q_network_param in zip(target_network.parameters(), q_network.parameters()):
                    target_network_param.data.copy_(
                        args.tau * q_network_param.data + (1.0 - args.tau) * target_network_param.data
                    )

            if global_step % args.backup_frequency == 0:
                best_model_path = f"runs/{run_name}/{args.exp_name}.backup"
                torch.save(q_network.state_dict(), best_model_path)

    # Finalise the last model-input video, if one is still open.
    if out is not None:
        out.release()

    if args.save_model:
        model_path = f"runs/{run_name}/{args.exp_name}.cleanrl_model"
        torch.save(q_network.state_dict(), model_path)
        print(f"{prefix}model saved to {model_path}")

        scores = evaluate(
            q_network,
            args.env_id,
            args.eval_episodes,
            run_name=f"{run_name}-eval",
            seed=args.seed,
            device=device,
            capture_video=args.capture_video,
            frame_stack=args.frame_stack
        )

        print("Eval Scores:", scores)

    env.close()
    writer.close()

## Main

In [15]:
class Args:
    """Plain container for run settings and DQN hyper-parameters.

    Every attribute is set in ``__init__``; ``train`` logs all of them via
    ``vars(args)``.
    """

    def __init__(self):
        # ---- Settings ----
        self.exp_name = "Tetris_DQN"            # experiment name used in run and file names
        self.run_id = int(time.time())          # creation time in whole seconds
        self.torch_deterministic = True         # value for torch.backends.cudnn.deterministic
        self.cuda = True                        # use CUDA when available
        self.mps = False                        # use MPS when available
        self.capture_video = True               # record gameplay videos through the env
        self.capture_inputs_video = True        # also record the model-input frames
        self.save_model = True                  # save and evaluate the final model
        self.eval_episodes = 1                  # episodes played by the final evaluation
        self.backup_frequency = 10000           # steps between ".backup" checkpoints
        self.mean_score_count = 50              # window size of the rolling mean score
        self.video_frequency = 50               # record every N-th episode
        self.reload_env_frequency = 49          # recreate the env every N episodes
        self.prefetch = None                    # forwarded to the replay buffer

        # ---- Hyper-Parameters ----
        self.env_id = "TetrisA-v5"
        self.frame_stack = 6                    # frames stacked per observation
        self.seed = 2
        self.total_timesteps = 500_000
        self.learning_rate = 2e-4
        self.buffer_size = 20_000               # replay buffer capacity
        self.learning_starts = 20_000           # steps played before any training
        self.train_frequency = 1                # steps between gradient updates
        self.gamma = 0.99                       # discount factor
        self.tau = 0.999                        # weight of the online net in the target update
        self.alpha = 0.5                        # forwarded to the prioritized replay buffer
        self.beta = 0.5                         # forwarded to the prioritized replay buffer
        self.target_network_frequency = 1000    # steps between target-network updates
        self.batch_size = 32
        self.start_e = 1                        # initial exploration rate
        self.end_e = 0.1                        # final exploration rate
        self.exploration_fraction = 0.2         # share of post-warm-up steps spent annealing epsilon


args = Args()

In [16]:
!rm -r runs/* videos/* model_input_videos/* episode*.mp4

rm: cannot remove 'model_input_videos/*': No such file or directory


In [17]:
# prev_model_path = f"/mnt/c/Users/User/OneDrive/Documents/GitHub/AtariGameMaster/ImportantMilestones/Squares_only_binary_board/model"
# train(args, start_model_path=prev_model_path)

In [18]:
# Train from scratch with the settings in `args`.
# NOTE(review): the stored output of this cell ends in a TypeError from
# get_env ("multiple values for argument 'device'") that the current get_env
# signature and its call sites should not raise - it looks stale; re-run from
# a fresh kernel to confirm.
train(args)

device_name: cuda
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                 [32, 1200]               0
            Linear-2                 [32, 1024]       1,229,824
              ReLU-3                 [32, 1024]               0
            Linear-4                  [32, 512]         524,800
              ReLU-5                  [32, 512]               0
            Linear-6                    [32, 6]           3,078
Total params: 1,757,702
Trainable params: 1,757,702
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.15
Forward/backward pass size (MB): 1.04
Params size (MB): 6.71
Estimated Total Size (MB): 7.90
----------------------------------------------------------------
Episode 0 completed: global_step=244,	episodic_return=-55.7,	score=0
Episode 1 completed: global_step=422,	episodic_return=-69.2,	score=0
Episode 2 

TypeError: get_env() got multiple values for argument 'device'

In [None]:
# Reload the ".best" checkpoint written by train() for the run described by
# `args`. Fall back to the CPU when CUDA is unavailable, and remap the saved
# tensors onto the selected device while loading.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{args.run_id}"
model_path = f"runs/{run_name}/{args.exp_name}.best"
state_dict = torch.load(model_path, map_location=device)
net = QNetwork(len(SIMPLE_MOVEMENT), args.frame_stack).to(device)
net.load_state_dict(state_dict)

In [None]:
# Play one recorded evaluation episode with the reloaded network.
score = evaluate(
    net,
    args.env_id,
    eval_episodes=1,
    run_name=f'{run_name}2-eval',
    seed=2,
    device=device,
    capture_video=True,
    video_frequency=1,
    frame_stack=args.frame_stack,
)

In [None]:
# Load the TensorBoard notebook extension and open it on the `runs` log
# directory that train() writes to.
%load_ext tensorboard
%tensorboard --logdir runs