# Installs

In [None]:
!pip install tensorboardX
!pip install pyglet==1.5.1
!pip install torchsummary
!pip install optuna
!pip install optuna-dashboard
!pip install torchrl
!pip install setuptools==65.5.1
!pip install gym==0.21.0
!pip install stable-baselines3[extra]
!pip install lz4
!sudo apt-get install -y xvfb
!pip install pyvirtualdisplay

In [None]:
!nvidia-smi

# Imports

In [1]:
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1024, 768))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7f5234236eb0>

In [2]:
import sys
import os

# Get the absolute path to the parent directory of gym-tetris
gym_tetris_parent_path = os.path.abspath(os.path.join('..', 'gym-tetris'))

# Append the path to the sys.path
sys.path.append(gym_tetris_parent_path)

In [3]:
import random
import time
from distutils.util import strtobool

from gym import Wrapper, ObservationWrapper
from gym.wrappers import RecordEpisodeStatistics, RecordVideo, FrameStack
from gym.spaces import Box, Discrete

from nes_py.wrappers import JoypadSpace
from gym_tetris.actions import SIMPLE_MOVEMENT
from gym_tetris.tetris_env import TetrisEnv

import numpy as np
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

# from stable_baselines3.common.buffers import ReplayBuffer

from tensordict import TensorDict
from torchrl.data import TensorDictPrioritizedReplayBuffer, LazyTensorStorage
# from torchrl.data import PrioritizedReplayBuffer, ListStorage, LazyMemmapStorage

from torchsummary import summary
from collections import deque



In [4]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

# Model

In [5]:
class QNetwork(nn.Module):
    def __init__(self, net_fn, num_actions, frame_stack):
        super().__init__()
        self.network = net_fn(num_actions, frame_stack)

    def forward(self, x):
        return self.network(x)

In [6]:
# Small network
def small(num_actions, frame_stack):
    return  nn.Sequential(
            # (frame_stack, 20, 10)
            nn.Flatten(),
            # 200 x frame_stack
            nn.Linear(200*frame_stack, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions),
        )

In [7]:
# Large network
def large(num_actions, frame_stack):
    return  nn.Sequential(
            # (frame_stack, 20, 10)
            nn.Flatten(),
            # 200 x frame_stack
            nn.Linear(200*frame_stack, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions),
        )

In [8]:
# Convolutional network
def conv(num_actions, frame_stack):
    return  nn.Sequential(
            # (frame_stack, 20, 10)
            nn.Conv2d(frame_stack, 16, 3, stride=1),
            nn.ReLU(),
            # (16, 18, 8)
            nn.Conv2d(16, 32, 3, stride=1),
            nn.ReLU(),
            # (32, 16, 6)
            nn.Conv2d(32, 32, 2, stride=1),
            nn.ReLU(),
            # (32, 15, 5)
            nn.Flatten(),
            # 2400
            nn.Linear(2400, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions)
        )

In [9]:
def get_model_class(model_name="small"):
    if model_name == "small":
        class SmallQNetwork(QNetwork):
            def __init__(self, num_actions, frame_stack):
                super().__init__(small, num_actions, frame_stack)
        return SmallQNetwork
    elif model_name == "large":
        class LargeQNetwork(QNetwork):
            def __init__(self, num_actions, frame_stack):
                super().__init__(large, num_actions, frame_stack)
        return LargeQNetwork
    elif model_name == "conv":
        class ConvQNetwork(QNetwork):
            def __init__(self, num_actions, frame_stack):
                super().__init__(conv, num_actions, frame_stack)
        return ConvQNetwork
    else:
        print("Not a valid architecture")


# Environment

In [10]:
# DEVICE_NAME = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE_NAME = "cpu"
DEVICE = torch.device(DEVICE_NAME)

In [11]:
# Frame Skip
class FrameSkipEnv(Wrapper):
    def __init__(self, env=None, skip=4):
        super(FrameSkipEnv, self).__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = None
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info

    def reset(self):
        obs = self.env.reset()
        return obs

In [12]:
# Board Constants
GAME_BOX = 47, 95, 209, 176
BOARD_SHAPE = 20, 10
y_step = (GAME_BOX[2] - GAME_BOX[0]) // BOARD_SHAPE[0]
x_step = (GAME_BOX[3] - GAME_BOX[1]) // BOARD_SHAPE[1]

In [13]:
# Binary Board
class BinaryBoard(ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = Box(0, 1, BOARD_SHAPE)

    def observation(self, obs):
        # Given an image of the current board, obtain a binary (20x10) representation
        gray = np.mean(obs, axis=-1)
        cropped = gray[GAME_BOX[0]+(y_step//2) : GAME_BOX[2] : y_step,
                       GAME_BOX[1]+(x_step//2) : GAME_BOX[3] : x_step]
        assert cropped.shape == BOARD_SHAPE
        cropped[cropped > 1] = 1
        return cropped

In [14]:
# Tensor Wrapper
class TensorWrapper(ObservationWrapper):
    def __init__(self, env=None):
        super(TensorWrapper, self).__init__(env)
    
    def observation(self, obs):
        np_obs = np.array([obs])
        return torch.Tensor(np_obs).to(DEVICE)

In [15]:
# Get Environment
FRAME_SKIP = 6
# Making an environment
def get_env(args, run_name : str = "run"):
    env = TetrisEnv(
        line_weight=args.line_weight,
        height_weight=args.height_weight,
        cost_weight=args.cost_weight,
        holes_weight=args.holes_weight,
        bumpiness_weight=args.bumpiness_weight,
        col_transitions_weight=args.col_transitions_weight,
        row_transitions_weight=args.row_transitions_weight,
    )

    env = RecordEpisodeStatistics(env)
    if args.capture_video:
        env = RecordVideo(env, f"videos/{run_name}", episode_trigger=lambda ep_num: ep_num % args.video_frequency == 0)
    
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = FrameSkipEnv(env, skip=FRAME_SKIP)
    env = BinaryBoard(env)
    env = FrameStack(env, args.frame_stack)
    env = TensorWrapper(env)

    env.seed(args.seed)
    env.action_space.seed(args.seed)
    env.observation_space.seed(args.seed)
    return env

# Training

In [16]:
# Epsilon scheduling
def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)

In [17]:
# Video Constants
FPS = 60 / FRAME_SKIP
SCALE_UP = 10

In [18]:
# Write the current observation into the video
def write_video_frame(out, obs, frame_stack : int = 1, scale_up : int = 1):
    img = np.array(obs.cpu(), dtype='uint8')[0] * 255
    if frame_stack > 1:
        img = img[-1]
    if scale_up > 1:
        img = np.repeat(np.repeat(img, scale_up, axis=0), scale_up, axis=1)
    out.write(img)

In [19]:
# Evaluation
def evaluate(args, model: torch.nn.Module, eval_name : str = "eval", no_video : bool = False):
    env = get_env(args, run_name=eval_name) 
    
    model.eval()

    total_lines = 0.0
    for episode in range(args.eval_episodes):
        if args.capture_video and not no_video:
            out = cv2.VideoWriter(f'eval_episode{episode}.mp4', cv2.VideoWriter_fourcc(*'mp4v'), FPS, (BOARD_SHAPE[1]*SCALE_UP, BOARD_SHAPE[0]*SCALE_UP), False)
        
        obs = env.reset()
        done = False
        while not done:
            if args.capture_video and not no_video:
                write_video_frame(out, obs, args.frame_stack, SCALE_UP)

            q_values = model(obs)
            action = int(torch.argmax(q_values))
            obs, _, done, info = env.step(action)
        
        print(f"eval_episode={episode}, lines={info.get('lines')}, episodic_return={info.get('episode')['r']}")
        total_lines += info.get('lines')
    env.close()
    mean_lines = total_lines / args.eval_episodes
    return mean_lines

In [20]:
def write_episode_scalars(writer, global_step, info, epsilon):
    writer.add_scalar("charts/episodic_return", info.get("episode")["r"], global_step)
    writer.add_scalar("charts/episodic_length", info.get("episode")["l"], global_step)
    writer.add_scalar("charts/epsilon", epsilon, global_step)
    # writer.add_scalar("charts/score", info.get("score"), global_step)
    writer.add_scalar("charts/lines", info.get("lines"), global_step)

In [21]:
# Single env training without optuna - for simplicity
def train(args, start_model_path=None, trial=None):
  try:
    run_name = f"{args.exp_name}__{args.seed}__{args.run_id}"
    prefix = ""

    if trial:
      run_name += f"_trial_{trial.number}"
      prefix = f"trial_{trial.number}: "
    
    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # TRY NOT TO MODIFY: seeding
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    # env setup
    env = get_env(args, run_name=f"{run_name}_0")
    assert isinstance(env.action_space, Discrete), "only discrete action space is supported"

    q_network = args.model(env.action_space.n, args.frame_stack).to(DEVICE)
    if start_model_path is not None:
        state_dict = torch.load(start_model_path)
        q_network.load_state_dict(state_dict)
    
    optimizer = optim.Adam(q_network.parameters(), lr=args.learning_rate)
    target_network = args.model(env.action_space.n, args.frame_stack).to(DEVICE)
    target_network.load_state_dict(q_network.state_dict())

    # summary(q_network, input_size=(args.frame_stack, *BOARD_SHAPE), batch_size=args.batch_size, device=device_name)

    rb = TensorDictPrioritizedReplayBuffer(
        alpha=args.alpha,
        beta=args.beta,
        storage=LazyTensorStorage(args.buffer_size, device=DEVICE),
        batch_size=args.batch_size,
        prefetch=args.prefetch
    )

    obs = env.reset()

    # Tracks number of episodes simulated
    episode_cnt = 0
    # Tracks the number of pieces we have played
    piece_count = 0
    # Whether we explore (play random moves) or exploit (play according to the model)
    explore = True
    info = None

    # Track the best scoring models
    episode_lines = deque(maxlen=args.mean_lines_count)
    best_mean_lines = -1.0

    eval_idx = 1

    if args.capture_inputs_video:
        out = cv2.VideoWriter(f'episode0.mp4', cv2.VideoWriter_fourcc(*'mp4v'), FPS, (BOARD_SHAPE[1]*SCALE_UP, BOARD_SHAPE[0]*SCALE_UP), False)

    sps_time = time.time()

    for global_step in range(args.total_timesteps):

        if global_step > 0 and global_step % 1000 == 0:
           curr_time = time.time()
           writer.add_scalar("charts/SPS", 1000 / (curr_time - sps_time), global_step)
           sps_time = curr_time
        
        if args.capture_inputs_video and (episode_cnt % args.video_frequency == 0):
            write_video_frame(out, obs, args.frame_stack, SCALE_UP)
        
        # If a new piece has been generated, decide wether we will explore or exploit for this piece
        if global_step > 0 and piece_count != info.get("piece_count"):
            piece_count = info.get("piece_count")
            if global_step < args.learning_starts:
                epsilon = args.start_e
            else:
                duration = args.exploration_fraction * (args.total_timesteps - args.learning_starts)
                epsilon = linear_schedule(args.start_e, args.end_e, duration, global_step - args.learning_starts)
            explore = (random.random() < epsilon)

        # Find the next action to play
        if explore:
            action = env.action_space.sample()
        else:
            q_values = q_network(obs)
            action = int(torch.argmax(q_values))
        
        # Play a step with the given action
        next_obs, reward, done, info = env.step(action)

        # Evaluate and report the agent periodically
        if trial and global_step > 0 and global_step % args.eval_frequency == 0 and global_step < (args.total_timesteps - args.total_evaluations):
            no_video = not (args.eval_video_frequency % eval_idx == 0)
            eval_mean_lines = evaluate(args, model=q_network, eval_name=f"{run_name}-eval-{eval_idx}", no_video=no_video)
            print(f"{prefix}evaluation_{eval_idx} mean_lines={eval_mean_lines}")
            trial.set_user_attr("mean_lines", eval_mean_lines)
            eval_idx += 1

            # Check if the trial should be pruned
            if trial and trial.should_prune():
                print(f"Pruning Run: {run_name}")
                raise optuna.exceptions.TrialPruned()

        if not done:
            # Add observation to replay buffer
            data = TensorDict({"obs" : obs,
                               "next_obs" : next_obs,
                               "action" : [action],
                               "reward" : [reward],
                               "done" : [int(done)]},
                               batch_size=1, device=DEVICE)
            rb.add(data)
            obs = next_obs
        else:
            # print(f"Episode {episode_cnt} completed: {prefix}global_step={global_step},\tepisodic_return={info.get('episode')['r']:.1f},\tscore={info.get('score')}")
            # print(f"Episode {episode_cnt} completed: {prefix}global_step={global_step},\tepisodic_return={info.get('episode')['r']:.1f},\tLines_Cleared={info.get('lines')}")
            write_episode_scalars(writer, global_step, info, epsilon)

            episode_cnt += 1

            episode_lines.append(info.get("lines"))
            if episode_cnt > args.mean_lines_count:
                curr_mean_lines = sum(episode_lines) / args.mean_lines_count
                if curr_mean_lines > best_mean_lines:
                    best_mean_lines = curr_mean_lines
                    if global_step > args.learning_starts:
                        print(f"New best mean lines: {curr_mean_lines}")
                        # Keep a backup of the best scoring model
                        best_model_path = f"runs/{run_name}/{args.exp_name}.best"
                        torch.save(q_network.state_dict(), best_model_path)

            if args.capture_inputs_video:
                if episode_cnt % args.video_frequency == 0:
                    out = cv2.VideoWriter(f'episode{episode_cnt}.mp4', cv2.VideoWriter_fourcc(*'mp4v'), FPS, (BOARD_SHAPE[1]*SCALE_UP, BOARD_SHAPE[0]*SCALE_UP), False)
                else:
                    out = None
            
            if episode_cnt % args.reload_env_frequency == 0:
                num_reloads = episode_cnt // args.reload_env_frequency
                del env
                env = get_env(args, run_name=f"{run_name}_{num_reloads}")
            
            obs = env.reset()

        # Training Logic
        if global_step > args.learning_starts:
            
            if global_step % args.train_frequency == 0:
                data = rb.sample()
                with torch.no_grad():
                    target_max, _ = target_network(torch.squeeze(data.get("next_obs"))).max(dim=1)
                    td_target = data.get("reward").flatten() + args.gamma * target_max * (1 - data.get("done").flatten())
                model_output = q_network(torch.squeeze(data.get("obs"))).gather(1, data.get("action")).squeeze()
                
                loss = F.mse_loss(model_output, td_target)

                # Not sure about this part
                # weights = data.get("_weight")
                # loss = torch.sum(weights * torch.square(old_val - td_target)) / torch.sum(weights)

                # optimize the model
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                
                # Update data priority
                td_error = torch.abs(model_output - td_target).unsqueeze(1)
                data.set("td_error", td_error)
                rb.update_tensordict_priority(data)

                # Log training statistics
                if global_step % (100 * args.train_frequency) == 0:
                    writer.add_scalar("losses/td_loss", loss, global_step)
                    writer.add_scalar("losses/q_values", model_output.mean().item(), global_step)

            # update target network
            if global_step % args.target_network_frequency == 0:
                for target_network_param, q_network_param in zip(target_network.parameters(), q_network.parameters()):
                    target_network_param.data.copy_(
                        args.tau * q_network_param.data + (1.0 - args.tau) * target_network_param.data
                    )

            if global_step % args.backup_frequency == 0:
                backup_model_path = f"runs/{run_name}/{args.exp_name}.backup"
                torch.save(q_network.state_dict(), backup_model_path)

    if args.save_model:
        final_model_path = f"runs/{run_name}/{args.exp_name}.final"
        torch.save(q_network.state_dict(), final_model_path)
        print(f"{prefix}model saved to {final_model_path}")

        final_mean_lines = evaluate(
            args,
            model=q_network,
            eval_name=f"{run_name}-eval"
        )

        final_mean_lines = np.mean(final_mean_lines)
        print(f"{prefix}evaluation_{eval_idx}: mean_lines={final_mean_lines}")
        if trial:
            trial.set_user_attr("mean_lines", float(final_mean_lines))
        
  except:
    raise
  finally:
    env.close()
    writer.close()

# Optuna

In [22]:
# Optuna constants
N_TRIALS = 100              # Maximum number of trials
N_TIMESTEPS = 500_000       # Maximum number of time steps per trial
N_JOBS = 4                  # Number of jobs to run in parallel
N_STARTUP_TRIALS = 4        # Stop random sampling after N_STARTUP_TRIALS

In [23]:
class Args:
    def __init__(self):
        # Settings
        self.exp_name = "Tetris_DQN"
        self.run_id = int(time.time())
        self.torch_deterministic = True
        self.capture_video = True
        self.capture_inputs_video = True
        self.save_model = True
        self.backup_frequency = 50000
        self.mean_lines_count = 50
        self.video_frequency = 100
        self.eval_video_frequency = 5
        self.reload_env_frequency = 49
        self.prefetch = 3

        # Constant Hyper-Parameters
        self.seed = 2
        self.total_timesteps = N_TIMESTEPS
        self.buffer_size = 40_000
        self.learning_starts = 20_000
        self.train_frequency = 1
        self.start_e = 1.0
        self.batch_size = 32

        # Evaluation
        self.total_evaluations = 3
        self.eval_episodes = 5
        self.eval_frequency = self.total_timesteps // self.total_evaluations

        # Optimizable Hyper-Parameters
        self.model = None
        self.learning_rate = None
        self.gamma = None
        self.tau = None
        self.alpha = None
        self.beta = None
        self.frame_stack = None
        self.target_network_frequency = None
        self.end_e = None
        self.exploration_fraction = None

        # Reward weights
        self.line_weight = None
        self.height_weight = None
        self.cost_weight = None
        self.holes_weight = None
        self.bumpiness_weight = None
        self.col_transitions_weight = None
        self.row_transitions_weight = None

args = Args()

In [24]:
def sample_params(trial: optuna.Trial) -> dict:
    params = {
        "model" : get_model_class(trial.suggest_categorical("model", ["small", "large", "conv"])),
        "learning_rate" : trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True),
        "gamma" : 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True),
        "tau" : 1.0 - trial.suggest_float("tau", 0.00001, 0.1, log=True),
        "alpha" : trial.suggest_float("alpha", 0, 1),
        "beta" : trial.suggest_float("beta", 0, 1),
        "frame_stack" : trial.suggest_int("frame_stack", 2, 6),
        "target_network_frequency" : trial.suggest_int("target_network_frequency", 100, 10000, log=True),
        "end_e" : trial.suggest_float("end_e", 0, 0.3),
        "exploration_fraction" : trial.suggest_float("exploration_fraction", 0.1, 0.5),
        "line_weight" : trial.suggest_float("line_weight", 0, 10),
        "height_weight" : trial.suggest_float("height_weight", 0, 2),
        "cost_weight" : trial.suggest_float("cost_weight", 0, 2),
        "holes_weight" : trial.suggest_float("holes_weight", 0, 2),
        "bumpiness_weight" : trial.suggest_float("bumpiness_weight", 0, 2),
        "col_transitions_weight" : trial.suggest_float("col_transitions_weight", 0, 2),
        "row_transitions_weight" : trial.suggest_float("row_transitions_weight", 0, 2),
    }
    return params

In [25]:
def objective(trial: optuna.Trial):
    args = Args()
    hyperparameters = sample_params(trial)
    for key, value in hyperparameters.items():
        setattr(args, key, value)

    nan_encountered = False
    try:
      train(args, trial=trial)
    except AssertionError as e:
      # Sometimes, random hyperparams can generate NaN
      print(e)
      nan_encountered = True
    except optuna.exceptions.TrialPruned:
      raise

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    return trial.user_attrs["mean_lines"]

# Main

In [26]:
!rm -r runs/* videos/* *.mp4 db.sqlite3

In [27]:
study_name = f"{args.exp_name}_study"
study_num = 1

In [28]:
# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)
# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Pruner to stop bad runs
pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS)
# Create the study and start the hyperparameter optimization
study = optuna.create_study(study_name=f"{study_name}-{study_num}", storage="sqlite:///db.sqlite3", sampler=sampler, pruner=pruner, direction="maximize")
study_num += 1

[32m[I 2023-07-25 23:45:58,019][0m A new study created in RDB with name: Tetris_DQN_study-1[0m


In [29]:
# !optuna-dashboard sqlite:///db.sqlite3

In [30]:
try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS)
except KeyboardInterrupt:
    pass

New best mean lines: 0.12
New best mean lines: 0.12
New best mean lines: 0.16
New best mean lines: 0.16
New best mean lines: 0.2
New best mean lines: 0.24
New best mean lines: 0.2
New best mean lines: 0.28
New best mean lines: 0.32
New best mean lines: 0.12
New best mean lines: 0.16
New best mean lines: 0.2
New best mean lines: 0.24
New best mean lines: 0.28
New best mean lines: 0.36
New best mean lines: 0.12
New best mean lines: 0.4
New best mean lines: 0.16
New best mean lines: 0.2
New best mean lines: 0.24
New best mean lines: 0.28
New best mean lines: 0.32
New best mean lines: 0.24
New best mean lines: 0.28
New best mean lines: 0.32
New best mean lines: 0.36
New best mean lines: 0.4
New best mean lines: 0.36
New best mean lines: 0.4
New best mean lines: 0.44
eval_episode=0, lines=4, episodic_return=32.79688262939453
eval_episode=1, lines=4, episodic_return=32.79688262939453
eval_episode=2, lines=4, episodic_return=32.79688262939453
eval_episode=3, lines=4, episodic_return=32.796882

[32m[I 2023-07-26 07:26:43,081][0m Trial 3 finished with value: 102.0 and parameters: {'model': 'small', 'learning_rate': 2.887920039660972e-05, 'gamma': 0.02067053271259168, 'tau': 4.871575614259326e-05, 'alpha': 0.34314878574797547, 'beta': 0.24197139447292437, 'frame_stack': 4, 'target_network_frequency': 2885, 'end_e': 0.0318999662303191, 'exploration_fraction': 0.33727999228700345, 'line_weight': 9.434291019417316, 'height_weight': 0.4043195385597811, 'cost_weight': 0.5385294106360292, 'holes_weight': 1.5710494042498981, 'bumpiness_weight': 1.9861131318856091, 'col_transitions_weight': 1.9619240527157016, 'row_transitions_weight': 0.6697395904108474}. Best is trial 3 with value: 102.0.[0m


eval_episode=4, lines=102, episodic_return=955.8701782226562
trial_3: evaluation_3: mean_lines=102.0
eval_episode=0, lines=66, episodic_return=232.68582153320312
eval_episode=1, lines=66, episodic_return=232.68582153320312
eval_episode=2, lines=66, episodic_return=232.68582153320312
eval_episode=3, lines=66, episodic_return=232.68582153320312
eval_episode=4, lines=66, episodic_return=232.68582153320312
trial_2: evaluation_2 mean_lines=66.0
eval_episode=0, lines=0, episodic_return=-3.6883459091186523
eval_episode=1, lines=0, episodic_return=-3.6883459091186523
eval_episode=2, lines=0, episodic_return=-3.6883459091186523
eval_episode=3, lines=0, episodic_return=-3.6883459091186523
eval_episode=4, lines=0, episodic_return=-3.6883459091186523
trial_0: evaluation_2 mean_lines=0.0
New best mean lines: 0.88
New best mean lines: 0.96
New best mean lines: 1.0
New best mean lines: 1.08
New best mean lines: 1.12
New best mean lines: 0.12
New best mean lines: 0.16
New best mean lines: 1.16
New bes

[32m[I 2023-07-26 10:25:57,773][0m Trial 1 finished with value: 0.0 and parameters: {'model': 'conv', 'learning_rate': 1.0152964287124312e-05, 'gamma': 0.0010279650890819665, 'tau': 2.313470463875556e-05, 'alpha': 0.9154544971408209, 'beta': 0.2806842232345935, 'frame_stack': 6, 'target_network_frequency': 255, 'end_e': 0.06460811117491377, 'exploration_fraction': 0.3505031859179879, 'line_weight': 9.766461238894937, 'height_weight': 0.3355843026146743, 'cost_weight': 0.21932721879151185, 'holes_weight': 1.606768867089508, 'bumpiness_weight': 1.4190504452646442, 'col_transitions_weight': 0.8131147352011192, 'row_transitions_weight': 0.5304582634125297}. Best is trial 3 with value: 102.0.[0m


eval_episode=4, lines=0, episodic_return=-3.8482847213745117
trial_1: evaluation_3: mean_lines=0.0
New best mean lines: 2.32
New best mean lines: 2.36
New best mean lines: 2.44
New best mean lines: 0.32
New best mean lines: 2.48
New best mean lines: 2.52
New best mean lines: 2.76
New best mean lines: 2.88
New best mean lines: 0.36
New best mean lines: 2.92
New best mean lines: 0.44
New best mean lines: 2.96
New best mean lines: 0.48
New best mean lines: 0.52
New best mean lines: 0.56
New best mean lines: 0.6
New best mean lines: 3.0
New best mean lines: 3.04
New best mean lines: 3.12
New best mean lines: 3.16
New best mean lines: 3.28
New best mean lines: 3.56
New best mean lines: 3.64
New best mean lines: 3.92
New best mean lines: 3.96
trial_2: model saved to runs/Tetris_DQN__2__1690317958_trial_2/Tetris_DQN.final
eval_episode=0, lines=92, episodic_return=328.5112609863281
eval_episode=1, lines=92, episodic_return=328.5112609863281
eval_episode=2, lines=92, episodic_return=328.5112609

[32m[I 2023-07-26 12:54:46,294][0m Trial 0 finished with value: 0.0 and parameters: {'model': 'conv', 'learning_rate': 0.00013784480217820275, 'gamma': 0.0010113675887214037, 'tau': 1.9860463402629142e-05, 'alpha': 0.6409603753825729, 'beta': 0.6062558920493321, 'frame_stack': 4, 'target_network_frequency': 248, 'end_e': 0.2964182813014932, 'exploration_fraction': 0.13400470198077608, 'line_weight': 1.171862653372392, 'height_weight': 1.2072684599424157, 'cost_weight': 0.3568261277740665, 'holes_weight': 0.5722454022389756, 'bumpiness_weight': 1.4617589227721055, 'col_transitions_weight': 1.2734276239040834, 'row_transitions_weight': 0.12870705285388717}. Best is trial 3 with value: 102.0.[0m


eval_episode=4, lines=0, episodic_return=-3.676870107650757
trial_0: evaluation_3: mean_lines=0.0


[32m[I 2023-07-26 12:55:27,057][0m Trial 2 finished with value: 92.0 and parameters: {'model': 'conv', 'learning_rate': 0.00019564106452349243, 'gamma': 0.0004181185607080875, 'tau': 0.03841229760011666, 'alpha': 0.5615570516775242, 'beta': 0.5047195007954445, 'frame_stack': 6, 'target_network_frequency': 4222, 'end_e': 0.1208032721427813, 'exploration_fraction': 0.1591065714764245, 'line_weight': 3.6504459190279794, 'height_weight': 0.5163222288766869, 'cost_weight': 1.568621417676313, 'holes_weight': 0.9427121899247681, 'bumpiness_weight': 0.16832677218020242, 'col_transitions_weight': 1.6958812280584172, 'row_transitions_weight': 1.4190117650466476}. Best is trial 3 with value: 102.0.[0m


eval_episode=4, lines=92, episodic_return=328.5112609863281
trial_2: evaluation_3: mean_lines=92.0
New best mean lines: 0.12
New best mean lines: 0.16
New best mean lines: 0.64
New best mean lines: 0.68
New best mean lines: 0.2
New best mean lines: 0.24
New best mean lines: 0.72
New best mean lines: 0.76
New best mean lines: 0.8
New best mean lines: 0.84
New best mean lines: 0.88
eval_episode=0, lines=0, episodic_return=-5.251335620880127
eval_episode=1, lines=0, episodic_return=-5.251335620880127
eval_episode=2, lines=0, episodic_return=-5.251335620880127
eval_episode=3, lines=0, episodic_return=-5.251335620880127
eval_episode=4, lines=0, episodic_return=-5.251335620880127
trial_4: evaluation_2 mean_lines=0.0
eval_episode=0, lines=0, episodic_return=-9.820008277893066
eval_episode=1, lines=0, episodic_return=-9.820008277893066
eval_episode=2, lines=0, episodic_return=-9.820008277893066
eval_episode=3, lines=0, episodic_return=-9.820008277893066
eval_episode=4, lines=0, episodic_return

[32m[I 2023-07-26 17:11:27,877][0m Trial 4 finished with value: 2.0 and parameters: {'model': 'large', 'learning_rate': 2.6804426355075637e-05, 'gamma': 0.00019639768319536048, 'tau': 0.0927225690795834, 'alpha': 0.8365052807897753, 'beta': 0.7228952057372173, 'frame_stack': 2, 'target_network_frequency': 3823, 'end_e': 0.22049002265567355, 'exploration_fraction': 0.251347344067007, 'line_weight': 7.1506113693866515, 'height_weight': 0.542355051101679, 'cost_weight': 1.7377176187204055, 'holes_weight': 0.2937981352726309, 'bumpiness_weight': 1.0087357555811138, 'col_transitions_weight': 0.5502262049229707, 'row_transitions_weight': 1.0333069831702375}. Best is trial 3 with value: 102.0.[0m


eval_episode=4, lines=2, episodic_return=9.522710800170898
trial_4: evaluation_3: mean_lines=2.0
eval_episode=0, lines=0, episodic_return=-4.75391149520874
eval_episode=1, lines=0, episodic_return=-4.75391149520874
eval_episode=2, lines=0, episodic_return=-4.75391149520874
eval_episode=3, lines=0, episodic_return=-4.75391149520874
eval_episode=4, lines=0, episodic_return=-4.75391149520874
trial_6: evaluation_1 mean_lines=0.0
New best mean lines: 0.2
eval_episode=0, lines=0, episodic_return=-8.234596252441406
eval_episode=1, lines=0, episodic_return=-8.234596252441406
eval_episode=2, lines=0, episodic_return=-8.234596252441406
eval_episode=3, lines=0, episodic_return=-8.234596252441406
eval_episode=4, lines=0, episodic_return=-8.234596252441406
trial_5: evaluation_2 mean_lines=0.0
New best mean lines: 0.24
New best mean lines: 0.28
New best mean lines: 0.32
New best mean lines: 0.36
New best mean lines: 0.12
New best mean lines: 0.16
New best mean lines: 0.88
New best mean lines: 0.96
N

[32m[I 2023-07-26 22:32:17,181][0m Trial 5 finished with value: 2.0 and parameters: {'model': 'conv', 'learning_rate': 7.932094509577086e-05, 'gamma': 0.00011945767761306156, 'tau': 0.02866902633683626, 'alpha': 0.42655836982711026, 'beta': 0.09322475981318601, 'frame_stack': 4, 'target_network_frequency': 1822, 'end_e': 0.09762443050328262, 'exploration_fraction': 0.35503837357474166, 'line_weight': 5.586033375691892, 'height_weight': 1.6667891536634674, 'cost_weight': 0.7098316087606091, 'holes_weight': 0.008910669752089984, 'bumpiness_weight': 0.2680182364950545, 'col_transitions_weight': 1.413825071493762, 'row_transitions_weight': 1.7065353584570941}. Best is trial 3 with value: 102.0.[0m


eval_episode=4, lines=2, episodic_return=3.5788350105285645
trial_5: evaluation_3: mean_lines=2.0
New best mean lines: 0.64
trial_7: model saved to runs/Tetris_DQN__2__1690365327_trial_7/Tetris_DQN.final
eval_episode=0, lines=0, episodic_return=-6.718338489532471
eval_episode=1, lines=0, episodic_return=-6.718338489532471
eval_episode=2, lines=0, episodic_return=-6.718338489532471
eval_episode=3, lines=0, episodic_return=-6.718338489532471


[32m[I 2023-07-26 23:33:39,264][0m Trial 7 finished with value: 0.0 and parameters: {'model': 'small', 'learning_rate': 0.0009974087196817788, 'gamma': 0.0477439248437215, 'tau': 0.00037611565863453195, 'alpha': 0.016315947105042916, 'beta': 0.028661631157271605, 'frame_stack': 2, 'target_network_frequency': 5603, 'end_e': 0.004954029003851983, 'exploration_fraction': 0.4892916377124914, 'line_weight': 8.712602666980287, 'height_weight': 1.975544615699815, 'cost_weight': 0.9605020683386819, 'holes_weight': 1.9480380851530712, 'bumpiness_weight': 1.9985586983892614, 'col_transitions_weight': 1.9676536306622536, 'row_transitions_weight': 0.9788850086784461}. Best is trial 3 with value: 102.0.[0m


eval_episode=4, lines=0, episodic_return=-6.718338489532471
trial_7: evaluation_3: mean_lines=0.0
eval_episode=0, lines=0, episodic_return=-5.817572116851807
eval_episode=1, lines=0, episodic_return=-5.817572116851807
eval_episode=2, lines=0, episodic_return=-5.817572116851807
eval_episode=3, lines=0, episodic_return=-5.817572116851807
eval_episode=4, lines=0, episodic_return=-5.817572116851807
trial_6: evaluation_2 mean_lines=0.0
New best mean lines: 0.12
eval_episode=0, lines=0, episodic_return=-7.413454055786133
eval_episode=1, lines=0, episodic_return=-7.413454055786133
eval_episode=2, lines=0, episodic_return=-7.413454055786133
eval_episode=3, lines=0, episodic_return=-7.413454055786133
eval_episode=4, lines=0, episodic_return=-7.413454055786133
trial_8: evaluation_2 mean_lines=0.0
New best mean lines: 0.12
New best mean lines: 0.16
New best mean lines: 0.2
New best mean lines: 0.24
New best mean lines: 0.16
New best mean lines: 0.2
New best mean lines: 0.24
New best mean lines: 0

[32m[I 2023-07-27 04:32:28,410][0m Trial 8 finished with value: 0.0 and parameters: {'model': 'small', 'learning_rate': 0.0009890621544082013, 'gamma': 0.03562877758025787, 'tau': 0.0003493350978773063, 'alpha': 0.11749431756242523, 'beta': 0.04599216230178971, 'frame_stack': 4, 'target_network_frequency': 1162, 'end_e': 2.57159717292621e-05, 'exploration_fraction': 0.4795656737007344, 'line_weight': 9.903806301521076, 'height_weight': 1.7578333364759453, 'cost_weight': 0.8276874685433331, 'holes_weight': 1.8440444724448615, 'bumpiness_weight': 1.9674070331971183, 'col_transitions_weight': 1.9127056825731392, 'row_transitions_weight': 0.7594246912096653}. Best is trial 3 with value: 102.0.[0m


eval_episode=4, lines=0, episodic_return=-5.894560813903809
trial_8: evaluation_3: mean_lines=0.0
New best mean lines: 70.44
New best mean lines: 73.28
New best mean lines: 73.96
New best mean lines: 76.2
New best mean lines: 78.52
New best mean lines: 79.48
New best mean lines: 80.4
New best mean lines: 81.84
eval_episode=0, lines=0, episodic_return=-4.3083271980285645
eval_episode=1, lines=0, episodic_return=-4.3083271980285645
eval_episode=2, lines=0, episodic_return=-4.3083271980285645
eval_episode=3, lines=0, episodic_return=-4.3083271980285645
eval_episode=4, lines=0, episodic_return=-4.3083271980285645
trial_9: evaluation_2 mean_lines=0.0
New best mean lines: 84.08
New best mean lines: 0.12
New best mean lines: 84.88
New best mean lines: 85.0
New best mean lines: 85.08
trial_10: model saved to runs/Tetris_DQN__2__1690403619_trial_10/Tetris_DQN.final
eval_episode=0, lines=58, episodic_return=546.672119140625
eval_episode=1, lines=58, episodic_return=546.672119140625
eval_episode=

[32m[I 2023-07-27 06:16:35,589][0m Trial 10 finished with value: 58.0 and parameters: {'model': 'small', 'learning_rate': 2.6520429119813746e-05, 'gamma': 0.019454281496323948, 'tau': 0.00023700050502165382, 'alpha': 0.2325052044908301, 'beta': 0.8787221954805188, 'frame_stack': 3, 'target_network_frequency': 941, 'end_e': 0.0044393335292431495, 'exploration_fraction': 0.4553307593896802, 'line_weight': 9.449614719505279, 'height_weight': 0.04151514193571715, 'cost_weight': 0.021366493918268836, 'holes_weight': 1.33941162513085, 'bumpiness_weight': 1.9819384980953383, 'col_transitions_weight': 0.017767867266601733, 'row_transitions_weight': 0.05024819975445771}. Best is trial 3 with value: 102.0.[0m


eval_episode=4, lines=58, episodic_return=546.672119140625
trial_10: evaluation_3: mean_lines=58.0
New best mean lines: 0.36
New best mean lines: 0.16
New best mean lines: 0.2
New best mean lines: 0.24
trial_6: model saved to runs/Tetris_DQN__2__1690365286_trial_6/Tetris_DQN.final
eval_episode=0, lines=0, episodic_return=-4.941319465637207
eval_episode=1, lines=0, episodic_return=-4.941319465637207
eval_episode=2, lines=0, episodic_return=-4.941319465637207
eval_episode=3, lines=0, episodic_return=-4.941319465637207


[32m[I 2023-07-27 06:29:19,728][0m Trial 6 finished with value: 0.0 and parameters: {'model': 'large', 'learning_rate': 0.0006656742033783309, 'gamma': 0.006624960614347199, 'tau': 0.0008820264423630012, 'alpha': 0.026443646952775346, 'beta': 0.9778033331049109, 'frame_stack': 3, 'target_network_frequency': 432, 'end_e': 0.16566219983651065, 'exploration_fraction': 0.28706595952429825, 'line_weight': 6.554047002363422, 'height_weight': 0.6982470526930817, 'cost_weight': 1.4903397385325974, 'holes_weight': 0.6985545999027636, 'bumpiness_weight': 1.6304926101623936, 'col_transitions_weight': 1.6094331915509437, 'row_transitions_weight': 0.38130755964449525}. Best is trial 3 with value: 102.0.[0m


eval_episode=4, lines=0, episodic_return=-4.941319465637207
trial_6: evaluation_3: mean_lines=0.0
eval_episode=0, lines=0, episodic_return=-1.8150634765625
eval_episode=1, lines=0, episodic_return=-1.8150634765625
eval_episode=2, lines=0, episodic_return=-1.8150634765625
eval_episode=3, lines=0, episodic_return=-1.8150634765625
eval_episode=4, lines=0, episodic_return=-1.8150634765625
trial_11: evaluation_1 mean_lines=0.0
New best mean lines: 0.12
New best mean lines: 0.28
New best mean lines: 0.32
New best mean lines: 0.36
New best mean lines: 0.4
New best mean lines: 0.44
New best mean lines: 0.52
New best mean lines: 0.56
New best mean lines: 0.64
New best mean lines: 0.68
New best mean lines: 0.76
New best mean lines: 0.84
New best mean lines: 0.96
New best mean lines: 1.0
New best mean lines: 1.28
New best mean lines: 1.32
New best mean lines: 1.36
New best mean lines: 1.4
New best mean lines: 1.48
New best mean lines: 1.52
New best mean lines: 1.56
New best mean lines: 1.64
New b

[32m[I 2023-07-27 09:12:46,316][0m Trial 9 finished with value: 0.0 and parameters: {'model': 'small', 'learning_rate': 0.0009733653534696126, 'gamma': 0.028556335937489435, 'tau': 0.0003151571777377432, 'alpha': 0.09330580906250849, 'beta': 0.9574540927340788, 'frame_stack': 2, 'target_network_frequency': 9623, 'end_e': 0.012731227559672865, 'exploration_fraction': 0.44671265698842944, 'line_weight': 9.861097764759176, 'height_weight': 0.0951081495066976, 'cost_weight': 1.0356476610849936, 'holes_weight': 1.9400068125324688, 'bumpiness_weight': 1.8447904563155793, 'col_transitions_weight': 1.925789170745073, 'row_transitions_weight': 0.7429873682434054}. Best is trial 3 with value: 102.0.[0m


eval_episode=4, lines=0, episodic_return=-4.3083271980285645
trial_9: evaluation_3: mean_lines=0.0
eval_episode=0, lines=2, episodic_return=-0.5646430253982544
eval_episode=1, lines=2, episodic_return=-0.5646430253982544
eval_episode=2, lines=2, episodic_return=-0.5646430253982544
eval_episode=3, lines=2, episodic_return=-0.5646430253982544
eval_episode=4, lines=2, episodic_return=-0.5646430253982544
trial_13: evaluation_1 mean_lines=2.0
New best mean lines: 0.36
New best mean lines: 0.44
New best mean lines: 0.48
New best mean lines: 0.52
eval_episode=0, lines=76, episodic_return=538.0943603515625
eval_episode=1, lines=76, episodic_return=538.0944213867188
eval_episode=2, lines=76, episodic_return=538.0943603515625
eval_episode=3, lines=76, episodic_return=538.0944213867188
eval_episode=4, lines=76, episodic_return=538.0943603515625
trial_11: evaluation_2 mean_lines=76.0
eval_episode=0, lines=0, episodic_return=-8.142951011657715
eval_episode=1, lines=0, episodic_return=-8.14295101165

In [None]:
%load_ext tensorboard

In [None]:
print(f"Number of finished trials: ", len(study.trials))
print(f"Best trial:")
best_trial = study.best_trial
print(f"\tValue: {best_trial.value}")
print(f"\tParams: ")
for key, value in best_trial.params.items():
    print(f"\t\t{key}: {value}")

print("\tUser attrs:")
for key, value in best_trial.user_attrs.items():
    print(f"\t\t{key}: {value}")

In [None]:
# Write report
study.trials_dataframe().to_csv(f"study_results_dqn_{study_name}_{study_num}.csv")

fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

fig1.show()
fig2.show()

In [None]:
# class Args:
#     def __init__(self):
#         # Settings
#         self.exp_name = "Tetris_DQN"
#         self.run_id = int(time.time())
#         self.torch_deterministic = True
#         self.cuda = True
#         self.mps = False
#         self.capture_video = True
#         self.capture_inputs_video = True
#         self.save_model = True
#         self.eval_episodes = 5
#         self.backup_frequency = 50000
#         self.mean_score_count = 50
#         self.video_frequency = 200
#         self.reload_env_frequency = 49
#         self.prefetch = None

#         # Constant Hyper-Parameters
#         self.seed = 2
#         self.total_timesteps = 1_000_000
#         self.buffer_size = 50_000
#         self.learning_starts = 50_000
#         self.train_frequency = 1
#         self.start_e = 1
#         self.batch_size = 32

#         # Optimizable Hyper-Parameters
#         self.model = get_model_class('small')
#         self.learning_rate = 2e-4
#         self.gamma = 0.99
#         self.tau = 0.999
#         self.alpha = 0.6
#         self.beta = 0.5
#         self.frame_stack = 4
#         self.target_network_frequency = 2000
#         self.end_e = 0.1
#         self.exploration_fraction = 0.5

#         # Reward weights
#         self.line_weight = 10.0
#         self.height_weight = 1.0
#         self.cost_weight = 1.0
#         self.holes_weight = 1.0
#         self.bumpiness_weight = 1.0
#         self.col_transitions_weight = 1.0
#         self.row_transitions_weight = 1.0


# args = Args()