In [1]:
!pip install gym-tetris
!apt-get install -y xvfb x11-utils
!pip install pyvirtualdisplay==0.2.*

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym-tetris
  Downloading gym_tetris-3.0.4-py3-none-any.whl (34 kB)
Collecting nes-py>=8.1.4
  Downloading nes_py-8.2.1.tar.gz (77 kB)
[K     |████████████████████████████████| 77 kB 4.7 MB/s 
Collecting pyglet<=1.5.21,>=1.4.0
  Downloading pyglet-1.5.21-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 48.6 MB/s 
Building wheels for collected packages: nes-py
  Building wheel for nes-py (setup.py) ... [?25l[?25hdone
  Created wheel for nes-py: filename=nes_py-8.2.1-cp38-cp38-linux_x86_64.whl size=438537 sha256=0e30e7b9ff0a9544204d978111ee438b6e50db9259ba4c8f1fbcee6449b8a629
  Stored in directory: /root/.cache/pip/wheels/17/e5/5c/8dfae61b44dbf56c458483aa09accef55a650e0527f6cbd872
Successfully built nes-py
Installing collected packages: pyglet, nes-py, gym-tetris
Successfully installed gym-tetris-3.0.4 nes-py-8.2.1 pyglet-1.5.21
Reading package lists.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

# change directory using the magic command %cd
### replace [MY PATH] below with your own path in Google Drive ###
%cd /content/drive/My\ Drive/

Mounted at /content/drive
/content/drive/My Drive


In [21]:
#This is setting up the relevant packages
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from PIL import Image
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from nes_py.wrappers import JoypadSpace
import gym_tetris
from gym_tetris.actions import MOVEMENT

import time

env = gym_tetris.make('TetrisA-v3').unwrapped
env = JoypadSpace(env, MOVEMENT)


#set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  deprecation(
  deprecation(


In [6]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [4]:
class DQN(nn.Module):

    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=4, stride=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Linear(5,32)

        def conv2d_size_out(size, kernel_size = 5, stride = 2):
            return (size - (kernel_size - 1) - 1) // stride  + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 256
        self.head = nn.Linear(30720, outputs)

    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.conv3(x))
        return self.head(x.view(x.size(0), -1))

In [7]:
BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 2000
TARGET_UPDATE = 2000
OPTIM_UPDATE = 1

init_screen = env.observation_space.sample()
screen_height, screen_width, _ = init_screen.shape

n_actions = env.action_space.n

policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.AdamW(policy_net.parameters(), lr=0.0001, weight_decay=0.01, amsgrad=True)
memory = ReplayMemory(10000)
steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)


episode_durations = []

# This is a helper method to plot the duration of each training episode
def plot_durations():
    plt.figure(2)
    plt.clf()
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.show()

In [8]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)

    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_action_values = policy_net(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()

    expected_state_action_values = (next_state_values * GAMMA) + reward_batch


    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))


    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

In [9]:
#Testing the scaling of the input
from scipy.ndimage import zoom
state, reward, done, info = env.step(env.action_space.sample())
img = Image.fromarray(state, 'RGB')
img.save("tetris.jpg")
state = np.moveaxis(state,2,-3)
state = state[:,48:208,96:176]
state = state > 0
state = state.astype(np.uint8)*255
print(state.dtype)
state = zoom(state, (1,0.125, 0.125))
state = state > 220
state = state.astype(np.uint8)*255
#print(state)
state = np.moveaxis(state,0,-1 )
img = Image.fromarray(state, 'RGB')
img.save("tetrisSmol.jpg")


uint8


In [10]:
from scipy.ndimage import zoom

def stateResize(state):
  state = np.moveaxis(state,2,-3)
  state = state[:,48:208,96:176]
  state = state > 0
  state = state.astype(np.uint8)*255
  state = zoom(state, (1,0.125, 0.125))
  state = state > 220
  state = state.astype(int)*255
  return torch.tensor([state], device=device).type('torch.FloatTensor')

In [None]:
num_episodes = 200
maxDuration = 0

best_net = DQN(screen_height, screen_width, n_actions).to(device)
#Below is for successive training sessions
#policy_net.load_state_dict(torch.load('/content/drive/MyDrive/currentPolicy'))
#target_net.load_state_dict(torch.load('/content/drive/MyDrive/currentPolicy'))
#policy_net.load_state_dict(torch.load('/content/currentPolicy (7)'))
#target_net.load_state_dict(torch.load('/content/currentPolicy (7)'))
i = 1
since = time.time()
episode_durations = []


for i_episode in range(num_episodes):
    # Initialize the environment and state
    env.reset()
    state, reward, done, info = env.step(env.action_space.sample())
    state = stateResize(state)

    for t in count():

        action = select_action(state)

        next_state, reward, done, info = env.step(action.item())



        reward = torch.tensor([reward], device=device)

        

        # Observe new state. This calculates s_t+1.
        next_state = stateResize(next_state)

        memory.push(state, action, next_state, reward)

        # This sets s_t to s_t+1 so the loop can execute again.
        state = next_state

        if done:
            if(maxDuration < (t+1)):
              maxDuration = (t+1)
              best_net.load_state_dict(policy_net.state_dict())
        if (t % OPTIM_UPDATE == 0):
          optimize_model()
        if done:
            episode_durations.append(t + 1)
            break


        if t % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(), '/content/drive/MyDrive/currentPolicy')
    print(f'{i_episode}: Ep done, Duration = {t}')
    optimizer = optim.AdamW(policy_net.parameters(), lr=0.0001, weight_decay=0.01, amsgrad=True)


print('Complete')
plot_durations()

  return torch.tensor([state], device=device).type('torch.FloatTensor')


0: Ep done, Duration = 4674
1: Ep done, Duration = 3642
2: Ep done, Duration = 6913
3: Ep done, Duration = 4585
4: Ep done, Duration = 4632
5: Ep done, Duration = 7989
6: Ep done, Duration = 4619
7: Ep done, Duration = 6846
8: Ep done, Duration = 4315
9: Ep done, Duration = 3804
10: Ep done, Duration = 1807
11: Ep done, Duration = 3349
12: Ep done, Duration = 1402
13: Ep done, Duration = 2302
14: Ep done, Duration = 3351
15: Ep done, Duration = 3022
16: Ep done, Duration = 1929
17: Ep done, Duration = 1239
18: Ep done, Duration = 1667
19: Ep done, Duration = 2316
20: Ep done, Duration = 699
21: Ep done, Duration = 890
22: Ep done, Duration = 864
23: Ep done, Duration = 1251
24: Ep done, Duration = 3629
25: Ep done, Duration = 1295
26: Ep done, Duration = 2891
27: Ep done, Duration = 3166
28: Ep done, Duration = 415
29: Ep done, Duration = 1839
30: Ep done, Duration = 808
31: Ep done, Duration = 1096
32: Ep done, Duration = 796
33: Ep done, Duration = 4274
34: Ep done, Duration = 7241
3

In [None]:
#For saving
torch.save(policy_net.state_dict(), 'currentPolicy')
torch.save(best_net.state_dict(), 'bestPolicy')
torch.save(policy_net.state_dict(), '/content/drive/MyDrive/currentPolicy')
torch.save(best_net.state_dict(), '/content/drive/MyDrive/bestPolicy')

In [None]:
#For testing/training
policy_net.load_state_dict(torch.load('/content/drive/MyDrive/currentPolicy',map_location=device))
target_net.load_state_dict(torch.load('/content/drive/MyDrive/currentPolicy',map_location=device))

In [22]:
#For Testing
policy_net.load_state_dict(torch.load('/content/tetrisPolicyBest',map_location=device))

<All keys matched successfully>

In [23]:
from gym.wrappers.monitoring.video_recorder import VideoRecorder
testVideo = "testVideo.mp4"
env = gym_tetris.make('TetrisA-v3').unwrapped
env = JoypadSpace(env, MOVEMENT)
video = VideoRecorder(env, testVideo)
duration = 0
state = env.reset()
state = stateResize(state)
from base64 import b64encode
def render_mp4(videopath: str) -> str:

  mp4 = open(videopath, 'rb').read()
  base64_encoded_mp4 = b64encode(mp4).decode()
  return f'<video width=400 controls><source src="data:video/mp4;' \
         f'base64,{base64_encoded_mp4}" type="video/mp4"></video>'

  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(


In [24]:
while True:
  video.capture_frame()

   # Select and perform an action
  action = policy_net(state).max(1)[1].view(1, 1)
  next_state, reward, done, info = env.step(action.item()) #For using the network
  #state, reward, done, info = env.step(env.action_space.sample()) #For Random sampleing
  next_state = stateResize(next_state)
  # Move to the next state
  state = next_state
  if(done):
    break

video.close()


See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


In [25]:
from IPython.display import HTML
html = render_mp4(testVideo)
HTML(html)