In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np
import collections
import time

from random import choice
from kaggle_environments import evaluate, make, utils

In [2]:
Experience = collections.namedtuple('Experience', field_names=['observation', 'action', 'reward', 'done', 'new_observation'])

In [3]:
class ExperienceBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        # * Get a number of batch_size experience in a range of len(self.buffer)
        # * It does not replace
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)

        observation, actions, rewards, dones, next_observation = zip(*[self.buffer[idx] for idx in indices])
        return np.array(observation), np.array(actions), np.array(rewards, dtype=np.float32), np.array(dones, dtype=np.uint8), np.array(next_observation)

In [4]:
class Flatten(nn.Module):
    def __init__(self):
        super(Flatten, self).__init__()

    def forward(self, shape):
        return shape.view(shape.size()[0], -1)


class DQN(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(DQN, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=3, padding = 1, stride=4),
            nn.ReLU(),

            nn.Conv2d(32, 64, kernel_size=4, padding = 1, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding = 1, stride=1),
            nn.ReLU(),
            nn.Flatten()
        )

        conv_out_size = self._get_conv_out(input_shape)

        self.fc = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )

    def _get_conv_out(self, input_shape):
        # * To make 3D shape, we have to put 1 in torch.zeros
        # * torch.zeros(1, 1, 52, 52)
        # * -> troch.zeros(batch_dimension, *shape)
        out = self.conv(torch.zeros(1, *input_shape))
        return int(np.prod(out.size()))

    def forward(self, x):
        conv_out = self.conv(x)
        fc_out = self.fc(conv_out)
        return fc_out

In [5]:
image = torch.rand(1, 1, 52, 52)
dqn = DQN(image[0].shape, 10)
print(dqn)
dqn(image)

DQN(
  (conv): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(4, 4), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
    (6): Flatten()
  )
  (fc): Sequential(
    (0): Linear(in_features=2304, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=10, bias=True)
  )
)


tensor([[-0.0196, -0.0356, -0.0406,  0.0523,  0.0321, -0.0500,  0.0086,  0.0051,
         -0.0250, -0.0201]], grad_fn=<AddmmBackward>)

In [6]:
# /kaggle_environments/envs/connectx.py
def is_win(board, column, mark, config, has_played=True):
    columns = config.columns
    rows = config.rows
    inarow = config.inarow - 1
    row = (
        min([r for r in range(rows) if board[column + (r * columns)] == mark])
        if has_played
        else max([r for r in range(rows) if board[column + (r * columns)] == 0])
    )

    def count(offset_row, offset_column):
        for i in range(1, inarow + 1):
            r = row + offset_row * i
            c = column + offset_column * i
            if (
                r < 0
                or r >= rows
                or c < 0
                or c >= columns
                or board[c + (r * columns)] != mark
            ):
                return i - 1
        return inarow

    return (
        count(1, 0) >= inarow  # vertical.
        or (count(0, 1) + count(0, -1)) >= inarow  # horizontal.
        or (count(-1, -1) + count(1, 1)) >= inarow  # top left diagonal.
        or (count(-1, 1) + count(1, -1)) >= inarow  # top right diagonal.
    )


In [7]:
class Agent:
    def __init__(self, env, exp_buffer, mode="negamax"):
        configuration = env.configuration
        self.env = env
        self.columns = configuration['columns']
        self.rows = configuration['rows']

        self.trainer = env.train([None, mode])
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):        
        env_observation = self.trainer.reset()
        self.mark = env_observation['mark']
        self.board = env_observation['board']
        np_board = np.array(self.board)
        np_board = np_board.reshape(1, self.rows, -1)
        assert np_board.shape[2] == self.columns

        self.env.reset()
        self.observation = np_board
        self.total_reward = 0.0        

    def _select_random_action(self):   
        return choice([c for c in range (self.columns) if self.board[c] == 0])  
        
    def change_mode(self, mode):
        self.trainer = env.train([None, mode])

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0, device="cpu"):
        done_reward = None

        if np.random.random() < epsilon:
            action = self._select_random_action()
        else:            
            observation_v = torch.from_numpy(self.observation).float().to(device)
            q_vals_v = net(observation_v)
            _, action = torch.max(q_vals_v, dim=1)
            action = int(action)

        new_observation, reward, done, _ = self.trainer.step(action)        
        
        if reward == None:
            done = True
            reward = -10   

        # Do not need to consider lose case. If agent can get high reward when it doen win        
        if done is not True and is_win(self.board, action, self.mark, self.env.configuration, has_played=False):
            reward = 10 

        if done == False:
            reward = -1                              
   
        self.total_reward += reward
        self.board = new_observation['board']

        new_observation = np.array(self.board)
        new_observation = new_observation.reshape(1, self.rows, -1)
        assert new_observation.shape[2] == self.columns
        
        exp = Experience(self.observation, action, reward, done, new_observation)

        self.exp_buffer.append(exp)
        self.observation = new_observation

        if done:
            done_reward = self.total_reward
            self._reset()     

        return done_reward

In [8]:
test_env = make("connectx", debug=True)
test_env.render()

+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+



In [9]:
columns = test_env.configuration['columns']
rows = test_env.configuration['rows']

In [10]:
test_buffer = ExperienceBuffer(5)
agent = Agent(test_env, test_buffer)
epsilon = 0
input = torch.rand(1, 1, rows, columns)
n_actions = columns
net = DQN(input[0].shape, n_actions)
test_env.render()

+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+



In [11]:
for i in range(10):
    agent.play_step(net, epsilon)

RuntimeError: Expected 4-dimensional input for 4-dimensional weight 32 1 3 3, but got 3-dimensional input of size [1, 6, 7] instead

In [12]:
test_env.render()

+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+



In [13]:
def calc_loss(batch, net, tgt_net, device='cpu'):
    observation, actions, rewards, dones, next_observation = batch

    observation_v = torch.from_numpy(observation).float().to(device)
    next_observation_v = torch.from_numpy(next_observation).float().to(device)
    action_v = torch.from_numpy(actions).to(device)
    rewards_v = torch.from_numpy(rewards).to(device)
    done_mask = torch.from_numpy(dones).to(device)

    state_action_value = net(observation_v).gather(1, action_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_observation_values = tgt_net(next_observation_v).max(1)[0]
        next_observation_values[done_mask] = 0.0
        next_observation_values = next_observation_values.detach()

    expected_state_action_values = next_observation_values * GAMMA + rewards_v

    loss = nn.MSELoss()
    return loss(state_action_value, expected_state_action_values)

In [14]:
GAMMA = 0.99
BATCH_SIZE = 1
REPLAY_SIZE = 80000
LEARNING_RATE = 1e-4
SYNC_TARGET_FRAMES = 5000
REPLAY_START_SIZE = 10

EPSILON_DECAY_LAST_FRAME = 50000
EPSILON_START = 1.0
EPSILON_FINAL = 0.01

In [15]:
env = make("connectx", debug=True)
rows, columns = env.configuration['rows'], env.configuration['columns']
input_shape = [1, rows, columns]
n_actions = columns
device = torch.device("cuda" if torch.cuda.is_available else 'cpu')
print(f"Run using {device}")

Run using cuda


In [16]:
net = DQN(input_shape, n_actions).to(device)    
tgt_net = DQN(input_shape, n_actions).to(device)

buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []
frame_idx = 0
ts_frame = 0
ts = time.time()
best_m_reward = None

In [17]:
while True:
    frame_idx += 1
    epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)   
    reward = agent.play_step(net, epsilon, device=device)
    if reward is not None:
        total_rewards.append(reward)
        speed = (frame_idx - ts_frame) / (time.time() - ts)
        ts_frame = frame_idx
        ts = time.time()
        m_reward = np.mean(total_rewards[-100:])
        print("%d : done %d games, reward %.3f, eps %.2f, speed %.2f f/s" % (frame_idx, len(total_rewards), m_reward, epsilon, speed))

        if best_m_reward is None or best_m_reward < m_reward:
            torch.save(net.state_dict(), "best_%.0f.pth" % m_reward)
            if best_m_reward is not None:
                print("Best reward updated %.3f -> %.3f" % (best_m_reward, m_reward))
            best_m_reward = m_reward

    if len(buffer) < REPLAY_START_SIZE:
        continue
  
    if frame_idx % SYNC_TARGET_FRAMES == 0:
        tgt_net.load_state_dict(net.state_dict())
        print("Sync")

    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = calc_loss(batch, net, tgt_net, device=device)
    loss_t.backward()
    optimizer.step()

4 : done 1 games, reward -3.000, eps 1.00, speed 10.70 f/s
9 : done 2 games, reward -3.500, eps 1.00, speed 10.35 f/s
cal loss
cal loss
cal loss
13 : done 3 games, reward -3.333, eps 1.00, speed 9.87 f/s
cal loss
cal loss
cal loss
cal loss
cal loss
cal loss
cal loss
cal loss
cal loss
cal loss
23 : done 4 games, reward -4.750, eps 1.00, speed 11.10 f/s
cal loss
cal loss
cal loss
cal loss
cal loss
28 : done 5 games, reward -4.600, eps 1.00, speed 8.99 f/s
cal loss
cal loss
cal loss
cal loss
cal loss
cal loss
34 : done 6 games, reward -4.667, eps 1.00, speed 9.66 f/s
cal loss
cal loss
cal loss
cal loss
38 : done 7 games, reward -4.429, eps 1.00, speed 10.02 f/s
cal loss
cal loss
cal loss
cal loss
42 : done 8 games, reward -4.250, eps 1.00, speed 9.90 f/s
cal loss
cal loss
cal loss
cal loss
46 : done 9 games, reward -4.111, eps 1.00, speed 10.40 f/s
cal loss
cal loss
cal loss
cal loss
cal loss
51 : done 10 games, reward -4.100, eps 1.00, speed 9.27 f/s
cal loss
cal loss
cal loss
cal loss
5

KeyboardInterrupt: 