In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np
import collections
import time

from random import choice
from kaggle_environments import evaluate, make, utils

In [2]:
# One replay-buffer transition: the state before the move, the move itself,
# the reward received, whether the episode ended, and the state afterwards.
Experience = collections.namedtuple(
    'Experience',
    ('observation', 'action', 'reward', 'done', 'new_observation'),
)

In [3]:
class ExperienceBuffer:
    """Bounded FIFO store of transitions used for experience replay."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry once `capacity` is reached.
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        """Store one transition (a 5-tuple such as `Experience`)."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of numpy arrays, one per transition field:
            (observations, actions, rewards as float32, dones as uint8,
            next observations).

        Raises:
            ValueError: if `batch_size` exceeds the number of stored
                transitions (sampling is done without replacement).
        """
        picked = np.random.choice(len(self.buffer), batch_size, replace=False)
        batch = [self.buffer[position] for position in picked]

        # Transpose the list of transitions into one sequence per field.
        observations, actions, rewards, dones, next_observations = zip(*batch)
        return (
            np.array(observations),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_observations),
        )

In [4]:
class Flatten(nn.Module):
    """Collapse every dimension after the first one into a single axis."""

    def __init__(self):
        super().__init__()

    def forward(self, shape):
        # `shape` is the input tensor itself; the parameter name is kept as-is
        # so keyword callers keep working.
        leading = shape.size(0)
        return shape.view(leading, -1)


class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        input_shape: shape of a single observation without the batch
            dimension; its first entry is used as the number of input channels.
        n_actions: number of Q-values produced per observation.
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()

        in_channels = input_shape[0]
        feature_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=3, padding=1, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, padding=1, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # The width of the first linear layer depends on the input size, so it
        # is measured by pushing a dummy observation through the conv stack.
        n_features = self._get_conv_out(input_shape)

        self.fc = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )

    def _get_conv_out(self, input_shape):
        """Return the number of values the conv stack emits for one observation."""
        # A leading 1 is prepended as the batch dimension, e.g.
        # torch.zeros(1, 1, 52, 52) for input_shape == (1, 52, 52).
        dummy = torch.zeros(1, *input_shape)
        return self.conv(dummy).numel()

    def forward(self, x):
        """Map a batch of observations to a batch of per-action Q-values."""
        return self.fc(self.conv(x))

In [5]:
# Smoke test: build a DQN for a single-channel 52x52 input with 10 actions,
# print its layer layout, and run one forward pass on a random image.
image = torch.rand(1, 1, 52, 52)
dqn = DQN(image[0].shape, 10)  # image[0].shape drops the batch dimension -> (1, 52, 52)
print(dqn)
dqn(image)

DQN(
  (conv): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(4, 4), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
    (6): Flatten()
  )
  (fc): Sequential(
    (0): Linear(in_features=2304, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=10, bias=True)
  )
)


tensor([[-0.0170, -0.0211,  0.0050, -0.0114, -0.0246, -0.0319, -0.0045,  0.0307,
         -0.0234, -0.0002]], grad_fn=<AddmmBackward>)

In [6]:
# /kaggle_environments/envs/connectx.py
def is_win(board, column, mark, config, has_played=True):
    """Tell whether `mark` completes a line of `config.inarow` through `column`.

    Args:
        board: flat, row-major sequence of cells (index = column + row * columns).
        column: column of the piece being examined.
        mark: the player's cell value.
        config: object exposing `columns`, `rows` and `inarow` attributes.
        has_played: when True the piece is already on the board and the
            smallest row index holding `mark` in `column` is examined; when
            False the largest row index that is still empty (0) in `column`
            is examined instead.

    Returns:
        True if the examined cell has enough neighbouring `mark` cells
        vertically, horizontally or on either diagonal.

    Raises:
        ValueError: if the column holds no matching cell (no `mark` when
            `has_played`, no empty cell otherwise).
    """
    n_columns = config.columns
    n_rows = config.rows
    # Neighbours needed in addition to the examined cell itself.
    needed = config.inarow - 1

    if has_played:
        row = min([r for r in range(n_rows) if board[column + (r * n_columns)] == mark])
    else:
        row = max([r for r in range(n_rows) if board[column + (r * n_columns)] == 0])

    def run_length(step_row, step_column):
        """Count consecutive `mark` cells from the examined cell, capped at `needed`."""
        length = 0
        r = row + step_row
        c = column + step_column
        while (
            length < needed
            and 0 <= r < n_rows
            and 0 <= c < n_columns
            and board[c + (r * n_columns)] == mark
        ):
            length += 1
            r += step_row
            c += step_column
        return length

    if run_length(1, 0) >= needed:  # vertical.
        return True
    if run_length(0, 1) + run_length(0, -1) >= needed:  # horizontal.
        return True
    if run_length(-1, -1) + run_length(1, 1) >= needed:  # top left diagonal.
        return True
    return run_length(-1, 1) + run_length(1, -1) >= needed  # top right diagonal.


In [50]:
class Agent:
    """Plays ConnectX episodes through a trainer and records every transition.

    Each call to `play_step` performs one move, stores the resulting
    `Experience` in `exp_buffer`, and starts a new episode when the game ends.

    Args:
        env: environment exposing `configuration`, `train(...)` and `reset()`.
        exp_buffer: object with an `append(experience)` method.
    """

    # Reward shaping applied on top of what the trainer reports.
    INVALID_MOVE_REWARD = -10  # trainer reported no reward (None) on a finished game
    STEP_REWARD = 1            # the game continues after this move
    WIN_REWARD = 10            # the agent's own move completed a line

    def __init__(self, env, exp_buffer):
        configuration = env.configuration
        self.env = env
        self.columns = configuration['columns']
        self.rows = configuration['rows']
        self.exp_buffer = exp_buffer
        self._reset()

    def _board_to_observation(self, board):
        """Reshape the flat board into a (1, rows, columns) array."""
        observation = np.array(board).reshape(1, self.rows, -1)
        assert observation.shape[2] == self.columns
        return observation

    def _reset(self):
        """Start a new episode against a randomly chosen opponent."""
        # 30% of the episodes are played against "random", the rest against "negamax".
        opponent = "random" if np.random.random() < 0.3 else "negamax"
        # print(opponent, "mode")
        self.trainer = self.env.train([None, opponent])

        # A single trainer reset is enough; the original reset it twice in a row.
        env_observation = self.trainer.reset()
        self.mark = env_observation['mark']
        self.board = env_observation['board']
        np_board = self._board_to_observation(self.board)

        # Kept from the original: the environment is reset once more after
        # the first observation has been read.
        self.env.reset()
        self.observation = np_board
        self.total_reward = 0.0

    def _select_random_action(self):
        """Pick uniformly among the columns whose first board row is still empty."""
        return choice([c for c in range(self.columns) if self.board[c] == 0])

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0, device="cpu"):
        """Play one move and store the transition in the experience buffer.

        Args:
            net: Q-network called on a (1, 1, rows, columns) float tensor.
            epsilon: probability of playing a random valid column instead of
                the network's greedy choice.
            device: device the observation tensor is moved to.

        Returns:
            The accumulated shaped reward of the episode if this move ended
            it, otherwise None.
        """
        done_reward = None

        if np.random.random() < epsilon:
            action = self._select_random_action()
        else:
            observation_v = torch.from_numpy(self.observation).float().to(device)
            # Unsqueeze for batch size -> [batch_size, channel, row, column]
            q_vals_v = net(observation_v.unsqueeze(0))
            _, action = torch.max(q_vals_v, dim=1)
            action = int(action)

        new_observation, reward, done, _ = self.trainer.step(action)

        if not done:
            reward = self.STEP_REWARD
        elif reward is None:
            reward = self.INVALID_MOVE_REWARD
        elif self.board[action] == 0 and is_win(
            self.board, action, self.mark, self.env.configuration, has_played=False
        ):
            # The original guarded this bonus with `done is not True` and then
            # overwrote it with the step reward, so it was never awarded.
            # `self.board` is still the position before the move, hence
            # has_played=False; the emptiness check keeps is_win from being
            # asked about a full column.
            reward = self.WIN_REWARD
        # Any other finished game keeps the reward reported by the trainer.

        self.total_reward += reward
        self.board = new_observation['board']
        new_observation = self._board_to_observation(self.board)

        exp = Experience(self.observation, action, reward, done, new_observation)
        self.exp_buffer.append(exp)
        self.observation = new_observation

        if done:
            done_reward = self.total_reward
            self._reset()

        return done_reward

In [51]:
# Separate ConnectX environment used only for the interactive checks below.
test_env = make("connectx", debug=True)
test_env.render()

+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+



In [52]:
# Board dimensions read from the test environment's configuration.
columns = test_env.configuration['columns']
rows = test_env.configuration['rows']

In [53]:
# Small-scale setup for manual testing: a 30-transition buffer, an agent bound
# to the test environment, and an untrained network on the default (CPU) device.
# NOTE(review): `agent`, `epsilon`, `input_shape`, `n_actions` and `net` are
# rebound by the training setup cells further down; running this cell after
# them replaces the training objects with these test ones.
test_buffer = ExperienceBuffer(30)
agent = Agent(test_env, test_buffer)
epsilon = 0.5  # half of the moves are random
input_shape = [1, rows, columns]
n_actions = columns
net = DQN(input_shape, n_actions)

In [68]:
# Show the board, then let the agent play one move.
# play_step returns the episode's total reward when the move ends the game, otherwise None.
test_env.render()
agent.play_step(net, epsilon)

+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 2 | 0 | 0 | 0 | 1 | 0 | 0 |
+---+---+---+---+---+---+---+
| 2 | 0 | 0 | 1 | 2 | 1 | 0 |
+---+---+---+---+---+---+---+

6


In [69]:
# Show the board again after the move played in the previous cell.
test_env.render()

+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 2 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 2 | 0 | 0 | 0 | 1 | 0 | 0 |
+---+---+---+---+---+---+---+
| 2 | 0 | 0 | 1 | 2 | 1 | 1 |
+---+---+---+---+---+---+---+



In [18]:
# Play 50 moves so the 30-slot test buffer is full before sampling from it.
for i in range(50):
    agent.play_step(net, epsilon)

In [28]:
# Draw 10 distinct transitions from the test buffer, split into one array per field.
observation, actions, rewards, dones, next_observation = test_buffer.sample(10)

In [29]:
# Inspect the sampled observations.
observation

array([[[[0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [2, 0, 0, 0, 1, 0, 0]]],


       [[[0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0],
         [0, 0, 0, 0, 2, 1, 0],
         [0, 0, 0, 0, 2, 2, 0],
         [0, 1, 2, 1, 2, 1, 0]]],


       [[[2, 0, 0, 0, 0, 0, 0],
         [2, 0, 0, 0, 0, 1, 0],
         [1, 0, 0, 0, 0, 2, 0],
         [2, 0, 0, 0, 0, 1, 0],
         [2, 2, 1, 2, 1, 1, 0],
         [2, 2, 1, 2, 1, 1, 1]]],


       [[[0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0],
         [0, 0, 0, 2, 2, 1, 0]]],


       [[[2, 0, 0, 0, 0, 0, 0],
         [2, 0, 0, 0, 0, 1, 0],
         [1, 0, 0, 0, 0, 2, 0],
         [2, 2, 0, 1, 0, 1, 0],
         [2, 2, 1, 2, 1, 1, 0],
         [2, 2, 1, 2, 1, 1, 1]]],


       [[[0, 0, 0, 0

In [31]:
# Inspect the sampled actions (column indices).
actions

array([6, 5, 3, 5, 5, 3, 5, 0, 2, 5])

In [30]:
# Inspect the sampled episode-end flags (uint8).
dones

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=uint8)

In [32]:
# Inspect the sampled rewards (float32).
rewards

array([1., 1., 1., 1., 0., 1., 1., 1., 1., 1.], dtype=float32)

In [210]:
# Play one move and show the board afterwards; when the move ends the game the
# agent resets the environment, so an empty board is rendered.
agent.play_step(net, epsilon)
test_env.render()

2
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+



In [21]:
def calc_loss(batch, net, tgt_net, device='cpu', gamma=None):
    """Compute the one-step DQN mean-squared TD error for a sampled batch.

    Args:
        batch: 5-tuple of numpy arrays
            (observation, actions, rewards, dones, next_observation).
        net: network being trained; gradients flow through its output.
        tgt_net: target network; evaluated without gradient tracking.
        device: device the batch tensors are moved to.
        gamma: discount factor. When None, the module-level `GAMMA` is used.

    Returns:
        Scalar tensor holding the MSE between Q(s, a) and
        reward + gamma * max_a' Q_target(s', a'), where the bootstrap term is
        zeroed for transitions that ended the episode.
    """
    if gamma is None:
        gamma = GAMMA

    observation, actions, rewards, dones, next_observation = batch

    observation_v = torch.from_numpy(observation).float().to(device)
    next_observation_v = torch.from_numpy(next_observation).float().to(device)
    # gather() requires int64 indices; numpy's default integer type is
    # platform dependent.
    action_v = torch.from_numpy(actions).long().to(device)
    rewards_v = torch.from_numpy(rewards).float().to(device)
    # Indexing with a uint8 tensor is deprecated; use a boolean mask.
    done_mask = torch.from_numpy(dones).bool().to(device)

    state_action_value = net(observation_v).gather(1, action_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_observation_values = tgt_net(next_observation_v).max(1)[0]
        # No future value after the final transition of an episode.
        next_observation_values[done_mask] = 0.0

    expected_state_action_values = next_observation_values * gamma + rewards_v

    return nn.functional.mse_loss(state_action_value, expected_state_action_values)

In [22]:
# Training hyper-parameters.
GAMMA = 0.99                # discount factor used by calc_loss
BATCH_SIZE = 4096           # transitions sampled from the buffer per optimisation step
REPLAY_SIZE = 500000        # capacity of the training experience buffer
LEARNING_RATE = 1e-4        # Adam learning rate
SYNC_TARGET_FRAMES = 5000   # copy net -> tgt_net every this many frames
REPLAY_START_SIZE = 5000    # transitions collected before optimisation starts

# Epsilon decays linearly from EPSILON_START by 1 / EPSILON_DECAY_LAST_FRAME
# per frame and is floored at EPSILON_FINAL.
EPSILON_DECAY_LAST_FRAME = 80000
EPSILON_START = 1.0
EPSILON_FINAL = 0.01

In [23]:
# Training environment and the network dimensions derived from its board size.
env = make("connectx", debug=True)
rows, columns = env.configuration['rows'], env.configuration['columns']
input_shape = [1, rows, columns]
n_actions = columns
# is_available must be *called*: the bare function object is always truthy,
# which selected "cuda" even on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(f"Run using {device}")

Run using cuda


In [24]:
# Online network (trained) and target network (periodically synced from `net`
# by the training loop), both placed on `device`.
net = DQN(input_shape, n_actions).to(device)    
tgt_net = DQN(input_shape, n_actions).to(device)

# Replay buffer and the agent that fills it while playing in `env`.
buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

# The optimizer is bound to the parameters of this particular `net` object.
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []    # total reward of every finished episode
frame_idx = 0         # number of moves played so far
ts_frame = 0          # frame index at the previous finished episode (for speed)
ts = time.time()      # wall-clock time at the previous finished episode
best_m_reward = None  # best mean reward seen so far

In [25]:
# Series filled periodically by the training loop for later plotting.
reward_for_plot = []
mean_reward_for_plot = []
loss_for_plot = []

In [56]:
# DQN training loop. It has no stop condition and runs until the kernel is
# interrupted.
# NOTE(review): `net`, `tgt_net`, `agent` and `optimizer` must be the objects
# created by the training setup cell. The RuntimeError recorded below this
# cell (CUDA input vs. CPU weights) presumably comes from `net` having been
# rebound to a CPU network by another cell — confirm by re-running the setup
# cells in order.
PLOT_EVERY_FRAMES = 5000  # how often a data point is recorded for plotting

while True:
    frame_idx += 1
    epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
    reward = agent.play_step(net, epsilon, device=device)
    if reward is not None:
        # An episode just finished: log progress and checkpoint on improvement.
        total_rewards.append(reward)
        speed = (frame_idx - ts_frame) / (time.time() - ts)
        ts_frame = frame_idx
        ts = time.time()
        m_reward = np.mean(total_rewards[-10:])
        print("%d : done %d games, reward %.3f, eps %.2f, speed %.2f f/s" % (frame_idx, len(total_rewards), m_reward, epsilon, speed))

        if best_m_reward is None or best_m_reward < m_reward:
            torch.save(net.state_dict(), "best_%.0f.pth" % m_reward)
            if best_m_reward is not None:
                print("Best reward updated %.3f -> %.3f" % (best_m_reward, m_reward))
            best_m_reward = m_reward

    # Keep collecting experience until the buffer is large enough to sample from.
    if len(buffer) < REPLAY_START_SIZE:
        continue

    if frame_idx % SYNC_TARGET_FRAMES == 0:
        tgt_net.load_state_dict(net.state_dict())
        print("Sync")

    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = calc_loss(batch, net, tgt_net, device=device)
    loss_t.backward()
    optimizer.step()

    # frame_idx is at least 1 here, so no separate zero check is needed.
    if frame_idx % PLOT_EVERY_FRAMES == 0 and total_rewards:
        # `reward` is None on most frames, so record the latest finished
        # episode instead, and recompute the mean so it is never unbound.
        reward_for_plot.append(total_rewards[-1])
        mean_reward_for_plot.append(float(np.mean(total_rewards[-10:])))
        # .item() stores a plain float instead of a tensor that keeps the
        # autograd graph (and device memory) alive.
        loss_for_plot.append(loss_t.item())
        print("Append data")

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [82]:
# Fresh environment for validating a trained checkpoint against "negamax".
valid_env = make("connectx")
valid_env.reset()
valid_trainer = valid_env.train([None, "negamax"])
valid_observation = valid_trainer.reset()
# Checkpoint locations, relative to the notebook's working directory.
path = "./weights/best_16.pth"
path2 = "./win_best_17.pth"

In [170]:
import base64
# Base64-encode the checkpoint at `path` so the weights can be handled as text.
with open(path, 'rb') as f:
    raw_bytes = f.read()
    encoded_weights = base64.encodebytes(raw_bytes)  # bytes; encodebytes inserts newlines into the output

# NOTE(review): the file name evaluates to "encoded_weightsb64" (no dot) —
# confirm whether "encoded_weights.b64" was intended.
with open("encoded_weights" + 'b64', 'wb') as f:
        f.write(encoded_weights)

In [171]:
# Display the encoded weights. NOTE(review): this dumps the whole payload into
# the notebook output; consider showing only a slice or the length.
encoded_weights

SVIPc5rUDww\nKg+8iLyMPaUSbb/mFhO9+U8hvJiNpzuuVTQ9JGxNPefx4r3j2HS88wljPguOtLwxP+C+l3yQO39l\n+z6fC6O9Cb1Jv46xqDyR4bo8DwFMv5VzDz1AAF48iX01PQYeYT2FitA81lutPU9bTz2XT4u9pHOO\nvo2YkL10YS67wF1UvrHyOr4/mOg8hb0TPoTPlj1rKAi9SHDKPZSMNbzAohE9vPQ3P23K5DyklyK+\n0AXQvlml1jtFVRM9V35VPJdUvTyoWty9ZTyTPdG/JL4u3AI86zCrPE4wWTzzzOI+TPbpvTYxyD0W\nn4m9U+81vHNRI75yJUo+sKlJPYNSAzxafIE99M9NP0K3obxV8qU81aWCvkbz2D4b1Rk+OFUev4+Y\nHr669W+/VUkOvKXAG7yRAYU9IGmrvfrvfLxGIfg8JLEXPznOfj6ohMo9l8SaPVwZgj6MTf+7n0QU\nPttNbr3GqS6+8yYpPcthpD0SZhG9gaXjvCAZmz/Dml28tVT+vOiL8L9aTKU9d6SjPONlzD1XPZw9\nPeOuPXebmb/bzMU8FG+mPblAPT7pPxs98wwiPWxsrDztJ+49eIzCvFAVJjwCh5a8pcr1vZ3CEDke\nQdW87wC7v2Ux0zxQUPg9vtIXvu591zyousE9OExfPy6Z2Tzscpq+Y9oJPeW60T2azuE9N2uaPdN/\n2T3NXiE/GdZnPEG88r1DqEM90tBxvrxgkT1Zrwg+W9IHv/aVkTwyi1S+d7C3PXGhMD2G9uC8UAYr\nPnyIUT1hRX4830COPWfM2j0YLa++VT3wvXIOfj9+D/o7HX+APQcak7008LE+xDlNPVki3j3iVQS9\nu46BPSmZKD4VkSA+dkSGvHrUDD2rKOQ8OFIOPemMHj57G6o9/EKQvc8dzrz9fq+80+QnvmJ0Gz5Z\naUI93QfBPbNuPz7dZFG+jueQPoAcVDxTNwe7xHMCPizp7jzePYU

In [149]:
import io
import base64
import torch

In [150]:
def select_network_action(q_vals_v, observation):
    """Return the best-valued column whose first board row is still empty.

    Args:
        q_vals_v: tensor of Q-values; it is ranked along dim 1 and flattened.
        observation: object exposing the flat board as `observation.board`.

    Returns:
        The column index as a Python int, or None when none of the ranked
        columns has an empty (0) cell in the first board row.
    """
    ranking = torch.argsort(q_vals_v, descending=True, dim=1).view(-1)
    candidates = (index.item() for index in ranking)
    # First candidate, in descending Q-value order, that is still playable.
    return next(
        (column for column in candidates if observation.board[column] == 0),
        None,
    )

In [151]:
def my_agent(observation, configuration):
    """Choose a column for the current board with the trained DQN.

    The network is rebuilt from the module-level `encoded_weights`
    (base64-encoded state dict) on every call.

    Args:
        observation: mapping with a flat 'board'; it is also handed to
            `select_network_action`, which reads `observation.board`.
        configuration: mapping with 'rows' and 'columns'.

    Returns:
        The chosen column as an int.
    """
    rows, columns = configuration['rows'], configuration['columns']
    input_shape = [1, rows, columns]
    n_actions = columns

    np_board = np.array(observation['board']).reshape(1, rows, -1)
    assert np_board.shape[2] == columns

    # Named `weights_stream` rather than `buffer`, which the notebook already
    # uses for the replay buffer.
    weights_stream = io.BytesIO(base64.b64decode(encoded_weights))

    net = DQN(input_shape, n_actions)
    # map_location lets a checkpoint saved from a CUDA model load on a
    # CPU-only machine.
    net.load_state_dict(torch.load(weights_stream, map_location="cpu"))
    net.eval()

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        observation_v = torch.from_numpy(np_board).float()
        q_vals_v = net(observation_v.unsqueeze(0))
    action = select_network_action(q_vals_v, observation)

    return int(action)

In [152]:
# Ask the agent for one move on the initial validation observation.
# NOTE(review): this passes `env.configuration` although the observation comes
# from `valid_env` — confirm both environments share the same configuration.
action = my_agent(valid_observation, env.configuration)

Error: Incorrect padding

In [142]:
# Play one full game of my_agent against "negamax" and render it inline.
valid_env.reset()
valid_env.run([my_agent, "negamax"])
valid_env.render(mode="ipython", width=500, height=450)

In [120]:
import inspect
import os

def write_agent_to_file(function, file):
    """Append the source code of `function` to `file`, creating it if needed.

    Calling this repeatedly with the same file adds another copy of the
    source each time.
    """
    mode = "a" if os.path.exists(file) else "w"
    with open(file, mode) as out_file:
        source_code = inspect.getsource(function)
        out_file.write(source_code)
        print(function, "written to", file)

# Export the agent's source to the submission file. Only the source of
# `my_agent` itself is written; re-running this cell appends another copy.
write_agent_to_file(my_agent, "submission.py")

<function my_agent at 0x7f40cfea29d0> written to submission.py


In [166]:
import sys
# Remember stdout and restore it after loading the submission
# (presumably because loading the file may redirect it — TODO confirm).
out = sys.stdout
# NOTE(review): this reads "./submission_test.py", whereas the export cell
# writes "submission.py" — confirm which file is meant to be validated.
submission = utils.read_file("./submission_test.py")
# NOTE(review): `agent` and `env` are rebound here, replacing the training
# Agent instance and environment defined earlier in the notebook.
agent = utils.get_last_callable(submission)
sys.stdout = out

# Self-play: the submission is valid if both sides finish with status "DONE".
env = make("connectx", debug=True)
env.run([agent, agent])
print("Success!" if env.state[0].status == env.state[1].status == "DONE" else "Failed...")

InvalidArgument: No callable found