# https://www.kaggle.com/vyacheslavponomarev/goosecompetitionnn

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra                     
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Импорты библиотек

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
import pickle
import bz2
import base64
from collections import deque

# Setting up the environment

Класс Env отвечает за окружение.  
Данный класс умеет:
- Генерировать битву между 4 агентами
- Создавать поле и выставлять на нем еду и агентов в случайном порядке
- Выставлять стены лабиринта. <b>При числе стен лабиринта большем 3, не гарантируется связность</b>
- Выдавать агентам награду в соответствии с последним совершенным ходом

In [None]:
import copy
from kaggle_environments.envs.hungry_geese.hungry_geese import Action

# Board geometry: the grid has N_ROWS rows and N_COLS columns.
N_ROWS, N_COLS = 7, 11
N_CELL = N_ROWS * N_COLS
# NOTE(review): presumably the cap on the number of turns per game -- confirm.
max_moves = 200

# Action names in the index order used throughout the file:
# 0 = NORTH, 1 = WEST, 2 = SOUTH, 3 = EAST.
ACTIONS = [
    direction.name
    for direction in (Action.NORTH, Action.WEST, Action.SOUTH, Action.EAST)
]

class Env:
    """Self-play environment for four agents on a wrap-around grid.

    Cells are addressed by a flat index ``11 * row + col``.  The environment
    can:

    - run a complete game between the four agents (``game_start``);
    - build the board and place geese, food and walls on random cells;
    - add ``labyrint`` wall cells (with more than 3 walls the board is not
      guaranteed to stay connected);
    - reward every agent according to the move it has just made.

    Every agent must provide ``get_action(state)`` and
    ``train(reward, state, is_game_end=...)``.
    """

    __slots__ = (
        'board', 'num_move', 'prev_opp_actions_opposite', 'geese', 'rewards',
        'foods', 'agents', 'alive', 'b_state', 'is_game_end', 'labyrint',
    )

    def __init__(self, agents, b_state=None, verbose=False, labyrint=0):
        """Store the agents and set up the first game.

        Args:
            agents: the players; the rest of the class assumes four of them.
            b_state: mapping from cell kind (``'blank'``, ``'x'``, ``'h'``,
                ``'b'``, ``'f'``) to the number written on the board.  When
                omitted the default encoding below is used.
            verbose: print the initial board.
            labyrint: number of wall cells to place.
        """
        if b_state is None:
            # Built per instance so environments never share one mutable default.
            b_state = {
                'blank': 0,  # Empty cell
                'x': -1,  # Head of opponent player
                'h': 1,  # Head of current player
                'b': -2,  # Body cell
                'f': 2,  # Food
            }
        self.agents = agents
        self.init_game(b_state, labyrint, verbose)

    def init_game(self, b_state, labyrint, verbose=False):
        """Reset all game state and build a fresh random board.

        Raises:
            Exception: if a food or wall cell is not blank when it is placed.
        """
        self.labyrint = labyrint
        self.is_game_end = False
        self.alive = np.ones(len(self.agents))
        self.b_state = b_state
        self.num_move = 0
        self.rewards = [0, 0, 0, 0]

        # Fill with the configured blank value (it is not necessarily 0).
        board = np.full((N_ROWS, N_COLS), b_state['blank'], dtype=int)

        # Distinct random cells: one per agent, two foods, then the walls.
        n = np.random.choice(N_CELL, replace=False, size=len(self.agents) + 2 + self.labyrint)
        self.geese = [[n[0]], [n[1]], [n[2]], [n[3]]]
        if labyrint:
            self.foods = n[4:-self.labyrint]
            laby_cells = n[-self.labyrint:]
        else:
            self.foods = n[4:]
            laby_cells = []
        # Geese are stored on the board as body cells; heads are only marked
        # in the per-player view built by get_state_num_player.
        for goose in self.geese:
            for cell in goose:
                row, col = self.row_col(cell)
                board[row][col] = b_state['b']
        # Food
        for food in self.foods:
            food_row, food_col = self.row_col(food)
            if board[food_row][food_col] != b_state['blank']:
                raise Exception('Can\'t initialize board: place for food is not empty')
            board[food_row][food_col] = b_state['f']
        # Walls are encoded exactly like goose bodies.
        for cell in laby_cells:
            laby_row, laby_col = self.row_col(cell)
            if board[laby_row][laby_col] != b_state['blank']:
                raise Exception('Can\'t initialize board: place for labyrint is not empty')
            board[laby_row][laby_col] = b_state['b']
        self.board = board
        self.prev_opp_actions_opposite = [-1, -1, -1, -1]
        if verbose:
            print('INIT BOARD')
            self.print_board()

    def set_agents(self, agents):
        """Replace the players.

        Raises:
            Exception: if ``agents`` does not contain exactly four entries.
        """
        if len(agents) != 4:
            raise Exception('Num agents must be equal 4')
        self.agents = agents

    def game_start(self, verbose=False):
        """Play one full game, re-initialising first if the previous one ended."""
        self.prev_opp_actions_opposite = [-1, -1, -1, -1]
        if self.is_game_end:
            self.init_game(self.b_state, self.labyrint)
        if verbose:
            self.print_board()
        while not self.is_game_end:
            self.next_move(verbose=verbose)
        if verbose:
            print('Game end.')

    def next_move(self, verbose=False):
        """Play one turn: collect actions, apply them, then train the agents.

        Returns:
            ``True`` once the game is over.
        """
        self.rewards = [0, 0, 0, 0]
        if self.is_game_end:
            return self.is_game_end
        # Ask every living agent for its move; -1 stands for "no move".
        moves = [-1, -1, -1, -1]
        for i, a in enumerate(self.agents):
            if self.alive[i]:
                moves[i] = a.get_action(self.get_state_num_player(i))
        # The turn with index max_moves - 1 is the last one of a game.
        if self.num_move >= max_moves - 1:
            self.is_game_end = True
        deads, heads = self.move(moves, verbose=verbose)
        # The game also ends when fewer than two geese are left.
        if np.count_nonzero(self.alive) < 2:
            self.is_game_end = True
        # Geese that died this turn get one final, terminal training step.
        for i, a in enumerate(self.agents):
            if deads[i]:
                a.train(self.rewards[i], self.get_state_num_player(i, heads), is_game_end=True)
                self.alive[i] = False
            elif self.alive[i]:
                a.train(self.rewards[i], self.get_state_num_player(i, heads), is_game_end=self.is_game_end)
        self.num_move += 1
        if verbose:
            print(self.rewards)
            self.print_board()
        return self.is_game_end

    def move(self, moves, verbose=False):
        """Apply one action per goose and update board, geese and rewards.

        Args:
            moves: one entry per goose, an action name or -1.
            verbose: print intermediate information.

        Returns:
            ``(deads, new_moves)``: ``deads[i]`` is ``True`` only for geese
            that died during this call; ``new_moves[i]`` is the target head
            cell of goose ``i`` or -1 when it had no valid target.
        """
        if verbose:
            print('Moves: {}'.format(' '.join(map(str, moves))))
        len_geese = len(self.geese)
        b_state = self.b_state
        deads = [False] * len_geese
        prev_pos = []
        # Head positions before the move.
        old_heads = [goose[0] if len(goose) > 0 else -1 for goose in self.geese]
        if verbose:
            print('Old heads positions: {}'.format(' '.join(map(str, old_heads))))
        # Reversing the previous move is fatal.
        for i in range(len_geese):
            if self.alive[i] and (moves[i] == self.prev_opp_actions_opposite[i]):
                deads[i] = True
                if verbose:
                    print("Player {} move back: {}, last moves: {}".format(i, moves, self.prev_opp_actions_opposite))
        # Drop every living goose's tail; it is restored below if the goose eats.
        for i in range(len_geese):
            if self.alive[i]:
                cell = self.geese[i][-1]
                row, col = self.row_col(cell)
                self.board[row][col] = b_state['blank']
                prev_pos.append(cell)
                del self.geese[i][-1]
            else:
                prev_pos.append(0)
        # Target head cells; -1 marks "no valid target" and kills the goose.
        new_moves = [-1] * len_geese
        for i in range(len_geese):
            if not self.alive[i]:
                continue
            if moves[i] == -1:
                deads[i] = True
                continue
            new_cell = self.get_head_cell(moves[i], old_heads[i])
            if new_cell == -1:
                deads[i] = True
                continue
            new_moves[i] = new_cell
        if verbose:
            print('New heads positions: {}'.format(' '.join(map(str, new_moves))))
        # Two heads aiming at the same cell kill both geese.
        for i in range(len_geese):
            if self.alive[i]:
                for j in range(i + 1, len_geese):
                    if self.alive[j] and new_moves[i] == new_moves[j]:
                        deads[i] = True
                        deads[j] = True
                        break

        # Food.  Geese without a valid target are skipped: resolving -1
        # through row_col would alias cell (0, 10) and let them "eat" there.
        eaten = [False, False]
        for i in range(len_geese):
            if not self.alive[i] or new_moves[i] == -1:
                continue
            row, col = self.row_col(new_moves[i])
            if self.board[row][col] != b_state['f']:
                continue
            self.rewards[i] += 1
            # Growing means keeping the tail cell removed above.
            self.geese[i].append(prev_pos[i])
            row_body, col_body = self.row_col(prev_pos[i])
            self.board[row_body][col_body] = b_state['b']
            # Remember which of the two foods has to be respawned.
            if new_moves[i] == self.foods[0]:
                eaten[0] = True
            else:
                eaten[1] = True

        # Every 40 turns decrease len goose by 1
        if self.num_move % 40 == 39:
            if verbose:
                print('Clear 1 cell from each goose')
            for i in range(len_geese):
                if not self.alive[i]:
                    continue
                if verbose:
                    print('Goose {} have len {}'.format(i, len(self.geese[i])))
                if len(self.geese[i]) > 0:
                    cell = self.geese[i][-1]
                    row, col = self.row_col(cell)
                    self.board[row][col] = b_state['blank']
                    del self.geese[i][-1]
                else:
                    # Nothing left to shrink: the goose dies.
                    if verbose:
                        print('Goose {} is dead'.format(i))
                    deads[i] = True

        # Place the new heads.  A goose without a valid target is already
        # dead and must not write a phantom head onto the board.
        for i, coord in enumerate(new_moves):
            if not self.alive[i] or coord == -1:
                continue
            row, col = self.row_col(coord)
            if self.board[row][col] == b_state['b']:
                deads[i] = True
                continue
            self.board[row][col] = b_state['b']
            self.geese[i].insert(0, coord)

        # Remove the geese that died this turn; afterwards deads[i] is True
        # only for those geese.
        for i in range(len_geese):
            is_dead = deads[i]
            deads[i] = False
            if is_dead and self.alive[i]:
                self.alive[i] = False
                deads[i] = True
                # Punish for death
                self.rewards[i] -= 10
                for cell in self.geese[i]:
                    row, col = self.row_col(cell)
                    self.board[row][col] = b_state['blank']
                self.geese[i] = []

        # Respawn eaten food.
        for slot in range(len(eaten)):
            if eaten[slot]:
                self._respawn_food(slot)

        # Survival bonus for every goose that did not die this turn.
        self.rewards = [r + 1 if not deads[i] else r for i, r in enumerate(self.rewards)]
        # Remember which move would be a reversal next turn.
        self.prev_opp_actions_opposite = [self.get_opposite(move) for move in moves]
        return deads, new_moves

    def _respawn_food(self, slot):
        """Put food number ``slot`` on a uniformly random blank cell."""
        blank = self.b_state['blank']
        idx = np.random.randint(N_CELL)
        row, col = self.row_col(idx)
        while self.board[row][col] != blank:
            idx = np.random.randint(N_CELL)
            row, col = self.row_col(idx)
        self.board[row][col] = self.b_state['f']
        self.foods[slot] = idx

    def row_col(self, num, *args, **kwargs):
        """Convert a flat cell index into ``(row, col)``.

        The row is truncated towards zero, so -1 maps to ``(0, 10)``.
        """
        row, col = int(num / 11), num % 11
        return row, col

    def get_opposite(self, move):
        """Return the action name opposite to ``move``, or -1 if it is not an action."""
        opposites = {
            Action.NORTH.name: Action.SOUTH.name,
            Action.SOUTH.name: Action.NORTH.name,
            Action.EAST.name: Action.WEST.name,
            Action.WEST.name: Action.EAST.name,
        }
        if move not in ACTIONS:
            return -1
        return opposites[move]

    def get_state(self):
        """Return the shared board (not a copy)."""
        return self.board

    def print_board(self):
        """Print the move counter, the alive flags and the board."""
        print('Num move {}. Alive: {}'.format(self.num_move, ' '.join(map(str, self.alive))))
        for row in self.board:
            print(' '.join(map(str, row)))
        print()

    def get_state_num_player(self, num_player=0, heads=None):
        """Build the board as seen by player ``num_player``.

        The shared board is copied, the heads of all living geese are marked
        (``'h'`` for the player itself, ``'x'`` for opponents) and the copy is
        shifted so that the player's head ends up at row 3, column 5.

        Args:
            num_player: index of the player.
            heads: head cell per goose; when omitted or empty the current
                heads are read from ``self.geese`` (-1 for an empty goose).
        """
        if heads is None or len(heads) == 0:
            heads = [h[0] if len(h) > 0 else -1 for h in self.geese]
        player_state = self.get_state().copy()
        for i in range(len(self.agents)):
            if not self.alive[i]:
                continue
            head_row, head_col = self.row_col(heads[i])
            marker = 'h' if i == num_player else 'x'
            player_state[head_row][head_col] = self.b_state[marker]
        return self.centroid_agent(player_state, heads[num_player])

    def get_head_cell(self, action, cell):
        """Return the cell reached from ``cell`` by ``action`` (wrapping around).

        Returns -1 for an unknown action or when the target is a body cell.
        """
        row, col = self.row_col(cell)
        new_row, new_col = row, col
        if action == Action.NORTH.name:
            new_row -= 1
        elif action == Action.SOUTH.name:
            new_row += 1
        elif action == Action.EAST.name:
            new_col += 1
        elif action == Action.WEST.name:
            new_col -= 1
        else:
            return -1
        new_row, new_col = new_row % N_ROWS, new_col % N_COLS
        if self.board[new_row][new_col] == self.b_state['b']:
            return -1
        return self.get_num_cell(new_row, new_col)

    def centroid_agent(self, board, head):
        """Cyclically shift ``board`` so that cell ``head`` lands at row 3, column 5."""
        head_row, head_col = self.row_col(head)
        # X is [0, 11), center is 5; Y is [0, 7), center is 3.
        return np.roll(board, (3 - head_row, 5 - head_col), axis=(0, 1))

    def get_num_cell(self, row, col):
        """Convert ``(row, col)`` into the flat cell index."""
        return 11 * row + col

# Creating the model 

The model is a dueling DQN built from fully connected layers: the flattened input is fed to two heads, one estimating the state value V(s) and one estimating the advantages A(s, a) of the four possible ACTIONS - `NORTH`, `WEST`, `SOUTH` and `EAST`. The two heads are combined into Q-values; no softmax is applied to the output.

The input state must be a torch.FloatTensor of size `(batch_size, 1, 4, 4, 5)` (the default `input_shape` is `(4, 4, 5)`).

In [None]:
class DuelingDQN(nn.Module):
    """Dueling Q-network made of two fully connected heads.

    Both heads flatten the input and pass it through one hidden layer of half
    the input size.  ``fc_v`` outputs the state value V(s); ``fc_a`` outputs
    one advantage A(s, a) per action.

    Args:
        ACTIONS: sequence of actions; its length is the number of outputs.
        input_shape: shape of one flattened sample (without the batch axis).
    """

    def __init__(self, ACTIONS, input_shape=(4, 4, 5)):
        super(DuelingDQN, self).__init__()
        self.ACTIONS = ACTIONS

        n_neurons = int(np.prod(input_shape))
        n_hidden = n_neurons // 2
        # Advantages A(s, a)
        self.fc_a = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_neurons, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, len(ACTIONS)),
        )
        # Values of state V(s)
        self.fc_v = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_neurons, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 1),
        )

    def forward(self, state):
        """Return Q-values of shape ``(batch, len(ACTIONS))`` for ``state``."""
        val = self.fc_v(state)
        adv = self.fc_a(state)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
        # The mean is taken per sample (over the action axis); averaging over
        # the whole batch would make a sample's Q-values depend on whatever
        # else happens to be in the batch.
        return val + adv - adv.mean(dim=1, keepdim=True)

# Creating the agents

TrainingAgent is used to train the passed model.  
At each training step, TrainingAgent receives the current state of the environment (np.ndarray), makes a decision (according to the `ACTIONS`), passes the action back to the environment, receives the `reward`, `next_state` and the flag of episode completion, fills the buffers and trains the `training_model` according to the following algorithm:

1. Sample batch_size frames `(s, a, r, s', d)` from the buffers.
1. Calculate the predicted Q-values using the `training_model`.
1. Calculate the expected Q-values employing the `target_model` according to Bellman optimality equation
1. Compare the two Q-values using the loss function (specifically, MSE loss) and pass the gradients back to only the training model.
1. Sync the training and target model every `sync_time` steps.
1. Update the `epsilon` value.

In [None]:
class TrainingAgent():
    """Epsilon-greedy DQN agent that trains ``training_model`` online.

    ``get_action`` picks a move and records ``(state, action)``; the matching
    ``train`` call records ``(reward, next_state, done)`` and performs one
    optimisation step on a batch sampled from these buffers.  Expected
    Q-values come from ``target_model``, which is overwritten with the
    training weights every ``sync_time`` frames.

    Actions are indices into ``ACTIONS``; ``prev_action`` is the index of the
    last action taken, or -1 when there is none.
    """

    def __init__(self, training_model, target_model, loss=nn.MSELoss(), device=torch.device("cpu"),
                 batch_size=3, sync_time=250, buffer_size=100000,
                 eps=1, eps_min=1e-3, eps_decay_time=500,
                 gamma=0.5, learning_rate=3.1415e-4,
                 reward_lookup_window=10, reward_threshold=500,
                 ACTIONS=ACTIONS, prev_action=0, name='TrainingAgent',
                 is_train = True):
        self.is_train = is_train
        self.prev_action = prev_action
        self.training_model = training_model
        self.training_model.device = device
        self.target_model = target_model
        self.target_model.device = device
        # Only the training model is optimised.
        self.optimizer = torch.optim.Adam(training_model.parameters(), lr=learning_rate)
        self.loss = loss
        self.device = device

        # Replay buffers; entry i of every buffer belongs to the same transition.
        self.state_buffer = deque(maxlen=buffer_size)
        self.action_buffer = deque(maxlen=buffer_size)
        self.next_state_buffer = deque(maxlen=buffer_size)
        self.reward_buffer = deque(maxlen=buffer_size)
        self.done_buffer = deque(maxlen=buffer_size)  # is_game_end

        self.total_rewards = []  # one summed reward per finished episode
        self.total_reward = 0  # running sum for the current episode

        self.batch_size = batch_size
        self.sync_time = sync_time
        self.eps = eps
        self.eps_min = eps_min
        self.eps_decay_time = eps_decay_time
        self.gamma = gamma
        self.reward_lookup_window = reward_lookup_window
        self.reward_threshold = reward_threshold
        self.ACTIONS = ACTIONS
        self.name = name
        self.frames = 0
        self.best_mean_reward = 0
        self.finished_training = False
        self.input_size = None

    def _blocked_move(self):
        """Return the index of the move that reverses ``prev_action``, or -1 if there is none."""
        if self.prev_action < 0:
            return -1
        return self.get_opposite_idx_move(self.prev_action)

    def get_action(self, state):
        """
            Receive the state (np.ndarray), transform it to the tensor format,
            choose an action and fill the buffers.

            Returns the chosen entry of ``self.ACTIONS``.
        """
        # edit_last_player_move expects the index of the reversing move, not
        # the previous action itself.
        state = self.edit_last_player_move(state, self._blocked_move())
        state_t = torch.FloatTensor(state).view(1, 1, *state.shape).to(self.device)
        if self.input_size is None:
            self.input_size = state_t.size()
        if self.is_train and np.random.rand() < self.eps:
            # Epsilon-greedy exploration that never picks the reversing move.
            forbidden = self._blocked_move()
            action = np.random.choice(range(len(self.ACTIONS)))
            while action == forbidden:
                action = np.random.choice(range(len(self.ACTIONS)))
        else:
            # Choose the action with the best predicted Q-value.
            with torch.no_grad():
                q_values = self.training_model(state_t)
            action = q_values.max(1)[1].item()
        # Keep a plain int so that the buffers hold one uniform type.
        action = int(action)
        self.prev_action = action
        # Fill the buffers and pass the action to the environment
        self.state_buffer.append(state_t)
        self.action_buffer.append(action)
        return self.ACTIONS[action]

    def cut_buffers(self, window, shift=None):
        """Thin every buffer to each ``shift``-th entry and keep the last ``window`` of those.

        ``shift`` defaults to ``int(sqrt(window))``.
        """
        if shift is None:
            shift = int(window**0.5)
        shift = max(1, shift)  # a slice step of 0 is invalid

        def _cut(buffer):
            # deque does not support slicing, so go through a list.
            return deque(list(buffer)[::shift][-window:], maxlen=buffer.maxlen)

        self.state_buffer = _cut(self.state_buffer)
        self.action_buffer = _cut(self.action_buffer)
        self.reward_buffer = _cut(self.reward_buffer)
        self.next_state_buffer = _cut(self.next_state_buffer)
        self.done_buffer = _cut(self.done_buffer)

    def train(self, reward, state, is_game_end=False):
        """
            A training step. After passing the action to the environment, the agent
            receives a reward, the next state and the marker indicating the end of
            the episode (is_game_end). It adds this data to the buffers, samples a
            batch, calculates the Q-values and the expected Q-values and takes one
            optimiser step on the training model.
        """
        state = self.edit_last_player_move(state, self._blocked_move())
        # Receive the response from the environment about action taken
        self.next_state_buffer.append(torch.FloatTensor(state).view(1, 1, *state.shape))
        self.reward_buffer.append(reward)
        self.done_buffer.append(is_game_end)

        # Each step moves eps towards eps_min by frames / eps_decay_time of
        # the remaining distance.
        self.eps = max(self.eps_min, self.eps - self.frames * (self.eps - self.eps_min) / self.eps_decay_time)
        self.frames += 1

        # Sync the target network once every sync_time frames.
        if self.frames % self.sync_time == 0:
            self.target_model.load_state_dict(self.training_model.state_dict())

        # Add the rewards if the episode ended
        self.total_reward += reward
        if is_game_end:
            self.prev_action = -1
            self.total_rewards.append(self.total_reward)
            self.total_reward = 0

        # The mean is undefined until at least one episode has finished.
        if self.frames > self.reward_lookup_window and self.total_rewards:
            mean_reward = np.mean(self.total_rewards[-self.reward_lookup_window:])
            # If the current mean reward is better than the best mean reward, save the model
            if mean_reward > self.best_mean_reward:
                torch.save(self.training_model.state_dict(), self.name + '-best.dat')
                self.best_mean_reward = mean_reward
            # If the current mean reward exceeds the threshold, stop training
            if mean_reward > self.reward_threshold:
                self.finished_training = True

        with torch.enable_grad():
            # Begin training
            self.optimizer.zero_grad()

            # Choose a batch of (s, a, r, s', d), sampled with replacement.
            # Only the sampled entries are gathered, so the whole buffer is
            # never concatenated.
            n_stored = len(self.done_buffer)
            idx = [int(i) for i in np.random.choice(n_stored, size=min(n_stored, self.batch_size))]
            states_v = torch.cat([self.state_buffer[i] for i in idx]).float().to(self.device)
            ACTIONS_v = torch.tensor([int(self.action_buffer[i]) for i in idx],
                                     dtype=torch.long, device=self.device)
            next_states_v = torch.cat([self.next_state_buffer[i] for i in idx]).float().to(self.device)
            rewards_v = torch.tensor([float(self.reward_buffer[i]) for i in idx],
                                     dtype=torch.float32, device=self.device)
            done_mask = torch.tensor([bool(self.done_buffer[i]) for i in idx],
                                     dtype=torch.bool, device=self.device)

            if len(states_v.size()) == 3:
                states_v = states_v.view(1, *states_v.size())
                next_states_v = next_states_v.view(1, *next_states_v.size())

            # Q-value of the action that was actually taken in each state.
            q_values = self.training_model(states_v).gather(1, ACTIONS_v.unsqueeze(-1)).squeeze(-1)
            # max_a' Q'(s', a') from the target network.  No gradient is
            # needed there, so the target model is never trained.
            with torch.no_grad():
                target_q_values = self.target_model(next_states_v).max(1)[0]
                # For the last transition of an episode the expected value is
                # R alone, so the bootstrap term is zeroed.
                target_q_values[done_mask] = 0

            # Calculate the loss and make a step backwards through the training network
            expected_q_values = self.gamma * target_q_values + rewards_v
            L = self.loss(expected_q_values, q_values)
            L.backward()
            self.optimizer.step()

    def get_opposite_idx_move(self, idx):
        """Return the index of the action opposite to ``idx``."""
        # ['NORTH', 'WEST', 'SOUTH', 'EAST']
        return (idx + 2) % 4

    def edit_last_player_move(self, board, idx_opposite):
        """Turn a head-centred 7x11 board into four 4x5 directional windows.

        The cell next to the head (row 3, column 5) in direction
        ``idx_opposite`` is first marked as body (-2); any other value leaves
        the board untouched.  The board is then cut into four 4x5 windows,
        each oriented so that the head sits at row 3, column 2.  The caller's
        array is not modified.

        Returns:
            float array of shape ``(4, 4, 5)``.
        """
        board = np.array(board, copy=True)
        # ['NORTH', 'WEST', 'SOUTH', 'EAST']
        # CENTER BOARD [3, 5]
        if idx_opposite == 0:
            board[2][5] = -2
        elif idx_opposite == 1:
            board[3][4] = -2
        elif idx_opposite == 2:
            board[4][5] = -2
        elif idx_opposite == 3:
            board[3][6] = -2
        # board_rot[i][j] == board[5 - j][2 + i]
        board_rot = board[1:6, 2:9][::-1].T.astype(float)
        small_boards = np.array([
            board[:4, 3:8],
            board_rot[:4],
            board[3:, 3:8][::-1, ::-1],
            board_rot[3:][::-1, ::-1],
        ])
        return small_boards


# Training

Определим вспомогательные функции

In [None]:
def get_mean_reward(agent):
    """Return the mean of the agent's last ``reward_lookup_window`` episode rewards.

    Returns the string ``'N/A'`` when no mean can be computed (no finished
    episode yet, or the mean is NaN).
    """
    recent = agent.total_rewards[-agent.reward_lookup_window:]
    if len(recent) == 0:
        return 'N/A'
    r = np.mean(recent)
    # NaN has to be detected by value; an identity test against np.nan
    # never matches a freshly computed NaN.
    return 'N/A' if np.isnan(r) else r

In [None]:
def get_best_agent(rewards):
    """Return the ``(agent, mean reward)`` pair with the highest mean reward.

    ``rewards`` maps each agent to a sequence of rewards.  On a tie the agent
    that comes first in ``rewards`` wins.
    """
    ranking = []
    for agent, reward in rewards.items():
        ranking.append((agent, np.mean(reward)))
    # Stable sort, best mean first.
    ranking.sort(key=lambda pair: -pair[1])
    return ranking[0]

In [None]:
# When True the model weights are restored from a dumped state dict;
# when False a freshly initialised model is used.
LOAD_FROM_CHECKPOINT = False

In [None]:
# Either way we end up with a DuelingDQN saved to disk and an empty reward log.
model = DuelingDQN(ACTIONS=ACTIONS)
if LOAD_FROM_CHECKPOINT:
    # The checkpoint is a text file holding a base64-encoded, bz2-compressed
    # pickle of the state dict.
    with open('../input/dump--/dump.txt', 'r') as f:
        state_dict_dump = bytes(f.read(), 'utf-8')
    # NOTE(review): pickle.loads can execute arbitrary code -- only load
    # dumps that come from a trusted source.
    state_dict = pickle.loads(bz2.decompress(base64.b64decode(state_dict_dump)))
    model.load_state_dict(state_dict)
    model.eval()
generational_rewards = []
torch.save(model, 'gen_model_dueling_dqn.dat')

# Соревнование с ботами

<code>IntermediateAgent</code> нужен для обертки функции agent в класс

In [None]:
class IntermediateAgent:
    """Adapter that exposes a plain ``state -> action`` callable through the agent interface."""

    __slots__ = ('action',)

    def __init__(self, action=lambda board: "NORTH"):
        # The wrapped policy; the default one always answers "NORTH".
        self.action = action

    def get_action(self, state=[[]]):
        """Delegate the decision to the wrapped callable."""
        policy = self.action
        return policy(state)

    def train(self, reward=0, state=[[]], is_game_end=False):
        """Accept the training signal and ignore it: static bots do not learn."""
        return None

In [None]:
def run_round_static(agents, static_agents, divide=2, games_per_match=10, labyrint=0):
    """Let every agent play against the static bots and keep the best ones.

    Each agent plays ``games_per_match`` games together with
    ``static_agents``; agents are then ranked by ``get_mean_reward`` and the
    best ``len(agents) // divide`` of them (at least one) are kept.

    Returns:
        A dict keys view holding the surviving agents, best first.
    """
    def _score(item):
        # A reward reported as a string (e.g. 'N/A') ranks last.
        reward = item[1]
        return float('-inf') if isinstance(reward, str) else reward

    rewards = {}
    # Play all the games.
    for agent in agents:
        for _ in range(games_per_match):
            env = Env([agent, *static_agents], labyrint=labyrint)
            env.game_start(verbose=False)
        rewards[agent] = get_mean_reward(agent)
    ranked = sorted(rewards.items(), key=lambda p: -_score(p))
    # Integer division may yield 0 (e.g. a single agent with divide=2);
    # always keep at least the best agent.
    keep = max(1, len(ranked) // divide)
    half_items = ranked[:keep]
    print('Best reward: {}'.format(half_items[0]))
    return dict(half_items).keys()

In [None]:
def run_competition_static(num_epoch=3, games_per_match=10, agent_class=TrainingAgent, 
                           static_agents_methods=[lambda state: "NORTH"] * 3,
                           model_path='dueling_dqn_competition.dat', is_difference=False, state_agent=None,
                           labyrint=0, **kwargs):
    """Run ``num_epoch`` elimination rounds against static bots and return the winner.

    With ``state_agent`` given, that single agent plays every round and is
    never eliminated.  Otherwise ``2**num_epoch`` agents are created with
    ``initialize_agent`` and each round uses the default split of
    ``run_round_static``.

    NOTE(review): ``initialize_agent`` is not defined in the visible part of
    the file -- confirm how it uses ``model_path``, ``is_difference`` and the
    extra keyword arguments.
    """
    if state_agent:
        agents = [state_agent]
        # Keep the whole "population" (the single agent) after each round.
        round_options = {'divide': 1}
    else:
        population = 2 ** num_epoch
        agents = [
            initialize_agent(model_path, is_difference=is_difference, **kwargs)
            for _ in range(population)
        ]
        round_options = {}
    # Wrap the plain policy functions into agent objects.
    bots = [IntermediateAgent(method) for method in static_agents_methods]
    for epoch in range(num_epoch):
        print('EPOCH {}: '.format(epoch), end='')
        agents = run_round_static(agents, bots, games_per_match=games_per_match,
                                  labyrint=labyrint, **round_options)
    return list(agents)[0]

# Объявление ботов

## SimpleRandomAgent

In [None]:
# Index into ACTIONS that the fixed-direction bot keeps answering with.
dir_ = 0

def set_dir_agent_simple_random():
    """Draw a new random direction index (0-3), store it in ``dir_`` and return it."""
    global dir_
    chosen = np.random.randint(4)
    dir_ = chosen
    return chosen

def agent_simple_random(state=0):
    """Return the action name for the stored direction; ``state`` is ignored."""
    return ACTIONS[dir_]

## SharpyAgent

In [None]:
class SharpyAgent:
    """Rule-based bot that moves next to opponent heads on its diagonals.

    It expects a board centred on its own head (row 3, column 5) where -1 is
    an opponent head and -2 is a body cell.  Action indices follow
    ``ACTIONS``: 0 = NORTH, 1 = WEST, 2 = SOUTH, 3 = EAST.
    ``prev_opp_action`` holds the index of the move that would reverse the
    previous one (-1 before the first move).
    """

    # Layout around the head 0 at [3, 5]:
    # [ a x b
    #   x 0 x
    #   c x d ]
    # For every diagonal cell: the two neighbouring cells that also touch it,
    # each paired with the action leading there.
    _CORNERS = (
        ((2, 4), ((2, 5), 0), ((3, 4), 1)),  # a
        ((2, 6), ((2, 5), 0), ((3, 6), 3)),  # b
        ((4, 4), ((4, 5), 2), ((3, 4), 1)),  # c
        ((4, 6), ((4, 5), 2), ((3, 6), 3)),  # d
    )

    def __init__(self):
        self.prev_opp_action = -1
        self.ACTIONS = ACTIONS

    def _usable(self, state, cell, action):
        """Tell whether ``cell`` is free (value >= 0) and ``action`` is not a reversal."""
        return state[cell] >= 0 and self.prev_opp_action != action

    def agent(self, state=0):
        """Choose an action name for the head-centred board ``state``.

        If an opponent head sits on a diagonal neighbour, move to a usable
        cell adjacent to both heads (chosen at random when both are usable;
        later corners override earlier ones).  Otherwise move to the
        neighbouring cell with the highest value, never reversing.
        """
        idx = -1
        for corner, (cell_1, action_1), (cell_2, action_2) in self._CORNERS:
            if state[corner] != -1:
                continue
            if self._usable(state, cell_1, action_1):
                if self._usable(state, cell_2, action_2):
                    idx = (action_1, action_2)[np.random.randint(2)]
                else:
                    idx = action_1
            elif self._usable(state, cell_2, action_2):
                idx = action_2
        if idx == -1:
            idxs = [state[2, 5], state[3, 4], state[4, 5], state[3, 6]]
            # Mask the reversing move -- but only if there is one: with -1
            # the index would wrap around and wrongly forbid EAST.
            if 0 <= self.prev_opp_action < len(idxs):
                idxs[self.prev_opp_action] = -100
            idx = int(np.argmax(idxs))
        # Remember which move would be a reversal next turn.
        self.prev_opp_action = self.get_opposite_idx_move(idx)
        return self.ACTIONS[idx]

    def get_opposite_idx_move(self, idx):
        """Return the index of the action opposite to ``idx``."""
        # ['NORTH', 'WEST', 'SOUTH', 'EAST']
        return (idx + 2) % 4

## GreedyAgent

In [None]:
class GreedyAgent:
    """Rule-based bot that scores the four moves around its head and takes the best.

    The bot reads a board indexed as ``state[row, col]`` and only looks at the
    cells around ``[3, 5]`` (assumes the caller passes a board re-centred on the
    bot's own head - confirm against the environment). Cell codes follow the
    original comments: 2 = food, -1 = opponent head, -2 = body of any player.

    Move indices follow ``ACTIONS``: 0 = NORTH, 1 = WEST, 2 = SOUTH, 3 = EAST.
    """

    # For every move index: (cell the head would enter, cell one step further).
    #         1,5
    #     2,4 2,5 2,6
    # 3,3 3,4 3,5 3,6 3,7
    #     4,4 4,5 4,6
    #         5,5
    _TARGETS = (
        ((2, 5), (1, 5)),  # NORTH
        ((3, 4), (3, 3)),  # WEST
        ((4, 5), (5, 5)),  # SOUTH
        ((3, 6), (3, 7)),  # EAST
    )
    # Diagonal neighbours of the head and the two moves that approach each one.
    _CORNERS = (
        ((2, 4), (0, 1)),
        ((2, 6), (0, 3)),
        ((4, 4), (2, 1)),
        ((4, 6), (2, 3)),
    )

    def __init__(self):
        # Index of the move that would reverse the previous one; -1 = no move yet.
        self.prev_opp_action = -1
        self.ACTIONS = ACTIONS

    def agent(self, state=0):
        """Choose a move for the given board and remember its opposite.

        Args:
            state: 2-D array indexable as ``state[row, col]`` covering at least
                rows 1..5 and columns 3..7.

        Returns:
            The chosen action name taken from ``self.ACTIONS``.
        """
        # Base score of a move is the content of the cell it enters.
        scores = [state[cell] for cell, _ in self._TARGETS]

        def is_safe(move):
            cell, beyond = self._TARGETS[move]
            return (
                state[cell] >= 0
                and state[beyond] != -1
                and self.prev_opp_action != move
            )

        # A positive value on a diagonal rewards every safe move towards it.
        for corner, moves in self._CORNERS:
            if state[corner] > 0:
                for move in moves:
                    if is_safe(move):
                        scores[move] += 1

        for move, (cell, beyond) in enumerate(self._TARGETS):
            # Entering an occupied cell is heavily penalised ...
            if state[cell] < 0:
                scores[move] -= 20
            # ... and so is stepping next to an opponent head.
            if state[beyond] == -1:
                scores[move] -= 10

        # Never reverse. The guard matters on the first move: with the initial
        # value -1 an unguarded assignment would hit scores[-1] and ban EAST.
        if self.prev_opp_action >= 0:
            scores[self.prev_opp_action] = -100
        idx = int(np.argmax(scores))
        # Remember the last move (as the index of its opposite).
        self.prev_opp_action = self.get_opposite_idx_move(idx)
        return self.ACTIONS[idx]

    def get_opposite_idx_move(self, idx):
        """Return the index of the move opposite to ``idx``."""
        # ['NORTH', 'WEST', 'SOUTH', 'EAST']
        return (idx + 2) % 4

## Соревнование агентов друг с другом

In [None]:
def initialize_agent(model_path, agent_class=TrainingAgent, is_difference=False, **kwargs):
    """Create one agent with a freshly built or freshly loaded pair of networks.

    Args:
        model_path: path of a saved model, or a list/ndarray of paths from
            which one is drawn at random.
        agent_class: class instantiated with ``training_model``,
            ``target_model`` and ``**kwargs``.
        is_difference: when true, ``model_path`` is ignored and two new
            ``DuelingDQN`` networks are constructed instead.
        **kwargs: forwarded unchanged to ``agent_class``.

    Returns:
        The constructed ``agent_class`` instance.
    """
    if not is_difference:
        has_choices = isinstance(model_path, (list, np.ndarray))
        path = model_path[np.random.randint(len(model_path))] if has_choices else model_path
        # Load twice so the two networks are separate objects.
        training_model = torch.load(path)
        target_model = torch.load(path)
    else:
        training_model = DuelingDQN(ACTIONS=ACTIONS)
        target_model = DuelingDQN(ACTIONS=ACTIONS)
    return agent_class(training_model=training_model, target_model=target_model, **kwargs)

In [None]:
def run_match(agents, num_games, labyrint=0):
    """
        Plays num_games games between the given agents (4 are expected) and
        records get_mean_reward(agent) for every agent after each game.
        Returns get_best_agent(rewards); run_round unpacks it as (winner, reward).
    """
    rewards = {}
    for agent in agents:
        rewards[agent] = []
    for _ in range(num_games):
        # A new environment is built for every game.
        env = Env(agents, labyrint=labyrint)
        env.game_start(verbose=False)
        for agent in agents:
            history = rewards[agent]
            history.append(get_mean_reward(agent))
    return get_best_agent(rewards)

def run_round(agents, games_per_match, labyrint=0):
    """
        Runs 1 round: shuffles the agents in place, then repeatedly takes the
        next 4 as a match. Returns (winners, rewards), one entry per match.
        Note: the passed-in list is emptied as a side effect.
    """
    np.random.shuffle(agents)
    winners, rewards = [], []
    while agents:
        # Take the next group of competitors and drop them from the pool
        # right away so the losers can be garbage-collected after the match.
        group = agents[:4]
        del agents[:4]
        best, best_reward = run_match(group, games_per_match, labyrint=labyrint)
        winners.append(best)
        rewards.append(best_reward)
    return winners, rewards

def run_competition(num_rounds=3, games_per_match=10, agent_class=TrainingAgent, model_path='dueling_dqn_competition.dat', is_difference=False,
                    labyrint=0, **kwargs):
    """Run a knockout tournament and return the overall winner.

    ``4 ** num_rounds`` agents are created, and every round keeps one winner
    out of each group of four, so a single agent is left after ``num_rounds``.

    Args:
        num_rounds: number of knockout rounds.
        games_per_match: games played inside every match.
        agent_class: class used to build the agents (forwarded to
            ``initialize_agent``; previously it was accepted but ignored).
        model_path: passed to ``initialize_agent``.
        is_difference: passed to ``initialize_agent``.
        labyrint: passed through ``run_round`` to the environment.
        **kwargs: extra keyword arguments for the agent constructor.

    Returns:
        The first agent remaining after the last round.
    """
    agents = [
        initialize_agent(model_path, agent_class=agent_class, is_difference=is_difference, **kwargs)
        for _ in range(4 ** num_rounds)
    ]
    for i in range(num_rounds):
        agents, rewards = run_round(agents, games_per_match, labyrint=labyrint)
        print('ROUND {} | # WINNERS: {} | MAX REWARD: {}'.format(i, len(agents), max(rewards)))
    return agents[0]

# Обучение

In [None]:
# Torch device handed to run_competition below; everything runs on the CPU.
device=torch.device("cpu")

## Обучение с нуля (при необходимости)

In [None]:
# Train the agents not to move backwards
# and pick the one among them that does not.
# is_difference=True makes initialize_agent build fresh DuelingDQN networks
# instead of loading model_path; num_rounds=4 means 4**4 = 256 agents compete.
model_path = 'gen_model_dueling_dqn.dat'
for i in range(1):
    winner = run_competition(device=device, sync_time=1000, gamma=1, games_per_match = 10, 
                             model_path=model_path, num_rounds=4, is_difference=True)
    # Persist the winner's network so later cells can start from it.
    torch.save(winner.training_model, model_path)
    model = torch.load(model_path)

## Загрузка сохранённых ботов

In [None]:
# Rebuild previously dumped bots and re-save each one as a full torch model.
model_load = ['agent_avoid_wall_v1.txt', 'agent_tango_v1.txt']
bots_paths = []
for i, path_load in enumerate(model_load):
    with open('../input/dump--/' + path_load, 'r') as f:
        state_dict_dump = f.read()
        state_dict_dump = bytes(state_dict_dump, 'utf-8')
    # The dump is base64 text wrapping a bz2-compressed pickle of a state_dict.
    # NOTE(review): pickle.loads executes arbitrary code - only use trusted dumps.
    state_dict = pickle.loads(bz2.decompress(base64.b64decode(state_dict_dump)))
    model = DuelingDQN(ACTIONS=ACTIONS)
    model.load_state_dict(state_dict)
    model.eval()
    # Saved under "<index>.dat" in the working directory.
    new_path = str(i) + '.dat'
    torch.save(model, new_path)
    model = torch.load(new_path)
    bots_paths.append(new_path)
# Last expression of the cell: displays the list of saved paths.
bots_paths

Бот будет проходить тренировку против статичных ботов

In [None]:
# Three rule-based GreedyAgent opponents; their bound `agent` methods are
# what run_competition_static receives as static players.
agents_sharpy = [GreedyAgent(), GreedyAgent(), GreedyAgent()]
methods_agents_sharpy = [a.agent for a in agents_sharpy]
for i in range(10):
    # The previous winner is passed back in as state_agent on every iteration.
    winner = run_competition_static(num_epoch=1, games_per_match=20, agent_class=TrainingAgent, batch_size=50,
                           static_agents_methods=methods_agents_sharpy, model_path=model_path, state_agent=winner)
    # Checkpoint after every iteration.
    torch.save(winner.training_model, model_path)
    model = torch.load(model_path)

In [None]:
# Keep improving the best of the starting models (it has passed the trial against the bots).
# num_rounds=1 -> 4 agents loaded from model_path play a single match of 100 games;
# labyrint=20 is forwarded to the environment.
for i in range(1):
    winner = run_competition(device=device, batch_size=500, sync_time=1000, gamma=1, games_per_match = 100, model_path=model_path, num_rounds=1,
                            labyrint=20)
    torch.save(winner.training_model, model_path)
    model = torch.load(model_path)

In [None]:
# Same static-opponent training as before, now against two SharpyAgent bots
# and one GreedyAgent, with a larger batch size.
agents_sharpy = [SharpyAgent(), SharpyAgent(), GreedyAgent()]
methods_agents_sharpy = [a.agent for a in agents_sharpy]
for i in range(10):
    winner = run_competition_static(num_epoch=1, games_per_match=20, agent_class=TrainingAgent, batch_size=250,
                           static_agents_methods=methods_agents_sharpy, model_path=model_path, state_agent=winner)
    # Checkpoint after every iteration.
    torch.save(winner.training_model, model_path)
    model = torch.load(model_path)

In [None]:
# Keep improving the best of the starting models (it has passed the trial against the bots).
# Five single-round competitions with labyrint=0; each one reloads the
# checkpoint written by the previous iteration.
for i in range(5):
    winner = run_competition(device=device, batch_size=500, sync_time=1000, gamma=1, games_per_match = 20, model_path=model_path, num_rounds=1, labyrint=0)
    torch.save(winner.training_model, model_path)
    model = torch.load(model_path)

Сохраняем бота для дальнейшей загрузки

In [None]:
# Serialise the winner's weights: pickle -> bz2 -> base64 (a bytes object).
state_dump = base64.b64encode(bz2.compress(pickle.dumps(winner.training_model.state_dict())))
with open('WINNER_state_dict.txt', 'w') as fin:
    # str() of bytes writes the repr, i.e. the text is wrapped in b'...';
    # the submission cell strips that wrapper again with [2:-1].
    fin.write(str(state_dump))

In [None]:
# Demo game: four non-training copies of the saved model play each other.
model = torch.load(model_path)

# Every network is loaded separately so that no two agents share a model object.
training_model1 = torch.load(model_path)
target_model1 = torch.load(model_path)
training_model2 = torch.load(model_path)
target_model2 = torch.load(model_path)
training_model3 = torch.load(model_path)
target_model3 = torch.load(model_path)
training_model4 = torch.load(model_path)
target_model4 = torch.load(model_path)
agents = [
    TrainingAgent(training_model=training_model1, target_model=target_model1, is_train = False),
    TrainingAgent(training_model=training_model2, target_model=target_model2, is_train = False),
    TrainingAgent(training_model=training_model3, target_model=target_model3, is_train = False),
    TrainingAgent(training_model=training_model4, target_model=target_model4, is_train = False),
    ]

# One verbose game with labyrint=10, then the mean reward of every agent is recorded.
rewards = {agent: [] for agent in agents}
env = Env(agents, labyrint=10)
print("GAME START")
env.game_start(verbose=True)
print("GAME END")
for agent in agents:
    rewards[agent].append(get_mean_reward(agent))

# PRODUCTION

Код submission.py отправляется на сервер.

<b>Внимание.</b> Перед отправкой вручную вставьте содержимое файла <code>WINNER_state_dict.txt</code> в переменную <code>state_dict_dump</code>

In [None]:
%%writefile submission.py

import numpy as np
import pandas as pd
import pickle
import base64
import bz2

import torch
import torch.nn as nn
import torch.nn.functional as F

import copy
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, row_col

# Board dimensions used when rebuilding the board from an observation.
N_ROWS = 7
N_COLS = 11
N_CELL = N_ROWS * N_COLS
max_moves = 200

# The position in this list is the action index used by the model and the agent.
ACTIONS = [
    Action.NORTH.name,
    Action.WEST.name,
    Action.SOUTH.name,
    Action.EAST.name,
]
# Up, left, down, right

class DuelingDQN(nn.Module):
    """Dueling Q-network with separate fully connected value and advantage heads.

    Both heads flatten the input (everything after the batch dimension) and
    pass it through one hidden layer of half the input size.

    Args:
        actions: sequence of actions; its length sets the advantage head width.
        input_shape: shape of one input sample; its product is the number of
            input features.
    """

    def __init__(self, actions, input_shape=(4, 4, 5)):
        super().__init__()
        self.actions = actions

        n_inputs = int(np.prod(input_shape))
        n_hidden = n_inputs // 2
        # Advantages A(s, a)
        self.fc_a = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_inputs, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, len(actions)),
        )
        # Values of state V(s)
        self.fc_v = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_inputs, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 1),
        )

    def forward(self, state):
        """Return Q-values of shape ``(batch, len(actions))`` for ``state``."""
        val = self.fc_v(state)
        adv = self.fc_a(state)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
        # The mean is taken per sample over the action axis; a global mean
        # would make each sample's Q-values depend on the rest of the batch.
        # For a batch of one both forms give the same numbers.
        return val + adv - adv.mean(dim=1, keepdim=True)


class Agent:
    """Inference-only agent: turns a Hungry Geese observation into a move name.

    Args:
        model: callable network returning Q-values for a ``(1, 1, 4, 4, 5)``
            float tensor.
        actions: action names indexed by action id
            (['NORTH', 'WEST', 'SOUTH', 'EAST']).
    """

    def __init__(self, model, actions):
        self.model = model
        self.actions = actions
        # Index of the move that would reverse the previous one; None until
        # the first move has been made.
        self.prev_opp_action = None

    def get_action(self, obs_dict, config_dict):
        """
            Build the network input from the observation, evaluate the model
            and return the name of the best non-reversing action.
        """
        board = self.process_state(obs_dict, config_dict)
        state = self.edit_last_player_move(board, self.prev_opp_action)
        state_t = torch.FloatTensor(state).view(1, 1, 4, 4, 5)
        q_values = self.model(state_t)[0].detach().numpy()
        action_id = self._pick_action_id(q_values)
        self.prev_opp_action = self.get_opposite_idx_move(action_id)
        return self.actions[action_id]

    def _pick_action_id(self, q_values):
        """Return the index of the highest Q-value, excluding the reverse move.

        ``q_values`` is modified in place.
        """
        # `is not None` rather than truthiness: index 0 (NORTH) is a valid
        # reverse move and must be masked as well.
        if self.prev_opp_action is not None:
            q_values[self.prev_opp_action] = -np.inf
        action_id = int(np.argmax(q_values))
        # Safety net in case the masked entry still wins (e.g. NaN Q-values).
        while action_id == self.prev_opp_action:
            action_id = (action_id + 1) % 4
        return action_id

    def process_state(self, obs_dict, config_dict):
        """Rebuild the board from the observation and centre it on the own head."""
        observation = Observation(obs_dict)
        configuration = Configuration(config_dict)
        player_index = observation.index
        b_state = {
            'blank': 0,  # Empty cell
            'x': -1,  # Head of opponent player
            'h': 1,  # Head of current player
            'b': -2,  # Body cell
            'f': 2,  # Food
        }
        # Build the (empty) board; 'blank' is 0, so zeros are already correct.
        board = np.zeros((N_ROWS, N_COLS), dtype=int)
        # Geese: mark every cell as body first, then overwrite the heads.
        geese = observation.geese
        for goose in geese:
            for cell in goose:
                row, col = row_col(cell, configuration.columns)
                board[row][col] = b_state['b']
        for i, goose in enumerate(geese):
            if len(goose) <= 0:
                # Goose with no cells left has no head to draw.
                continue
            head_row, head_col = row_col(goose[0], configuration.columns)
            board[head_row][head_col] = b_state['h'] if player_index == i else b_state['x']
        # Food
        for food in observation.food:
            food_row, food_col = row_col(food, configuration.columns)
            board[food_row][food_col] = b_state['f']
        return self.centroid_agent(board, geese[player_index][0], configuration)

    def get_opposite_idx_move(self, idx):
        """Return the index of the move opposite to ``idx``."""
        # ['NORTH', 'WEST', 'SOUTH', 'EAST']
        return (idx + 2) % 4

    def edit_last_player_move(self, board, idx_opposite):
        """Block the reverse move and cut the board into four 4x5 views.

        The cell next to the centre ``[3, 5]`` in direction ``idx_opposite`` is
        overwritten with -2 (``board`` is modified in place). The result stacks
        one window per direction, in the order NORTH, WEST, SOUTH, EAST; in
        every window the centre cell ends up at ``[3][2]``.

        Returns:
            Float array of shape ``(4, 4, 5)``.
        """
        # ['NORTH', 'WEST', 'SOUTH', 'EAST']
        # CENTER BOARD [3, 5]
        # [ 2,4 2,5 2,6
        #   3,4 3,5 3,6
        #   4,4 4,5 4,6]
        neighbours = {0: (2, 5), 1: (3, 4), 2: (4, 5), 3: (3, 6)}
        blocked = neighbours.get(idx_opposite)
        if blocked is not None:
            board[blocked] = -2
        # board_rot[i][j] == board[5 - j][2 + i]: rows 1..5 and columns 2..8,
        # flipped vertically and transposed, giving a (7, 5) array.
        board_rot = board[1:6, 2:9][::-1].T
        small_boards = np.stack([
            board[:4, 3:8],
            board_rot[:4],
            board[3:, 3:8][::-1, ::-1],
            board_rot[3:][::-1, ::-1],
        ])
        return small_boards.astype(float)

    def centroid_agent(self, board, head, configuration):
        """Cyclically shift ``board`` so that the cell ``head`` lands on ``[3, 5]``."""
        head_row, head_col = row_col(head, configuration.columns)
        # X is [0, 11), center is 5
        # Y is [0, 7), center is 3
        dX = head_col - 5
        dY = 3 - head_row
        # Row shift by dY (wraps around the top/bottom edge)
        if dY != 0:
            board = np.vstack((board[-dY:], board[:-dY]))
        # Column shift by dX (wraps around the left/right edge)
        if dX != 0:
            board = np.hstack((board[:, dX:], board[:, :dX]))
        return board

    def agent(self, obs_dict, config_dict):
        """Alias of :meth:`get_action`."""
        return self.get_action(obs_dict, config_dict)

    def __call__(self, obs_dict, config_dict, *args, **kwargs):
        """Make the instance callable; extra arguments are ignored."""
        return self.get_action(obs_dict, config_dict)

# Restore the trained weights and build the agent used by `agent()` below.
with open('WINNER_state_dict.txt', 'r') as f:
    # The file holds str(bytes), i.e. b'...'; [2:-1] strips the b' prefix
    # and the closing quote, leaving the bare base64 text.
    state_dict_dump = f.read()[2:-1]
    state_dict_dump = bytes(state_dict_dump, 'utf-8')
# base64 -> bz2 -> pickle, the inverse of how the dump was produced.
# NOTE(review): pickle.loads executes arbitrary code - only load trusted dumps.
state_dict = pickle.loads(bz2.decompress(base64.b64decode(state_dict_dump)))
model = DuelingDQN(actions=ACTIONS)
model.load_state_dict(state_dict)
# Inference mode.
model.eval()
myAgent = Agent(model=model, actions=ACTIONS)

def agent(obs_dict, config_dict):
    """Module-level entry point: delegate to the shared ``myAgent`` instance.

    Returns the action name chosen by ``myAgent`` for this observation.
    """
    return myAgent(obs_dict, config_dict)

In [None]:
%run submission.py

## Обученный GreedyAgent (статический бот)

In [None]:
%%writefile greedy_agent.py
import numpy as np
import pandas as pd
import pickle
import base64
import bz2

import torch
import torch.nn as nn
import torch.nn.functional as F

import copy
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, row_col

# Board dimensions used when rebuilding the board from an observation.
N_ROWS = 7
N_COLS = 11
N_CELL = N_ROWS * N_COLS
max_moves = 200


# Action order used throughout: up, left, down, right

class GreedyAgent:
    """Rule-based bot: scores the four moves around its head and takes the best.

    The observation is converted into a board centred on the bot's own head
    (cell ``[3, 5]``) with the codes 2 = food, 1 = own head, 0 = empty,
    -1 = opponent head, -2 = body of any player.

    Move indices follow ``self.ACTIONS``: 0 = NORTH, 1 = WEST, 2 = SOUTH, 3 = EAST.
    """

    # For every move index: (cell the head would enter, cell one step further).
    #         1,5
    #     2,4 2,5 2,6
    # 3,3 3,4 3,5 3,6 3,7
    #     4,4 4,5 4,6
    #         5,5
    _TARGETS = (
        ((2, 5), (1, 5)),  # NORTH
        ((3, 4), (3, 3)),  # WEST
        ((4, 5), (5, 5)),  # SOUTH
        ((3, 6), (3, 7)),  # EAST
    )
    # Diagonal neighbours of the head and the two moves that approach each one.
    _CORNERS = (
        ((2, 4), (0, 1)),
        ((2, 6), (0, 3)),
        ((4, 4), (2, 1)),
        ((4, 6), (2, 3)),
    )

    def __init__(self):
        # Index of the move that would reverse the previous one; -1 = no move yet.
        self.prev_opp_action = -1
        self.ACTIONS = (
            Action.NORTH.name,
            Action.WEST.name,
            Action.SOUTH.name,
            Action.EAST.name,
        )

    def get_action(self, obs_dict, config_dict):
        """Return the action name chosen for the given observation."""
        state = self.process_state(obs_dict, config_dict)
        return self._choose_move(state)

    def _choose_move(self, state):
        """Score the four moves on a head-centred board and return the best name.

        Also stores the opposite of the chosen move in ``prev_opp_action``.
        """
        # Base score of a move is the content of the cell it enters.
        scores = [state[cell] for cell, _ in self._TARGETS]

        def is_safe(move):
            cell, beyond = self._TARGETS[move]
            return (
                state[cell] >= 0
                and state[beyond] != -1
                and self.prev_opp_action != move
            )

        # A positive value on a diagonal rewards every safe move towards it.
        for corner, moves in self._CORNERS:
            if state[corner] > 0:
                for move in moves:
                    if is_safe(move):
                        scores[move] += 1

        for move, (cell, beyond) in enumerate(self._TARGETS):
            # Entering an occupied cell is heavily penalised ...
            if state[cell] < 0:
                scores[move] -= 20
            # ... and so is stepping next to an opponent head.
            if state[beyond] == -1:
                scores[move] -= 10

        # Never reverse. The guard matters on the first move: with the initial
        # value -1 an unguarded assignment would hit scores[-1] and ban EAST.
        if self.prev_opp_action >= 0:
            scores[self.prev_opp_action] = -100
        idx = int(np.argmax(scores))
        # Safety net: step to the next move if the reverse one was still picked.
        while idx == self.prev_opp_action:
            idx = (idx + 1) % 4
        # Remember the last move (as the index of its opposite).
        self.prev_opp_action = self.get_opposite_idx_move(idx)
        return self.ACTIONS[idx]

    def get_opposite_idx_move(self, idx):
        """Return the index of the move opposite to ``idx``."""
        # ['NORTH', 'WEST', 'SOUTH', 'EAST']
        return (idx + 2) % 4

    def process_state(self, obs_dict, config_dict):
        """Rebuild the board from the observation and centre it on the own head."""
        observation = Observation(obs_dict)
        configuration = Configuration(config_dict)
        player_index = observation.index
        b_state = {
            'blank': 0,  # Empty cell
            'x': -1,  # Head of opponent player
            'h': 1,  # Head of current player
            'b': -2,  # Body cell
            'f': 2,  # Food
        }
        # Build the (empty) board; 'blank' is 0, so zeros are already correct.
        board = np.zeros((N_ROWS, N_COLS), dtype=int)
        # Geese: mark every cell as body first, then overwrite the heads.
        geese = observation.geese
        for goose in geese:
            for cell in goose:
                row, col = row_col(cell, configuration.columns)
                board[row][col] = b_state['b']
        for i, goose in enumerate(geese):
            if len(goose) <= 0:
                # Goose with no cells left has no head to draw.
                continue
            head_row, head_col = row_col(goose[0], configuration.columns)
            board[head_row][head_col] = b_state['h'] if player_index == i else b_state['x']
        # Food
        for food in observation.food:
            food_row, food_col = row_col(food, configuration.columns)
            board[food_row][food_col] = b_state['f']
        return self.centroid_agent(board, geese[player_index][0], configuration)

    def centroid_agent(self, board, head, configuration):
        """Cyclically shift ``board`` so that the cell ``head`` lands on ``[3, 5]``."""
        head_row, head_col = row_col(head, configuration.columns)
        # X is [0, 11), center is 5
        # Y is [0, 7), center is 3
        dX = head_col - 5
        dY = 3 - head_row
        # Row shift by dY (wraps around the top/bottom edge)
        if dY != 0:
            board = np.vstack((board[-dY:], board[:-dY]))
        # Column shift by dX (wraps around the left/right edge)
        if dX != 0:
            board = np.hstack((board[:, dX:], board[:, :dX]))
        return board

    def __call__(self, obs_dict, config_dict, *args, **kwargs):
        """Make the instance callable; extra arguments are ignored."""
        return self.get_action(obs_dict, config_dict)

# Single shared bot instance: it keeps the previous move between calls.
myAgent = GreedyAgent()

def agent(obs_dict, config_dict):
    """Module-level entry point: delegate to the shared ``myAgent`` instance."""
    return myAgent(obs_dict, config_dict)

In [None]:
%run greedy_agent.py

Запуск тестовой игры

In [None]:
from kaggle_environments import make
# Create the official Hungry Geese environment in debug mode.
env = make("hungry_geese", debug=True)

env.reset()
# Play one game in which all four geese are driven by submission.py
env.run(['submission.py', 'submission.py', 'submission.py', 'submission.py'])
# Show the replay inline in the notebook.
env.render(mode="ipython", width=800, height=700)