In [1]:
import numpy as np
from environment.gobang import WrapperGoBang
from agent import Agent
import random
import os
from loguru import logger

In [2]:
class GoBangRandomAgent(Agent):
    def step(self, obs, explore=True, legal_actions=None):
        return {'action': random.choice(legal_actions)}

In [3]:
def do_exp(agent, gobang_n, epochs, test_n, test_freq, result_path):
    random_agent = GoBangRandomAgent(None)
    loss_episode = []
    reward_episode = []
    win_ratio = []
    best_win_ratio = -np.inf
    env = WrapperGoBang(gobang_n)
    for i in range(epochs):
        loss_list, reward_sum = agent.train(env)
        logger.info(f'Episode {i}/{epochs}, average loss: {np.mean(loss_list) if loss_list else -1}, reward sum: {reward_sum}')
        if loss_list:
            loss_episode.append(np.array([i, np.mean(loss_list)]))
            reward_episode.append(np.array([i, reward_sum]))
        
        if i % test_freq == 0:
            win = 0
            for _ in range(test_n):
                win += agent.test(env, random_agent, display=False)
            logger.info(f'Episode {i}/{epochs}, win {win}/{test_n}')
            ratio = 1.0 * win / test_n
            win_ratio.append(np.array([i, ratio]))
            if ratio > best_win_ratio:
                agent.save()
    np.save(os.path.join(result_path, 'loss.npy'), np.array(loss_episode))
    np.save(os.path.join(result_path, 'reward.npy'), np.array(reward_episode))
    np.save(os.path.join(result_path, 'ratio.npy'), np.array(win_ratio))    
    

In [4]:
from agent import DQNAgent
from agent.model import ConvQModel
from agent.configuration import DQNConfig
from agent.exploration import LinearExploration
from agent.memory import PriorityMemory

In [5]:
eval_model = ConvQModel(
    in_channels=2,
    in_h=7,
    in_w=7,
    hidden_channels=32,
    output_dim=49,
    dueling=True
)
target_model = ConvQModel(
    in_channels=2,
    in_h=7,
    in_w=7,
    hidden_channels=32,
    output_dim=49,
    dueling=True
)
exploration = LinearExploration(
    init_epsilon=1.0,
    min_epsilon=0.1,
    epsilon_decay=0.995
)
config = DQNConfig(
    eval_model=eval_model,
    target_model=target_model,
    ckpt_path='./dueling_ddqn_p_gobang_n7.ckpt',
    batch_size=32,
    lr=0.001,
    gamma=0.99,
    target_replace_frequency=100,
    capacity=5000,
    max_grad_norm=1.0,
    exploration=exploration,
    ddqn=True,
    memory=PriorityMemory(capacity=5000)
)
agent = DQNAgent(config)
do_exp(agent, 7, 5000, 50, 50, './experiment/dueling_ddqn_p_gobang_n7')

[32m2023-05-05 05:15:48.654[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m10[0m - [1mEpisode 0/5000, average loss: 0.005114840668818654, reward sum: -48[0m
[32m2023-05-05 05:16:00.113[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m19[0m - [1mEpisode 0/5000, win 25/50[0m
[32m2023-05-05 05:16:01.361[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m10[0m - [1mEpisode 1/5000, average loss: 0.0017512259598701676, reward sum: -40[0m
[32m2023-05-05 05:16:02.651[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m10[0m - [1mEpisode 2/5000, average loss: 0.0015012598606808734, reward sum: -48.2[0m
[32m2023-05-05 05:16:03.594[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m10[0m - [1mEpisode 3/5000, average loss: 0.0021279856634690156, reward sum: -33[0m
[32m2023-05-05 05:16:04.297[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m10[0m - [1mEpisode 4/5000, average loss: 0.000573556

In [6]:
eval_model = ConvQModel(
    in_channels=2,
    in_h=11,
    in_w=11,
    hidden_channels=32,
    output_dim=121,
    dueling=True
)
target_model = ConvQModel(
    in_channels=2,
    in_h=11,
    in_w=11,
    hidden_channels=32,
    output_dim=121,
    dueling=True
)
exploration = LinearExploration(
    init_epsilon=1.0,
    min_epsilon=0.1,
    epsilon_decay=0.995
)
config = DQNConfig(
    eval_model=eval_model,
    target_model=target_model,
    ckpt_path='./dueling_ddqn_p_gobang_n11.ckpt',
    batch_size=32,
    lr=0.001,
    gamma=0.99,
    target_replace_frequency=100,
    capacity=5000,
    max_grad_norm=1.0,
    exploration=exploration,
    ddqn=True
)
agent = DQNAgent(config)
do_exp(agent, 11, 5000, 50, 50, './experiment/dueling_ddqn_p_gobang_n11')

[32m2023-05-05 07:17:04.663[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m10[0m - [1mEpisode 0/5000, average loss: 0.047772567418524176, reward sum: -87[0m
[32m2023-05-05 07:17:59.797[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m19[0m - [1mEpisode 0/5000, win 17/50[0m
[32m2023-05-05 07:18:01.908[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m10[0m - [1mEpisode 1/5000, average loss: 0.04840002902371109, reward sum: -65[0m
[32m2023-05-05 07:18:03.853[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m10[0m - [1mEpisode 2/5000, average loss: 0.02156633467560155, reward sum: -52[0m
[32m2023-05-05 07:18:07.379[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m10[0m - [1mEpisode 3/5000, average loss: 0.03406061205167688, reward sum: -85[0m
[32m2023-05-05 07:18:10.373[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_exp[0m:[36m10[0m - [1mEpisode 4/5000, average loss: 0.03346347081076185