In [1]:
from kaggle_environments import evaluate, make, utils
from random import choice
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque
import random
import gym
from agents_utils import alphabeta_agent, block_check_agent

No pygame installed, ignoring import


In [2]:
def mean_reward(rewards):
    """Average the first entry of every item in ``rewards``.

    Args:
        rewards: Sized collection of indexable items. Only element ``[0]`` of
            each item is read; in this notebook the items are the per-episode
            reward lists produced by ``evaluate``.

    Returns:
        float: Sum of the first entries divided by the number of items.

    Raises:
        ZeroDivisionError: If ``rewards`` is empty.
    """
    first_entry_total = 0
    for episode_rewards in rewards:
        first_entry_total += episode_rewards[0]
    return first_entry_total / float(len(rewards))


In [4]:
class ConnectFourGym(gym.Env):
    """Gym wrapper around the Kaggle ``connectx`` environment.

    The learning agent occupies the ``None`` slot of the trainer list and
    ``agent2`` occupies the other slot. Observations are the board converted
    to a ``float32`` array of shape ``(1, rows, columns)``.

    Args:
        agent2: Opponent passed to ``ks_env.train``; defaults to
            ``alphabeta_agent``.
    """

    def __init__(self, agent2=alphabeta_agent):
        super(ConnectFourGym, self).__init__()

        ks_env = make("connectx", debug=True)
        self.env = ks_env.train([None, agent2])
        self.rows = ks_env.configuration.rows
        self.columns = ks_env.configuration.columns
        self.inarow = ks_env.configuration.inarow

        # Action and observation space
        self.action_space = gym.spaces.Discrete(self.columns)
        # dtype is float32 so the declared space matches the float32 boards
        # that reset() and step() actually return.
        self.observation_space = gym.spaces.Box(low=0, high=2,
                                                shape=(1, self.rows, self.columns), dtype=np.float32)

        # -10 is the invalid-move penalty issued by step(); 1 is the win reward.
        self.reward_range = (-10, 1)
        self.spec = None
        self.metadata = None

    def _board_observation(self):
        """Return the current board as a ``(1, rows, columns)`` float32 array."""
        board = np.array(self.obs['board']).reshape(1, self.rows, self.columns)
        return board.astype(np.float32)

    def reset(self):
        """Start a new game and return the initial board observation."""
        self.obs = self.env.reset()
        return self._board_observation()

    def step(self, action):
        """Play ``action`` (a column index) and return ``(board, reward, done, info)``.

        A move is valid when ``action`` is a column index in
        ``[0, columns)`` and the board cell at that flat index is 0. An
        invalid move is not forwarded to the wrapped environment: the episode
        ends immediately with reward -10 and the board is returned unchanged.
        """
        # The board is a flat list, so the range check is required: without it
        # a negative or too-large index would silently test some other cell.
        in_range = 0 <= action < self.columns
        is_valid = in_range and self.obs['board'][action] == 0
        if is_valid:
            self.obs, old_reward, done, info = self.env.step(action)
            reward = self._custom_reward(old_reward, done)
        else:
            # Penalize invalid moves
            reward, done, info = -10, True, {}

        return self._board_observation(), reward, done, info

    def _custom_reward(self, old_reward, done):
        """Map the wrapped environment's reward to the training reward.

        Returns 1.0 when ``old_reward`` is 1, -1.0 for any other finished
        game, and ``1 / (rows * columns)`` for a move that does not end the
        game.
        """
        if old_reward == 1:      # Win
            return 1.0
        elif done:               # Loss
            # NOTE(review): every finished game whose reward is not 1 lands
            # here, so a drawn game (if the environment reports one with a
            # non-1 reward) is scored like a loss - confirm this is intended.
            return -1.0
        else:                    # Neutral move
            return 1.0 / (self.rows * self.columns)

In [5]:
class DQNCNN(nn.Module):
    """Small convolutional Q-network: two conv layers followed by two linear layers.

    Args:
        input_shape: ``(channels, height, width)`` of one observation.
        n_actions: Number of Q-values produced per observation.
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()
        channels, height, width = input_shape  # e.g. (1, 6, 7)

        # Attribute names and creation order are kept stable: they define the
        # state_dict keys used by saved checkpoints.
        self.conv1 = nn.Conv2d(channels, 64, kernel_size=4, stride=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=2, stride=1)
        flat_features = self._get_conv_output_size(channels, height, width)
        self.fc1 = nn.Linear(flat_features, 128)
        self.fc2 = nn.Linear(128, n_actions)
        self.dropout = nn.Dropout(0.25)

    def _get_conv_output_size(self, c, h, w):
        """Return the element count of the conv stack's output for one ``(c, h, w)`` input."""
        probe = torch.zeros(1, c, h, w)
        probe = self.conv2(self.conv1(probe))
        return int(np.prod(probe.size()))

    def forward(self, x):
        """Compute Q-values of shape ``[B, n_actions]`` for a batch ``x`` of boards."""
        features = F.relu(self.conv1(x))          # [B, 64, H-3, W-3]
        features = F.relu(self.conv2(features))   # [B, 128, H-4, W-4]
        flat = features.view(features.size(0), -1)
        hidden = self.dropout(F.relu(self.fc1(flat)))
        return self.fc2(hidden)

In [6]:
class ReplayBuffer:
    """Bounded store of ``(state, action, reward, next_state, done)`` transitions."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry when a new one is
        # appended at full capacity.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions at random.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where the
            states and next states are stacked into NumPy arrays and the
            other three fields are tuples.
        """
        batch = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one tuple per field.
        columns = tuple(zip(*batch))
        states, actions, rewards, next_states, dones = columns
        stacked_states = np.array(states)
        stacked_next_states = np.array(next_states)
        return (stacked_states, actions, rewards, stacked_next_states, dones)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [21]:
import torch.optim as optim

def train_double_dql(env, episodes=1000, gamma=0.99, epsilon=1.0, epsilon_decay=0.999,
                     min_epsilon=0.01, batch_size=64, buffer_size=10_000, target_update=10,
                     model_save_path="./models/double_dql_minimax_1.pt"):
    """Train a ``DQNCNN`` policy on ``env`` with Double DQN and save its weights.

    Args:
        env: Environment exposing ``rows``, ``columns``, ``reset()`` and
            ``step(action) -> (next_state, reward, done, info)``.
        episodes: Number of episodes to play.
        gamma: Discount factor used in the bootstrap target.
        epsilon: Initial exploration probability for epsilon-greedy selection.
        epsilon_decay: Factor applied to ``epsilon`` after every episode.
        min_epsilon: Lower bound for ``epsilon``.
        batch_size: Number of transitions per gradient step; training starts
            once the replay buffer holds at least this many transitions.
        buffer_size: Capacity of the replay buffer.
        target_update: The target network is synchronised with the policy
            network whenever ``episode % target_update == 0``.
        model_save_path: File that receives the policy network's
            ``state_dict`` after the last episode.

    Returns:
        None. Progress is printed once per episode and the weights are
        written to ``model_save_path``.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    # Create the checkpoint directory before training so a missing folder
    # cannot make torch.save fail after all episodes have already been run.
    save_dir = os.path.dirname(model_save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_shape = (1, env.rows, env.columns)
    n_actions = env.columns

    policy_net = DQNCNN(input_shape, n_actions).to(device)
    target_net = DQNCNN(input_shape, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=0.0005)
    replay_buffer = ReplayBuffer(buffer_size)

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False

        while not done:
            # ε-greedy action selection
            if random.random() < epsilon:
                action = random.randint(0, n_actions - 1)
            else:
                # The tensor is only needed for the greedy branch.
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
                with torch.no_grad():
                    q_values = policy_net(state_tensor)
                    action = q_values.argmax().item()

            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            replay_buffer.push(state, action, reward, next_state, done)
            state = next_state

            # Sample and train
            if len(replay_buffer) >= batch_size:
                states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

                states_tensor = torch.tensor(states, dtype=torch.float32).to(device)
                actions_tensor = torch.tensor(actions).unsqueeze(1).to(device)
                rewards_tensor = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1).to(device)
                next_states_tensor = torch.tensor(next_states, dtype=torch.float32).to(device)
                dones_tensor = torch.tensor(dones, dtype=torch.float32).unsqueeze(1).to(device)

                # Q(s, a)
                q_values = policy_net(states_tensor).gather(1, actions_tensor)

                # Double DQN target: the policy net chooses a', the target net scores it.
                with torch.no_grad():
                    next_actions = policy_net(next_states_tensor).argmax(1, keepdim=True)
                    next_q_values = target_net(next_states_tensor).gather(1, next_actions)

                # (1 - done) removes the bootstrap term for terminal transitions.
                target = rewards_tensor + gamma * next_q_values * (1 - dones_tensor)

                loss = F.mse_loss(q_values, target)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Decay epsilon
        epsilon = max(min_epsilon, epsilon * epsilon_decay)

        # Update target network
        if episode % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())

        print(f"Episode {episode+1}, Total reward: {total_reward}, Epsilon: {epsilon:.3f}")

    # Save trained policy
    torch.save(policy_net.state_dict(), model_save_path)


In [22]:
def create_dql_agent(model_path, input_shape=(1, 6, 7), n_actions=7):
    """Build a ConnectX agent function backed by a saved ``DQNCNN`` checkpoint.

    Args:
        model_path: Path to a ``state_dict`` file for a ``DQNCNN`` built with
            ``input_shape`` and ``n_actions``.
        input_shape: ``(channels, rows, columns)`` the network was built with.
        n_actions: Number of Q-values the network outputs.

    Returns:
        A function ``agent(obs, config)`` that returns a column index.
    """
    model = DQNCNN(input_shape, n_actions)
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from trusted sources.
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()

    def agent(obs, config):
        """Return the valid column with the highest predicted Q-value."""
        board = np.array(obs['board']).reshape(1, config.rows, config.columns)
        board_tensor = torch.tensor(board, dtype=torch.float32).unsqueeze(0)

        with torch.no_grad():
            q_values = model(board_tensor).squeeze(0)

        # A column is playable while its cell in the first row is still 0.
        valid_moves = [c for c in range(config.columns) if obs['board'][c] == 0]
        if not valid_moves:
            # Nothing playable: return the raw greedy choice and let the
            # environment reject it.
            return int(q_values.argmax().item())

        # Pick the best *valid* column. When the overall greedy column is
        # valid this is the same choice; when it is full, the next-best
        # column by Q-value is used instead of a random one.
        return max(valid_moves, key=lambda c: q_values[c].item())

    return agent

In [20]:
# Wrap previously saved checkpoints as ConnectX agents. Each call reads its
# .pt file from ./models, so those files must already exist.
dql_agent_a1 = create_dql_agent("./models/dql_alphabeta_1.pt")
dql_agent_b1 = create_dql_agent("./models/dql_blockcheck_1.pt")
dql_agent_b2 = create_dql_agent("./models/dql_blockcheck_2.pt")
dql_agent_n1 = create_dql_agent("./models/dql_negamax_1.pt")
dql_agent_n2 = create_dql_agent("./models/dql_negamax_2.pt")
double_dql_agent_b1 = create_dql_agent("./models/double_dql_blockcheck_1.pt")

In [23]:
# Train a Double DQN policy against the alpha-beta opponent. The weights are
# written to train_double_dql's default model_save_path.
env = ConnectFourGym(agent2=alphabeta_agent)
train_double_dql(env, episodes=10_000)


Episode 1, Total reward: -0.8571428571428572, Epsilon: 0.999
Episode 2, Total reward: -0.6666666666666666, Epsilon: 0.998
Episode 3, Total reward: -0.8571428571428572, Epsilon: 0.997
Episode 4, Total reward: -0.9285714285714286, Epsilon: 0.996
Episode 5, Total reward: -0.9047619047619048, Epsilon: 0.995
Episode 6, Total reward: -0.8571428571428572, Epsilon: 0.994
Episode 7, Total reward: -0.8809523809523809, Epsilon: 0.993
Episode 8, Total reward: -0.9047619047619048, Epsilon: 0.992
Episode 9, Total reward: -9.714285714285714, Epsilon: 0.991
Episode 10, Total reward: -0.8809523809523809, Epsilon: 0.990
Episode 11, Total reward: -0.7857142857142857, Epsilon: 0.989
Episode 12, Total reward: -0.7857142857142857, Epsilon: 0.988
Episode 13, Total reward: -0.8571428571428572, Epsilon: 0.987
Episode 14, Total reward: -0.8571428571428572, Epsilon: 0.986
Episode 15, Total reward: -0.8809523809523809, Epsilon: 0.985
Episode 16, Total reward: -0.9285714285714286, Epsilon: 0.984
Episode 17, Total 

In [21]:
def get_win_percentages(agent1, agent2, n_rounds=100):
    """Play ``n_rounds`` ConnectX games between two agents and print a summary.

    ``n_rounds // 2`` games are played with ``agent1`` listed first and the
    remaining games with ``agent2`` listed first; the results of the second
    batch are swapped so every outcome reads ``[agent1, agent2]``.

    Args:
        agent1: First agent (anything ``evaluate`` accepts).
        agent2: Second agent.
        n_rounds: Total number of games.

    Returns:
        None. Win, tie and invalid-play statistics are printed.
    """
    config = {'rows': 6, 'columns': 7, 'inarow': 4}
    games_agent1_first = n_rounds // 2
    games_agent2_first = n_rounds - games_agent1_first

    # Agent 1 goes first half the time
    outcomes = evaluate("connectx", [agent1, agent2], config, [], games_agent1_first)

    # Agent 2 goes first the other half; flip each pair back to [agent1, agent2]
    swapped = evaluate("connectx", [agent2, agent1], config, [], games_agent2_first)
    outcomes += [[agent1_result, agent2_result] for [agent2_result, agent1_result] in swapped]

    tally = outcomes.count
    total = len(outcomes)
    agent1_wins = tally([1, -1])
    agent2_wins = tally([-1, 1])
    ties = tally([0, 0])
    invalid_1 = tally([None, 0])
    invalid_2 = tally([0, None])

    print(f"Total games: {total}")
    print(f"Agent 1 Win Percentage: {agent1_wins / total:.2%}")
    print(f"Agent 2 Win Percentage: {agent2_wins / total:.2%}")
    print(f"Tie Percentage: {ties / total:.2%}")
    print(f"Invalid plays by Agent 1: {invalid_1}")
    print(f"Invalid plays by Agent 2: {invalid_2}")


In [28]:
# Head-to-head: the loaded Double DQN agent against block_check_agent.
get_win_percentages(
    double_dql_agent_b1,
    block_check_agent,
    n_rounds=200,
)

Total games: 200
Agent 1 Win Percentage: 17.00%
Agent 2 Win Percentage: 83.00%
Tie Percentage: 0.00%
Invalid plays by Agent 1: 0
Invalid plays by Agent 2: 0


In [27]:
# Mean first-agent reward over 100 episodes per match-up.
# NOTE(review): the first agent in both lines is double_dql_agent_b1, and the
# opponent in the second line is dql_agent_n1 (a loaded DQN agent); the
# printed labels do not name these agents directly.
print(
    "Block check agent vs Random Agent:",
    mean_reward(evaluate("connectx", [double_dql_agent_b1, "random"], num_episodes=100)),
)
print(
    "Block check agent vs Negamax Agent:",
    mean_reward(evaluate("connectx", [double_dql_agent_b1, dql_agent_n1], num_episodes=100)),
)

Block check agent vs Random Agent: 0.78
Block check agent vs Negamax Agent: -1.0
