In [11]:
from kaggle_environments import evaluate, make, utils
from random import choice
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque
import random
import gym
from agents_utils import alphabeta_agent, block_check_agent, create_dql_agent, create_dueling_dql_agent

In [12]:
class ConnectFourGym(gym.Env):
    def __init__(self, agent2=None):
        super(ConnectFourGym, self).__init__()
        
        ks_env = make("connectx", debug=True)
        self.env = ks_env.train([None, agent2])
        self.rows = ks_env.configuration.rows
        self.columns = ks_env.configuration.columns
        self.inarow = ks_env.configuration.inarow

        # Action and observation space
        self.action_space = gym.spaces.Discrete(self.columns)
        self.observation_space = gym.spaces.Box(low=0, high=2, 
                                                shape=(1, self.rows, self.columns), dtype=np.int32)

        self.reward_range = (-10, 1)
        self.spec = None
        self.metadata = None

    def reset(self):
        self.obs = self.env.reset()
        board = np.array(self.obs['board']).reshape(1, self.rows, self.columns)
        return board.astype(np.float32)  

    def step(self, action):
        # Check if the move is valid 
        is_valid = self.obs['board'][action] == 0
        if is_valid:
            self.obs, old_reward, done, _ = self.env.step(action)
            reward = self._custom_reward(old_reward, done)
        else:
            # Penalize invalid moves
            reward, done, _ = -10, True, {}

        board = np.array(self.obs['board']).reshape(1, self.rows, self.columns).astype(np.float32)
        return board, reward, done, _

    def _custom_reward(self, old_reward, done):
        if old_reward == 1:      # Win
            return 1.0
        elif done:               # Loss
            return -1.0
        else:                    # Neutral move
            return 1.0 / (self.rows * self.columns)

In [13]:
class ActorCritic(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(ActorCritic, self).__init__()
        c, h, w = input_shape

        self.conv1 = nn.Conv2d(c, 64, kernel_size=4, stride=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=2, stride=1)
        conv_out_size = self._get_conv_output(c, h, w)

        self.fc_shared = nn.Linear(conv_out_size, 128)
        self.dropout = nn.Dropout(0.25)

        self.actor = nn.Linear(128, n_actions)   
        self.critic = nn.Linear(128, 1)            

    def _get_conv_output(self, c, h, w):
        x = torch.zeros(1, c, h, w)
        x = self.conv1(x)
        x = self.conv2(x)
        return int(np.prod(x.size()))

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc_shared(x))
        x = self.dropout(x)

        logits = self.actor(x)
        value = self.critic(x)
        return logits, value

class PPOAgent:
    def __init__(self, env, input_shape, n_actions, gamma=0.99, lr=0.0003, clip_eps=0.2, epochs=4, batch_size=64):
        self.env = env
        self.gamma = gamma
        self.clip_eps = clip_eps
        self.epochs = epochs
        self.batch_size = batch_size

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = ActorCritic(input_shape, n_actions).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        logits, value = self.model(state)
        dist = torch.distributions.Categorical(logits=logits)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action.item(), log_prob.item(), value.item()

    def compute_returns_advantages(self, rewards, dones, values, next_value):
        returns = []
        R = next_value
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                R = 0
            R = reward + self.gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns, dtype=torch.float32).to(self.device)
        values = torch.tensor(values, dtype=torch.float32).to(self.device)
        advantages = returns - values
        return returns, advantages

    def update(self, obs, actions, log_probs, returns, advantages):
        obs = torch.tensor(obs, dtype=torch.float32).to(self.device)
        actions = torch.tensor(actions).to(self.device)
        old_log_probs = torch.tensor(log_probs, dtype=torch.float32).to(self.device)
        returns = returns.to(self.device)
        advantages = advantages.to(self.device)

        for _ in range(self.epochs):
            logits, values = self.model(obs)
            dist = torch.distributions.Categorical(logits=logits)
            new_log_probs = dist.log_prob(actions)

            ratio = torch.exp(new_log_probs - old_log_probs)
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1.0 - self.clip_eps, 1.0 + self.clip_eps) * advantages

            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = F.mse_loss(values.squeeze(), returns)
            loss = actor_loss + 0.5 * critic_loss

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def train(self, total_timesteps=500_000, rollout_steps=2048):
        state = self.env.reset()
        episode_rewards = []
        total_steps = 0
        episode = 0

        while total_steps < total_timesteps:
            obs_list, action_list, logprob_list, reward_list, done_list, value_list = [], [], [], [], [], []

            for _ in range(rollout_steps):
                action, log_prob, value = self.select_action(state)
                next_state, reward, done, _ = self.env.step(action)

                obs_list.append(state)
                action_list.append(action)
                logprob_list.append(log_prob)
                reward_list.append(reward)
                done_list.append(done)
                value_list.append(value)

                state = next_state
                total_steps += 1

                if done:
                    episode_reward = sum(reward_list)
                    episode += 1
                    print(f"Episode {episode}, Reward: {episode_reward}")
                    episode_rewards.append(episode_reward)
                    state = self.env.reset()

            # Bootstrap value
            with torch.no_grad():
                _, next_value = self.model(torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device))

            returns, advantages = self.compute_returns_advantages(reward_list, done_list, value_list, next_value.item())

            self.update(obs_list, action_list, logprob_list, returns, advantages)

        return episode_rewards

In [14]:
env = ConnectFourGym(agent2=block_check_agent)

input_shape = (1, env.rows, env.columns)   
n_actions = env.columns                    

agent = PPOAgent(env, input_shape, n_actions)

rewards = agent.train(total_timesteps=2_000_000)

Episode 1, Reward: -0.8809523809523809
Episode 2, Reward: 0.19047619047619047
Episode 3, Reward: -0.6428571428571429
Episode 4, Reward: -1.4761904761904763
Episode 5, Reward: -2.2142857142857144
Episode 6, Reward: -0.9523809523809524
Episode 7, Reward: -1.880952380952381
Episode 8, Reward: -2.619047619047619
Episode 9, Reward: -3.380952380952381
Episode 10, Reward: -4.285714285714286
Episode 11, Reward: -5.0476190476190474
Episode 12, Reward: -5.809523809523809
Episode 13, Reward: -6.523809523809524
Episode 14, Reward: -16.19047619047619
Episode 15, Reward: -25.952380952380953
Episode 16, Reward: -26.833333333333336
Episode 17, Reward: -27.595238095238095
Episode 18, Reward: -28.42857142857143
Episode 19, Reward: -38.214285714285715
Episode 20, Reward: -39.0
Episode 21, Reward: -39.85714285714286
Episode 22, Reward: -40.69047619047619
Episode 23, Reward: -41.5
Episode 24, Reward: -42.30952380952381
Episode 25, Reward: -43.04761904761905
Episode 26, Reward: -43.833333333333336
Episode 2

KeyboardInterrupt: 

In [4]:
def create_ppo_agent(model_path, input_shape=(1, 6, 7), n_actions=7):
    model = ActorCritic(input_shape, n_actions)
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()

    def agent(obs, config):
        board = np.array(obs['board']).reshape(1, config.rows, config.columns)
        board_tensor = torch.tensor(board, dtype=torch.float32).unsqueeze(0)

        with torch.no_grad():
            logits, _ = model(board_tensor)
            probs = torch.softmax(logits, dim=-1).squeeze()

            
            for col in range(config.columns):
                if obs['board'][col] != 0:
                    probs[col] = 0.0

            if probs.sum() == 0:
                
                action = random.choice([c for c in range(config.columns) if obs['board'][c] == 0])
            else:
                probs = probs / probs.sum()
                action = int(torch.multinomial(probs, 1).item())
        return action

    return agent


In [None]:
torch.save(agent.model.state_dict(), "./trained_models/PPO/ppo_connect4_model_1.pt")

: 

In [7]:
def get_win_percentages(agent1, agent2, n_rounds=100):
    config = {'rows': 6, 'columns': 7, 'inarow': 4}
    
    # Agent 1 goes first half the time
    outcomes = evaluate("connectx", [agent1, agent2], config, [], n_rounds//2)
    
    # Agent 2 goes first the other half 
    outcomes += [[b,a] for [a,b] in evaluate("connectx", [agent2, agent1], config, [], n_rounds-n_rounds//2)]

    total = len(outcomes)
    agent1_wins = outcomes.count([1, -1])
    agent2_wins = outcomes.count([-1, 1])
    ties = outcomes.count([0, 0])
    invalid_1 = outcomes.count([None, 0])
    invalid_2 = outcomes.count([0, None])

    print(f"Total games: {total}")
    print(f"Agent 1 Win Percentage: {agent1_wins / total:.2%}")
    print(f"Agent 2 Win Percentage: {agent2_wins / total:.2%}")
    print(f"Tie Percentage: {ties / total:.2%}")
    print(f"Invalid plays by Agent 1: {invalid_1}")
    print(f"Invalid plays by Agent 2: {invalid_2}")

In [6]:

double_dql_agent_b1 = create_dql_agent("./trained_models/double_dql_blockcheck_1.pt")
double_dql_agent_b1e = create_dql_agent("./trained_models/double_dql_blockcheck_1e.pt")
dueling_dql_agent_1 = create_dueling_dql_agent("./trained_models/dueling_dql_blockcheck_1.pt")
double_dql_agent_b1f = create_dql_agent("./trained_models/double_dql_blockcheck_1f.pt")
double_dql_agent_b1g = create_dql_agent("./trained_models/double_dql_b1g_1M_episodes.pt")
ppo_agent_1 = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1.pt")

In [10]:
get_win_percentages("negamax", ppo_agent_1, n_rounds=200)

Total games: 200
Agent 1 Win Percentage: 95.00%
Agent 2 Win Percentage: 3.50%
Tie Percentage: 1.50%
Invalid plays by Agent 1: 0
Invalid plays by Agent 2: 0
