In [1]:
from kaggle_environments import evaluate, make, utils
from random import choice
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque
import random
import gym
from agents_utils import alphabeta_agent, block_check_agent, create_dql_agent, create_dueling_dql_agent, get_win_percentages, get_win_and_move_stats
from connectx_envs import ConnectFourGym, ConnectFiveGym


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class ActorCritic(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(ActorCritic, self).__init__()
        c, h, w = input_shape

        self.conv1 = nn.Conv2d(c, 64, kernel_size=4, stride=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=2, stride=1)
        conv_out_size = self._get_conv_output(c, h, w)

        self.fc_shared = nn.Linear(conv_out_size, 128)
        self.dropout = nn.Dropout(0.25)

        self.actor = nn.Linear(128, n_actions)   
        self.critic = nn.Linear(128, 1)            

    def _get_conv_output(self, c, h, w):
        x = torch.zeros(1, c, h, w)
        x = self.conv1(x)
        x = self.conv2(x)
        return int(np.prod(x.size()))

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc_shared(x))
        x = self.dropout(x)

        logits = self.actor(x)
        value = self.critic(x)
        return logits, value

class PPOAgent:
    def __init__(self, env, input_shape, n_actions, gamma=0.99, lr=0.0005, clip_eps=0.2, epochs=8, batch_size=64):
        self.env = env
        self.gamma = gamma
        self.clip_eps = clip_eps
        self.epochs = epochs
        self.batch_size = batch_size

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = ActorCritic(input_shape, n_actions).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        logits, value = self.model(state)
        dist = torch.distributions.Categorical(logits=logits)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action.item(), log_prob.item(), value.item()

    def compute_returns_advantages(self, rewards, dones, values, next_value):
        returns = []
        R = next_value
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                R = 0
            R = reward + self.gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns, dtype=torch.float32).to(self.device)
        values = torch.tensor(values, dtype=torch.float32).to(self.device)
        advantages = returns - values
        return returns, advantages

    def update(self, obs, actions, log_probs, returns, advantages):
        obs = torch.tensor(obs, dtype=torch.float32).to(self.device)
        actions = torch.tensor(actions).to(self.device)
        old_log_probs = torch.tensor(log_probs, dtype=torch.float32).to(self.device)
        returns = returns.to(self.device)
        advantages = advantages.to(self.device)

        for _ in range(self.epochs):
            logits, values = self.model(obs)
            dist = torch.distributions.Categorical(logits=logits)
            new_log_probs = dist.log_prob(actions)

            ratio = torch.exp(new_log_probs - old_log_probs)
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1.0 - self.clip_eps, 1.0 + self.clip_eps) * advantages

            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = F.mse_loss(values.squeeze(), returns)
            loss = actor_loss + 0.5 * critic_loss

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def train(self, total_timesteps=500_000, rollout_steps=2048):
        state = self.env.reset()
        episode_rewards = []
        total_steps = 0
        episode = 0

        while total_steps < total_timesteps:
            obs_list, action_list, logprob_list, reward_list, done_list, value_list = [], [], [], [], [], []

            for _ in range(rollout_steps):
                action, log_prob, value = self.select_action(state)
                next_state, reward, done, _ = self.env.step(action)

                obs_list.append(state)
                action_list.append(action)
                logprob_list.append(log_prob)
                reward_list.append(reward)
                done_list.append(done)
                value_list.append(value)

                state = next_state
                total_steps += 1

                if done:
                    episode_reward = sum(reward_list)
                    episode += 1
                    print(f"Episode {episode}, Reward: {episode_reward}")
                    episode_rewards.append(episode_reward)
                    state = self.env.reset()

            # Bootstrap value
            with torch.no_grad():
                _, next_value = self.model(torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device))

            returns, advantages = self.compute_returns_advantages(reward_list, done_list, value_list, next_value.item())

            self.update(obs_list, action_list, logprob_list, returns, advantages)

        return episode_rewards

In [4]:
env = ConnectFourGym(agent2=block_check_agent)

input_shape = (1, env.rows, env.columns)   
n_actions = env.columns                    

agent = PPOAgent(env, input_shape, n_actions)

rewards = agent.train(total_timesteps=5_000_000)

Episode 1, Reward: -0.7142857142857143
Episode 2, Reward: -10.452380952380953
Episode 3, Reward: -11.190476190476192
Episode 4, Reward: -11.976190476190476
Episode 5, Reward: -21.666666666666668
Episode 6, Reward: -22.380952380952383
Episode 7, Reward: -23.142857142857146
Episode 8, Reward: -24.071428571428573
Episode 9, Reward: -24.78571428571429
Episode 10, Reward: -34.54761904761905
Episode 11, Reward: -35.47619047619048
Episode 12, Reward: -45.214285714285715
Episode 13, Reward: -54.952380952380956
Episode 14, Reward: -55.69047619047619
Episode 15, Reward: -56.523809523809526
Episode 16, Reward: -66.33333333333334
Episode 17, Reward: -67.14285714285714


KeyboardInterrupt: 

In [3]:
def create_ppo_agent(model_path, input_shape=(1, 6, 7), n_actions=7):
    model = ActorCritic(input_shape, n_actions)
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()

    def agent(obs, config):
        board = np.array(obs['board']).reshape(1, config.rows, config.columns)
        board_tensor = torch.tensor(board, dtype=torch.float32).unsqueeze(0)

        with torch.no_grad():
            logits, _ = model(board_tensor)
            probs = torch.softmax(logits, dim=-1).squeeze()

            
            for col in range(config.columns):
                if obs['board'][col] != 0:
                    probs[col] = 0.0

            if probs.sum() == 0:
                
                action = random.choice([c for c in range(config.columns) if obs['board'][c] == 0])
            else:
                probs = probs / probs.sum()
                action = int(torch.multinomial(probs, 1).item())
        return action

    return agent


In [7]:
torch.save(agent.model.state_dict(), "./trained_models/PPO/ppo_connect4_model_1_5m_steps.pt")

In [4]:

double_dql_agent_b1 = create_dql_agent("./trained_models/DQL/double_dql_blockcheck_1.pt")
double_dql_agent_b1e = create_dql_agent("./trained_models/DQL/double_dql_blockcheck_1e.pt")
double_dql_agent_b1f = create_dql_agent("./trained_models/DQL/double_dql_blockcheck_1f.pt")
double_dql_agent_b1g = create_dql_agent("./trained_models/DQL/double_dql_b1g_1M_episodes.pt")
ppo_agent_1 = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1.pt")
ppo_agent_1ff = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1ff.pt")
ppo_agent_1f = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1f.pt")
ppo_agent_32batch = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1_32batch.pt")
ppo_agent_64batch = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1_64batch.pt")
ppo_agent_128batch = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1_128batch.pt")
ppo_agent_256batch = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1_256batch.pt")
ppo_agent_2epochs = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1_2epochs.pt")
ppo_agent_4epochs = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1_64batch.pt")
ppo_agent_6epochs = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1_6epochs.pt")
ppo_agent_8epochs = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1_8epochs.pt")
ppo_agent_10epochs = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1_10epochs.pt")
ppo_agent_10mil_steps = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1_10m_steps.pt")
ppo_agent_5mil_steps = create_ppo_agent("./trained_models/PPO/ppo_connect4_model_1_5m_steps.pt")
ppo_agent_connect5_1 = create_ppo_agent("./trained_models/PPO/ppo_connect5_model_1.pt", input_shape=(1, 8, 8),n_actions=8 )
ppo_agent_connect5_1e = create_ppo_agent("./trained_models/PPO/ppo_connect5_model_1e.pt", input_shape=(1, 8, 8),n_actions=8)

In [None]:
get_win_percentages(ppo_agent_connect5_1, "negamax", n_rounds=200)

Total games: 200
Agent 1 Win Percentage: 32.00%
Agent 2 Win Percentage: 68.00%
Tie Percentage: 0.00%
Invalid plays by Agent 1: 0
Invalid plays by Agent 2: 0


In [12]:
get_win_percentages(ppo_agent_connect5_1e, "negamax", n_rounds=200, rows=8, columns=8, inarow=5)

Total games: 200
Agent 1 Win Percentage: 1.00%
Agent 2 Win Percentage: 87.50%
Tie Percentage: 11.50%
Invalid plays by Agent 1: 0
Invalid plays by Agent 2: 0


In [None]:
get_win_and_move_stats(ppo_agent_10mil_steps, alphabeta_agent, n_rounds=100)

Games played: 100
Agent 1 wins: 0
Agent 2 wins: 100
Ties: 0
Total moves by Agent 1: 1924
Total moves by Agent 2: 1924


In [7]:
env = make("connectx", debug=True)
env.reset()

# Play one episode: DQL agent goes first, Negamax goes second
env.run([block_check_agent, "negamax"])

env.render(mode="ipython", width=500, height=450)