# Introduction

Forked from the tutorial notebook. In the tutorial, you learned a bit about reinforcement learning and used the `stable-baselines3` package to train an agent to beat a random opponent. Now we will create a combined agent which uses both Deep Reinforcement Learning & MiniMax with alpha-beta pruning

**References & Credits:**

* Pretrained models: https://www.kaggle.com/code/salmtcat/ppo-vs-a2c-vs-dqn-and-mlppolicy-vs-cnnpolicy/notebook
* Minimax: https://www.kaggle.com/code/syedjarullahhisham/connectx-n-step-lookahead-minimax

In [1]:
from learntools.core import binder
binder.bind(globals())
from learntools.game_ai.ex4 import *

In [2]:
import gym
from kaggle_environments import make, evaluate
from gym import spaces

import random
import os
import numpy as np
import pandas as pd
import torch as th
from torch import nn as nn
import torch.nn.functional as F
import torch

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# !pip install "stable-baselines3"
from stable_baselines3 import PPO
from stable_baselines3 import A2C
from stable_baselines3 import DQN

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import load_results
from stable_baselines3.common.torch_layers import NatureCNN
from stable_baselines3.common.policies import ActorCriticPolicy, ActorCriticCnnPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env

import warnings
warnings.filterwarnings('ignore')

No pygame installed, ignoring import


# Setup Environment

In [3]:
class ConnectFourGym(gym.Env):
    """Gym-style wrapper around the Kaggle ``connectx`` environment.

    The learning agent always occupies the first seat; ``agent2`` is the
    opponent handed to ``ks_env.train``. Observations are the board
    reshaped to ``(1, rows, columns)``.
    """

    def __init__(self, agent2="random"):
        kaggle_env = make("connectx", debug=True)
        self.env = kaggle_env.train([None, agent2])
        board_cfg = kaggle_env.configuration
        self.rows = board_cfg.rows
        self.columns = board_cfg.columns
        # Learn about spaces here: http://gym.openai.com/docs/#spaces
        # One discrete action per column.
        self.action_space = gym.spaces.Discrete(self.columns)
        board_shape = (1, self.rows, self.columns)
        self.observation_space = gym.spaces.Box(
            low=0, high=2, shape=board_shape, dtype=int)
        # Tuple corresponding to the min and max possible rewards
        self.reward_range = (-10, 1)
        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None

    def _board_array(self):
        """Return the current board as a ``(1, rows, columns)`` array."""
        return np.array(self.obs['board']).reshape(1, self.rows, self.columns)

    def reset(self):
        """Start a new game and return the initial board observation."""
        self.obs = self.env.reset()
        return self._board_array()

    def change_reward(self, old_reward, done):
        """Map the environment reward onto the training reward."""
        if old_reward == 1:
            # The agent won the game
            return 1
        if done:
            # The episode ended without the agent winning
            return -1
        # Game still running: small per-move reward (1/42 on a 6x7 board)
        return 1/(self.rows*self.columns)

    def step(self, action):
        """Play ``action``; an invalid column ends the episode with -10."""
        column = int(action)
        # The first `columns` board entries are read as the top cell of each column
        if self.obs['board'][column] != 0:
            # End the game and penalize agent
            return self._board_array(), -10, True, {}
        self.obs, old_reward, done, info = self.env.step(column)
        reward = self.change_reward(old_reward, done)
        return self._board_array(), reward, done, info

In [4]:
# Create ConnectFour environment
# Keep the raw environment under its own name: the factory lambda below looks
# up its variable when it is *called*, so closing over `env` (which is rebound
# to the vectorized wrapper on the same line) would hand back the wrapper
# itself if the factory were ever invoked again.
base_env = ConnectFourGym(agent2='negamax')

# Create directory for logging training information
log_dir = "log/"
os.makedirs(log_dir, exist_ok=True)

# Vectorize the environment so that algorithms work correctly
# env = Monitor(base_env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: base_env])

# Win Percentage Calculation

In [5]:
from kaggle_environments import make, evaluate
import numpy as np

def evaluate_agent(agent1, agent2, n_rounds=100):
    """Play ``n_rounds`` games between two agents and print a summary.

    Roughly half of the games are played with ``agent1`` moving first and
    the rest with ``agent2`` moving first. Results of the second batch are
    flipped so every outcome reads ``[agent1_result, agent2_result]``.
    """
    # Use default Connect Four setup
    config = {'rows': 6, 'columns': 7, 'inarow': 4}
    games_agent1_first = n_rounds // 2
    games_agent2_first = n_rounds - games_agent1_first

    outcomes = evaluate("connectx", [agent1, agent2], config, [], games_agent1_first)
    swapped = evaluate("connectx", [agent2, agent1], config, [], games_agent2_first)
    # Re-order the swapped games back to (agent1, agent2)
    outcomes.extend([second, first] for first, second in swapped)

    total_games = len(outcomes)
    agent1_win = np.round(outcomes.count([1, -1]) / total_games, 2)
    agent2_win = np.round(outcomes.count([-1, 1]) / total_games, 2)
    print("Agent 1 Win Percentage:", agent1_win)
    print("Agent 2 Win Percentage:", agent2_win)
    # An outcome containing None is counted as an invalid play by that side
    print("Number of Invalid Plays by Agent 1:", outcomes.count([None, 0]))
    print("Number of Invalid Plays by Agent 2:", outcomes.count([0, None]))

# Setup Architecture

In [6]:
# Neural network for predicting action values
class CustomCNN(BaseFeaturesExtractor):
    """Convolutional feature extractor for the board observation.

    Two unpadded 3x3 convolutions followed by a linear projection to
    ``features_dim`` features.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int=128):
        super().__init__(observation_space, features_dim)
        # CxHxW images (channels first)
        in_channels = observation_space.shape[0]
        conv_stack = [
            nn.Conv2d(in_channels, 32, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*conv_stack)

        # Push one sampled observation through the conv stack to learn the
        # flattened size instead of hard-coding it.
        with th.no_grad():
            probe = th.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self.cnn(probe).shape[1]

        projection = nn.Linear(n_flatten, features_dim)
        self.linear = nn.Sequential(projection, nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Return the feature vector for a batch of observations."""
        conv_features = self.cnn(observations)
        return self.linear(conv_features)

# Passed to every model constructor so each policy uses CustomCNN as its feature extractor
policy_kwargs = {"features_extractor_class": CustomCNN}

# Train Model

In [7]:
def train_model(model, name):
    """Alternate learning and evaluation rounds, saving on improvement.

    The model is evaluated once up front, then trained for four rounds of
    12000 timesteps. After each round it is re-evaluated on the global
    ``env``; whenever the mean reward beats the best seen so far the model
    is saved under ``name``. From the second round on, the reward history
    and its rolling mean are plotted.

    Returns the (trained) ``model``.
    """
    def _evaluate():
        # Evaluate over 100 episodes and report in the notebook's format
        mean, std = evaluate_policy(model, env, n_eval_episodes=100)
        print(f"mean_reward:{mean:.2f} +/- {std:.2f}")
        return mean, std

    # First run -> untrained model, afterwards -> trained
    best_mean_reward, _ = _evaluate()

    history = None  # becomes a DataFrame after the first round

    for round_no in range(1, 5):
        print('Learning step:', round_no)
        model.learn(total_timesteps=12000, reset_num_timesteps=False)
        print('evaluate_policy:', round_no)
        mean_reward, std_reward = _evaluate()

        if mean_reward > best_mean_reward:
            print('Improved!')
            best_mean_reward = mean_reward
            model.save(f"{name}")
        else:
            print(f"Current best:{best_mean_reward:.2f}")

        round_row = pd.DataFrame(
            {
                'mean_reward': [mean_reward],
                'std_reward': [std_reward],
                'iteration': [round_no],
            },
            index=[round_no],
        )

        if history is None:
            # Nothing to plot yet with a single data point
            history = round_row
            continue

        history = pd.concat([history, round_row])
        rolling_mean = history.mean_reward.rolling(15, min_periods=1).mean()
        history['mean_roll_average'] = rolling_mean
        history['mean_roll_average'] = history['mean_roll_average'].fillna(history['mean_reward'])

        plt.figure(figsize=(12, 3))
        sns.lineplot(x=history['iteration'], y=history['mean_reward'], label="reward")
        sns.lineplot(x=history['iteration'], y=history['mean_roll_average'], label="average")
        plt.show()

    return model

# Different Agents Builder

# DQN

## MLP Policy 

In [8]:
def train_DQN_MLP():
    """Train (or resume training of) the DQN agent with ``MlpPolicy``.

    One file stem is used for the existence check, the load and the save.
    Previously the checkpoint was written as ``'DQN MLP'`` but looked up as
    ``'DQN_MLP'``, so a rerun never resumed from its own checkpoint.

    Returns the trained model so the caller can keep it (the original
    discarded it and returned ``None``).
    """
    model_name = 'DQN_MLP'

    if os.path.exists(f'{model_name}.zip'):
        print('file found')
        model = DQN.load(model_name, env=env)
    else:
        # Only build a fresh model when there is nothing to resume from
        model = DQN('MlpPolicy', env, policy_kwargs=policy_kwargs)

    print('DQN MLP')
    model = train_model(model, model_name)
    print(model.policy)
    return model

def DQN_MLP_Agent(obs, config):
    """Kaggle-style agent: ask the global ``DQN_MLP`` model for a column."""
    # Same (1, rows, columns) layout the model was trained on
    board = np.array(obs['board']).reshape(1, config.rows, config.columns)
    prediction = DQN_MLP.predict(board)
    return int(prediction[0])

## CNN Policy 

In [9]:
def train_DQN_CNN():
    """Train (or resume training of) the DQN agent with ``CnnPolicy``.

    One file stem is used for the existence check, the load and the save.
    Previously the checkpoint was written as ``'DQN CNN'`` but looked up as
    ``'DQN_CNN'``, so a rerun never resumed from its own checkpoint.

    Returns the trained model so the caller can keep it (the original
    discarded it and returned ``None``).
    """
    model_name = 'DQN_CNN'

    if os.path.exists(f'{model_name}.zip'):
        print('file found')
        model = DQN.load(model_name, env=env)
    else:
        # Only build a fresh model when there is nothing to resume from
        model = DQN('CnnPolicy', env, policy_kwargs=policy_kwargs)

    print('DQN CNN')
    model = train_model(model, model_name)
    print(model.policy)
    return model

def DQN_CNN_Agent(obs, config):
    """Kaggle-style agent: ask the global ``DQN_CNN`` model for a column."""
    # Same (1, rows, columns) layout the model was trained on
    board = np.array(obs['board']).reshape(1, config.rows, config.columns)
    prediction = DQN_CNN.predict(board)
    return int(prediction[0])

# PPO

## MLP Policy 

In [10]:
def train_PPO_MLP():
    """Train (or resume training of) the PPO agent with ``MlpPolicy``.

    One file stem is used for the existence check, the load and the save.
    Previously the checkpoint was written as ``'PPO MLP'`` but looked up as
    ``'PPO_MLP'``, so a rerun never resumed from its own checkpoint.

    Returns the trained model so the caller can keep it (the original
    discarded it and returned ``None``).
    """
    model_name = 'PPO_MLP'

    if os.path.exists(f'{model_name}.zip'):
        print('file found')
        model = PPO.load(model_name, env=env)
    else:
        # Only build a fresh model when there is nothing to resume from
        model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs)

    print('PPO MLP')
    model = train_model(model, model_name)
    print(model.policy)
    return model

def PPO_MLP_Agent(obs, config):
    """Kaggle-style agent: ask the global ``PPO_MLP`` model for a column."""
    # Same (1, rows, columns) layout the model was trained on
    board = np.array(obs['board']).reshape(1, config.rows, config.columns)
    prediction = PPO_MLP.predict(board)
    return int(prediction[0])

## CNN Policy 

In [11]:
def train_PPO_CNN():
    """Train (or resume training of) the PPO agent with ``CnnPolicy``.

    One file stem is used for the existence check, the load and the save.
    Previously the checkpoint was written as ``'PPO CNN'`` but looked up as
    ``'PPO_CNN'``, so a rerun never resumed from its own checkpoint.

    Returns the trained model so the caller can keep it (the original
    discarded it and returned ``None``).
    """
    model_name = 'PPO_CNN'

    if os.path.exists(f'{model_name}.zip'):
        print('file found')
        model = PPO.load(model_name, env=env)
    else:
        # Only build a fresh model when there is nothing to resume from
        model = PPO('CnnPolicy', env, policy_kwargs=policy_kwargs)

    print('PPO CNN')
    model = train_model(model, model_name)
    print(model.policy)
    return model

def PPO_CNN_Agent(obs, config):
    """Kaggle-style agent: ask the global ``PPO_CNN`` model for a column."""
    # Same (1, rows, columns) layout the model was trained on
    board = np.array(obs['board']).reshape(1, config.rows, config.columns)
    prediction = PPO_CNN.predict(board)
    return int(prediction[0])

# Train Different Agents

This should only be run the first time. After that, we will just load the previously trained agents.

In [None]:
# Run every trainer once, in the same order as before
for _trainer in (train_DQN_MLP, train_DQN_CNN, train_PPO_MLP, train_PPO_CNN):
    _trainer()

# Load Saved Agents

Stable-Baselines3 can only load models from zip files. If this notebook is run to train all agents, that is not a problem, because every trained agent is saved as a zip file in the working folder. However, training the agents takes hours on every run, so I added the pretrained agents from a previous version of this notebook to the input folder. Here is the catch: by default, Kaggle automatically unzips zip files when they are added as input, but Stable-Baselines3 can only process zip files. That is why I first re-create the zip files of the models in the working folder using the `shutil` library and then load them from there. Here is the code:

In [12]:
import shutil
# Re-zip each pretrained model folder from the input dataset into the working
# directory as <name>.zip, so it can be loaded by name in the next cell.
# The value of the last call is what the notebook displays for this cell.
shutil.make_archive("DQN_MLP", 'zip', "/kaggle/input/deep-rl-connectx/DQN MLP")
shutil.make_archive("DQN_CNN", 'zip', "/kaggle/input/deep-rl-connectx/DQN CNN")
shutil.make_archive("PPO_MLP", 'zip', "/kaggle/input/deep-rl-connectx/PPO MLP")
shutil.make_archive("PPO_CNN", 'zip', "/kaggle/input/deep-rl-connectx/PPO CNN")

'/kaggle/working/PPO_CNN.zip'

In [13]:
def _restore_agent(algo_cls, policy_name, archive_name):
    """Build a model of ``algo_cls`` and overwrite its weights from ``archive_name``."""
    model = algo_cls(policy_name, env, policy_kwargs=policy_kwargs)
    model.set_parameters(archive_name)
    return model


# Global models read by the *_Agent functions
DQN_MLP = _restore_agent(DQN, 'MlpPolicy', 'DQN_MLP')
DQN_CNN = _restore_agent(DQN, 'CnnPolicy', 'DQN_CNN')

PPO_MLP = _restore_agent(PPO, 'MlpPolicy', 'PPO_MLP')
PPO_CNN = _restore_agent(PPO, 'CnnPolicy', 'PPO_CNN')

# War Between Agents

## Between RL Saved Agents 

In [21]:
num_episodes = 1000

# (title, first agent, second agent) for every pairing of the saved RL agents
_rl_matchups = (
    ('PPO_MLP_Agent vs PPO_CNN_Agent', PPO_MLP_Agent, PPO_CNN_Agent),
    ('DQN_MLP_Agent vs DQN_CNN_Agent', DQN_MLP_Agent, DQN_CNN_Agent),
    ('PPO_MLP_Agent vs DQN_MLP_Agent', PPO_MLP_Agent, DQN_MLP_Agent),
    ('PPO_CNN_Agent vs DQN_CNN_Agent', PPO_CNN_Agent, DQN_CNN_Agent),
    ('PPO_MLP_Agent vs DQN_CNN_Agent', PPO_MLP_Agent, DQN_CNN_Agent),
    ('PPO_CNN_Agent vs DQN_MLP_Agent', PPO_CNN_Agent, DQN_MLP_Agent),
)

for _title, _first, _second in _rl_matchups:
    print(_title)
    evaluate_agent(_first, _second, n_rounds=num_episodes)

PPO_MLP_Agent vs PPO_CNN_Agent
Agent 1 Win Percentage: 0.43
Agent 2 Win Percentage: 0.42
Number of Invalid Plays by Agent 1: 73
Number of Invalid Plays by Agent 2: 71
DQN_MLP_Agent vs DQN_CNN_Agent
Agent 1 Win Percentage: 0.5
Agent 2 Win Percentage: 0.5
Number of Invalid Plays by Agent 1: 0
Number of Invalid Plays by Agent 2: 0
PPO_MLP_Agent vs DQN_MLP_Agent
Agent 1 Win Percentage: 0.02
Agent 2 Win Percentage: 0.88
Number of Invalid Plays by Agent 1: 3
Number of Invalid Plays by Agent 2: 100
PPO_CNN_Agent vs DQN_CNN_Agent
Agent 1 Win Percentage: 0.03
Agent 2 Win Percentage: 0.95
Number of Invalid Plays by Agent 1: 1
Number of Invalid Plays by Agent 2: 19
PPO_MLP_Agent vs DQN_CNN_Agent
Agent 1 Win Percentage: 0.02
Agent 2 Win Percentage: 0.97
Number of Invalid Plays by Agent 1: 0
Number of Invalid Plays by Agent 2: 3
PPO_CNN_Agent vs DQN_MLP_Agent
Agent 1 Win Percentage: 0.04
Agent 2 Win Percentage: 0.8
Number of Invalid Plays by Agent 1: 7
Number of Invalid Plays by Agent 2: 155


## RL Agents Against random agents 

In [16]:
# Each saved RL agent against the built-in "random" opponent
_random_matchups = (
    ('PPO_MLP_Agent vs Random Agent', PPO_MLP_Agent),
    ('PPO_CNN_Agent vs Random Agent', PPO_CNN_Agent),
    ('DQN_MLP_Agent vs Random Agent', DQN_MLP_Agent),
    ('DQN_CNN_Agent vs Random Agent', DQN_CNN_Agent),
)

for _title, _rl_agent in _random_matchups:
    print(_title)
    evaluate_agent(_rl_agent, "random", n_rounds=num_episodes)

PPO_MLP_Agent vs Random Agent
Agent 1 Win Percentage: 0.56
Agent 2 Win Percentage: 0.35
Number of Invalid Plays by Agent 1: 9
Number of Invalid Plays by Agent 2: 0
PPO_CNN_Agent vs Random Agent
Agent 1 Win Percentage: 0.58
Agent 2 Win Percentage: 0.3
Number of Invalid Plays by Agent 1: 12
Number of Invalid Plays by Agent 2: 0
DQN_MLP_Agent vs Random Agent
Agent 1 Win Percentage: 0.72
Agent 2 Win Percentage: 0.0
Number of Invalid Plays by Agent 1: 28
Number of Invalid Plays by Agent 2: 0
DQN_CNN_Agent vs Random Agent
Agent 1 Win Percentage: 0.63
Agent 2 Win Percentage: 0.05
Number of Invalid Plays by Agent 1: 32
Number of Invalid Plays by Agent 2: 0


## RL Agents Against negamax agents 

In [18]:
num_episodes = 100  # as these tests take time to run

# Each saved RL agent against the built-in "negamax" opponent
_negamax_matchups = (
    ('PPO_MLP_Agent vs Negamax Agent', PPO_MLP_Agent),
    ('PPO_CNN_Agent vs Negamax Agent', PPO_CNN_Agent),
    ('DQN_MLP_Agent vs Negamax Agent', DQN_MLP_Agent),
    ('DQN_CNN_Agent vs Negamax Agent', DQN_CNN_Agent),
)

for _title, _rl_agent in _negamax_matchups:
    print(_title)
    evaluate_agent(_rl_agent, "negamax", n_rounds=num_episodes)

PPO_MLP_Agent vs Negamax Agent
Agent 1 Win Percentage: 0.01
Agent 2 Win Percentage: 0.99
Number of Invalid Plays by Agent 1: 0
Number of Invalid Plays by Agent 2: 0
PPO_CNN_Agent vs Negamax Agent
Agent 1 Win Percentage: 0.01
Agent 2 Win Percentage: 0.97
Number of Invalid Plays by Agent 1: 2
Number of Invalid Plays by Agent 2: 0
DQN_MLP_Agent vs Negamax Agent
Agent 1 Win Percentage: 0.0
Agent 2 Win Percentage: 0.51
Number of Invalid Plays by Agent 1: 49
Number of Invalid Plays by Agent 2: 0
DQN_CNN_Agent vs Negamax Agent
Agent 1 Win Percentage: 0.0
Agent 2 Win Percentage: 0.06
Number of Invalid Plays by Agent 1: 94
Number of Invalid Plays by Agent 2: 0


## Against My Last Best Agent(Minimax Agent)

I created several agents in previous tutorials, of which minimax with alpha-beta pruning performed best. We will use it as a custom opponent to evaluate our new agents.

In [19]:
def my_minimax_agent(obs, config):
    """Choose a column using depth-limited minimax with alpha-beta pruning.

    Args:
        obs: observation exposing ``board`` (flat, row-major sequence of
            cell values, 0 = empty) and ``mark`` (this agent's piece).
        config: game configuration exposing ``rows``, ``columns`` and
            ``inarow``.

    Returns:
        A column index picked at random among the best-scoring valid moves.

    Raises:
        IndexError: if no column is playable (``random.choice`` on an
            empty list), as before.
    """
    # Imports are kept function-local, as in the original, so the agent
    # function carries everything it needs with it.
    import random
    import numpy as np

    # How deep to make the game tree: higher values take longer to run!
    N_STEPS = 3

    # Yields every horizontal, vertical and diagonal window of length inarow
    def iter_windows(grid, config):
        span = config.inarow
        row_starts = range(config.rows-(span-1))
        col_starts = range(config.columns-(span-1))
        # horizontal
        for row in range(config.rows):
            for col in col_starts:
                yield list(grid[row, col:col+span])
        # vertical
        for row in row_starts:
            for col in range(config.columns):
                yield list(grid[row:row+span, col])
        # positive diagonal
        for row in row_starts:
            for col in col_starts:
                yield list(grid[range(row, row+span), range(col, col+span)])
        # negative diagonal
        for row in range(span-1, config.rows):
            for col in col_starts:
                yield list(grid[range(row, row-span, -1), range(col, col+span)])

    # Helper function for get_heuristic: checks if window satisfies heuristic conditions
    def check_window(window, num_discs, piece, config):
        return (window.count(piece) == num_discs and window.count(0) == config.inarow-num_discs)

    # Helper function for get_heuristic: counts the pre-built windows satisfying the heuristic conditions
    def count_windows(windows, num_discs, piece, config):
        return sum(1 for window in windows if check_window(window, num_discs, piece, config))

    # Helper function for minimax: calculates value of heuristic for grid
    def get_heuristic(grid, mark, config):
        A = 1e6    # own four-in-a-row
        B = 1e2    # own three with one empty cell
        C = 1      # own two with two empty cells
        D = -1e3   # opponent three with one empty cell
        E = -1e11  # opponent four-in-a-row
        opponent = mark%2+1
        # Build the windows once instead of once per count
        windows = list(iter_windows(grid, config))
        num_twos = count_windows(windows, 2, mark, config)
        num_threes = count_windows(windows, 3, mark, config)
        num_fours = count_windows(windows, 4, mark, config)
        num_threes_opp = count_windows(windows, 3, opponent, config)
        num_fours_opp = count_windows(windows, 4, opponent, config)
        score = D*num_threes_opp + E*num_fours_opp + A*num_fours + B*num_threes + C*num_twos
        return score

    # Helper function for score_move: gets board at next step if agent drops piece in selected column
    def drop_piece(grid, col, mark, config):
        next_grid = grid.copy()
        # Scan from the bottom row upwards for the first empty cell
        for row in range(config.rows-1, -1, -1):
            if next_grid[row][col] == 0:
                break
        next_grid[row][col] = mark
        return next_grid

    # Uses minimax to calculate value of dropping piece in selected column
    def score_move(grid, col, mark, config, nsteps):
        next_grid = drop_piece(grid, col, mark, config)
        score = minimax(next_grid, nsteps-1, False, mark, config, -np.inf, np.inf)
        return score

    # Helper function for minimax: checks if agent or opponent has four in a row in the window
    def is_terminal_window(window, config):
        return window.count(1) == config.inarow or window.count(2) == config.inarow

    # Helper function for minimax: checks if game has ended
    def is_terminal_node(grid, config):
        # Check for draw: no empty cell left in the top row
        if list(grid[0, :]).count(0) == 0:
            return True
        # Check for win: horizontal, vertical, or diagonal
        return any(is_terminal_window(window, config) for window in iter_windows(grid, config))

    # Minimax implementation
    def minimax(node, depth, maximizingPlayer, mark, config, alpha, beta):
        if depth == 0 or is_terminal_node(node, config):
            return get_heuristic(node, mark, config)
        valid_moves = [c for c in range(config.columns) if node[0][c] == 0]
        if maximizingPlayer:
            # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0
            value = -np.inf
            for col in valid_moves:
                child = drop_piece(node, col, mark, config)
                value = max(value, minimax(child, depth-1, False, mark, config, alpha, beta))
                alpha = max(alpha, value)
                if beta <= alpha:
                    break
            return value
        else:
            value = np.inf
            for col in valid_moves:
                child = drop_piece(node, col, mark%2+1, config)
                value = min(value, minimax(child, depth-1, True, mark, config, alpha, beta))
                beta = min(beta, value)
                if beta <= alpha:
                    break
            return value

    # Get list of valid moves
    valid_moves = [c for c in range(config.columns) if obs.board[c] == 0]
    # Convert the board to a 2D grid
    grid = np.asarray(obs.board).reshape(config.rows, config.columns)
    # Use the heuristic to assign a score to each possible board in the next step
    scores = {col: score_move(grid, col, obs.mark, config, N_STEPS) for col in valid_moves}
    # Compute the best score once; default keeps the empty case falling through to random.choice
    best_score = max(scores.values(), default=None)
    # Get a list of columns (moves) that maximize the heuristic
    max_cols = [col for col, score in scores.items() if score == best_score]
    # Select at random from the maximizing columns
    return random.choice(max_cols)

In [20]:
# Each saved RL agent against the hand-written minimax agent
_minimax_matchups = (
    ('PPO_MLP_Agent vs Minimax Agent', PPO_MLP_Agent),
    ('PPO_CNN_Agent vs Minimax Agent', PPO_CNN_Agent),
    ('DQN_MLP_Agent vs Minimax Agent', DQN_MLP_Agent),
    ('DQN_CNN_Agent vs Minimax Agent', DQN_CNN_Agent),
)

for _title, _rl_agent in _minimax_matchups:
    print(_title)
    evaluate_agent(_rl_agent, my_minimax_agent, n_rounds=num_episodes)

PPO_MLP_Agent vs Minimax Agent
Agent 1 Win Percentage: 0.01
Agent 2 Win Percentage: 0.99
Number of Invalid Plays by Agent 1: 0
Number of Invalid Plays by Agent 2: 0
PPO_CNN_Agent vs Minimax Agent
Agent 1 Win Percentage: 0.0
Agent 2 Win Percentage: 1.0
Number of Invalid Plays by Agent 1: 0
Number of Invalid Plays by Agent 2: 0
DQN_MLP_Agent vs Minimax Agent
Agent 1 Win Percentage: 0.0
Agent 2 Win Percentage: 0.2
Number of Invalid Plays by Agent 1: 80
Number of Invalid Plays by Agent 2: 0
DQN_CNN_Agent vs Minimax Agent
Agent 1 Win Percentage: 0.0
Agent 2 Win Percentage: 0.62
Number of Invalid Plays by Agent 1: 38
Number of Invalid Plays by Agent 2: 0


# Best RL Agent With MiniMax Submission Preparation

https://www.kaggle.com/code/salmtcat/ppo-vs-a2c-vs-dqn-and-mlppolicy-vs-cnnpolicy/notebook#Submission-file

Here we will combine our best RL agent, DQN_MLP, with our previous best minimax agent to form the submission agent. As we have seen, our RL agent makes a lot of invalid moves. We will replace those invalid moves with minimax moves.

In [24]:
%%writefile submission.py

def agent(obs, config):
    import numpy as np
    import torch as th
    from torch import nn as nn
    import torch.nn.functional as F
    from torch import tensor
    import random
    
    class Net(nn.Module):
        """Plain-PyTorch copy of the DQN network: two convs, one projection, three Q-layers."""

        def __init__(self):
            super().__init__()
            # Attribute names must match the keys of the exported state_dict
            self.cnn0 = nn.Conv2d(1, 32, kernel_size=3)
            self.cnn2 = nn.Conv2d(32, 64, kernel_size=3)
            self.linear0 = nn.Linear(384, 128)
            self.qnet0 = nn.Linear(128, 64)
            self.qnet2 = nn.Linear(64, 64)
            self.qnet4 = nn.Linear(64, 7)

        def forward(self, x):
            """Return the index of the largest Q-value for the input board."""
            for conv in (self.cnn0, self.cnn2):
                x = F.relu(conv(x))
            # Keep the batch dimension, flatten everything else
            x = x.flatten(start_dim=1)
            for dense in (self.linear0, self.qnet0, self.qnet2):
                x = F.relu(dense(x))
            q_values = self.qnet4(x)
            return q_values.argmax()
        
        
        #PPO
        # will be slightly different

Overwriting submission.py


In [23]:
# Show the parameter names of the DQN policy; the export cell below maps the
# 'q_net.*' entries onto the attribute names of the submission's Net class.
DQN_MLP.policy.state_dict().keys()

odict_keys(['q_net.features_extractor.cnn.0.weight', 'q_net.features_extractor.cnn.0.bias', 'q_net.features_extractor.cnn.2.weight', 'q_net.features_extractor.cnn.2.bias', 'q_net.features_extractor.linear.0.weight', 'q_net.features_extractor.linear.0.bias', 'q_net.q_net.0.weight', 'q_net.q_net.0.bias', 'q_net.q_net.2.weight', 'q_net.q_net.2.bias', 'q_net.q_net.4.weight', 'q_net.q_net.4.bias', 'q_net_target.features_extractor.cnn.0.weight', 'q_net_target.features_extractor.cnn.0.bias', 'q_net_target.features_extractor.cnn.2.weight', 'q_net_target.features_extractor.cnn.2.bias', 'q_net_target.features_extractor.linear.0.weight', 'q_net_target.features_extractor.linear.0.bias', 'q_net_target.q_net.0.weight', 'q_net_target.q_net.0.bias', 'q_net_target.q_net.2.weight', 'q_net_target.q_net.2.bias', 'q_net_target.q_net.4.weight', 'q_net_target.q_net.4.bias'])

In [25]:
# Print tensors in full so the repr written into submission.py is not elided.
# NOTE(review): the repr is still subject to torch's print precision, so the
# exported weights may be rounded — confirm this is acceptable.
th.set_printoptions(profile="full")

agent_path = 'submission.py'

# Take the online Q-network's weights (on CPU) and rename them to the
# attribute names used by the Net class in submission.py.
state_dict = DQN_MLP.policy.to('cpu').state_dict()
state_dict = {
    'cnn0.weight': state_dict['q_net.features_extractor.cnn.0.weight'],
    'cnn0.bias': state_dict['q_net.features_extractor.cnn.0.bias'],
    'cnn2.weight': state_dict['q_net.features_extractor.cnn.2.weight'],
    'cnn2.bias': state_dict['q_net.features_extractor.cnn.2.bias'],
    
    'linear0.weight': state_dict['q_net.features_extractor.linear.0.weight'],
    'linear0.bias': state_dict['q_net.features_extractor.linear.0.bias'],

    'qnet0.weight': state_dict['q_net.q_net.0.weight'],
    'qnet0.bias': state_dict['q_net.q_net.0.bias'],
    'qnet2.weight': state_dict['q_net.q_net.2.weight'],
    'qnet2.bias': state_dict['q_net.q_net.2.bias'],
    'qnet4.weight': state_dict['q_net.q_net.4.weight'],
    'qnet4.bias': state_dict['q_net.q_net.4.bias'],
    
    # PPO will be slightly different according to found policies
}

# Append the weights as a literal assignment, indented to sit inside agent()
with open(agent_path, mode='a') as file:
    file.write(f'    state_dict = {state_dict}\n')
    
print('saved state_dict')

saved state_dict


In [26]:
%%writefile -a submission.py

        
    # How deep to make the game tree: higher values take longer to run!
    N_STEPS = 3
    
    # Helper function for score_move: gets board at next step if agent drops piece in selected column
    def drop_piece(grid, col, mark, config):
        """Return a copy of ``grid`` with ``mark`` dropped into column ``col``."""
        board = grid.copy()
        # Default of 0 mirrors the original loop, which ends on the top row
        # when the column has no empty cell.
        landing_row = 0
        for candidate in reversed(range(config.rows)):
            if board[candidate][col] == 0:
                landing_row = candidate
                break
        board[landing_row][col] = mark
        return board
    
    # Helper function for minimax: calculates value of heuristic for grid
    def get_heuristic(grid, mark, config):
        """Score ``grid`` from the point of view of ``mark`` (higher is better)."""
        rival = mark % 2 + 1
        own_twos = count_windows(grid, 2, mark, config)
        own_threes = count_windows(grid, 3, mark, config)
        own_fours = count_windows(grid, 4, mark, config)
        rival_threes = count_windows(grid, 3, rival, config)
        rival_fours = count_windows(grid, 4, rival, config)
        # Weights: rival threes -1e3, rival fours -1e11, own fours 1e6,
        # own threes 1e2, own twos 1.
        return (
            -1e3 * rival_threes
            + -1e11 * rival_fours
            + 1e6 * own_fours
            + 1e2 * own_threes
            + 1 * own_twos
        )

    # Helper function for get_heuristic: checks if window satisfies heuristic conditions
    def check_window(window, num_discs, piece, config):
        """True when ``window`` holds exactly ``num_discs`` of ``piece`` and is otherwise empty."""
        if window.count(piece) != num_discs:
            return False
        empties_needed = config.inarow - num_discs
        return window.count(0) == empties_needed

    # Helper function for get_heuristic: counts number of windows satisfying specified heuristic conditions
    def count_windows(grid, num_discs, piece, config):
        """Count the length-``inarow`` windows of ``grid`` accepted by ``check_window``."""
        span = config.inarow
        row_starts = range(config.rows - (span - 1))
        col_starts = range(config.columns - (span - 1))

        windows = []
        # horizontal
        windows += [list(grid[r, c:c + span])
                    for r in range(config.rows) for c in col_starts]
        # vertical
        windows += [list(grid[r:r + span, c])
                    for r in row_starts for c in range(config.columns)]
        # positive diagonal
        windows += [list(grid[range(r, r + span), range(c, c + span)])
                    for r in row_starts for c in col_starts]
        # negative diagonal
        windows += [list(grid[range(r, r - span, -1), range(c, c + span)])
                    for r in range(span - 1, config.rows) for c in col_starts]

        return sum(1 for window in windows
                   if check_window(window, num_discs, piece, config))

    # Uses minimax to calculate value of dropping piece in selected column
    def score_move(grid, col, mark, config, nsteps):
        """Minimax value of dropping ``mark`` into ``col``; the opponent moves next."""
        board_after_move = drop_piece(grid, col, mark, config)
        return minimax(board_after_move, nsteps - 1, False, mark, config, -np.inf, np.inf)

    # Helper function for minimax: checks if agent or opponent has four in a row in the window
    def is_terminal_window(window, config):
        """True when either player fills the whole window."""
        return config.inarow in (window.count(1), window.count(2))

    # Helper function for minimax: checks if game has ended
    def is_terminal_node(grid, config):
        """True when the top row is full or some window is won by either player."""
        # Check for draw
        if 0 not in list(grid[0, :]):
            return True

        span = config.inarow
        row_starts = range(config.rows - (span - 1))
        col_starts = range(config.columns - (span - 1))

        # Check for win: horizontal, vertical, or diagonal
        # horizontal
        if any(is_terminal_window(list(grid[r, c:c + span]), config)
               for r in range(config.rows) for c in col_starts):
            return True
        # vertical
        if any(is_terminal_window(list(grid[r:r + span, c]), config)
               for r in row_starts for c in range(config.columns)):
            return True
        # positive diagonal
        if any(is_terminal_window(list(grid[range(r, r + span), range(c, c + span)]), config)
               for r in row_starts for c in col_starts):
            return True
        # negative diagonal
        return any(
            is_terminal_window(list(grid[range(r, r - span, -1), range(c, c + span)]), config)
            for r in range(span - 1, config.rows) for c in col_starts
        )

    # Minimax implementation
    def minimax(node, depth, maximizingPlayer, mark, config, alpha, beta):
        """Depth-limited minimax with alpha-beta pruning.

        ``mark`` is always the piece of the player being maximised for;
        ``maximizingPlayer`` says whose turn it is at ``node``. Returns the
        heuristic value of ``node`` under best play by both sides.
        """
        is_terminal = is_terminal_node(node, config)
        valid_moves = [c for c in range(config.columns) if node[0][c] == 0]
        if depth == 0 or is_terminal:
            return get_heuristic(node, mark, config)
        if maximizingPlayer:
            # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0
            value = -np.inf
            for col in valid_moves:
                child = drop_piece(node, col, mark, config)
                value = max(value, minimax(child, depth-1, False, mark, config, alpha, beta))
                alpha = max(alpha, value)
                if beta <= alpha:
                    # The opponent already has a better option elsewhere: prune
                    break
            return value
        else:
            value = np.inf
            for col in valid_moves:
                child = drop_piece(node, col, mark%2+1, config)
                value = min(value, minimax(child, depth-1, True, mark, config, alpha, beta))
                beta = min(beta, value)
                if beta <= alpha:
                    # The maximiser already has a better option elsewhere: prune
                    break
            return value

    def get_minimax_move(obs, config):
        """Return the playable column with the highest score_move value.

        Ties between equally scored columns are broken with random.choice.
        """
        # The first `config.columns` entries of the flat board are the top row;
        # a 0 there means the column still has room.
        open_cols = [c for c in range(config.columns) if obs.board[c] == 0]
        board_2d = np.asarray(obs.board).reshape(config.rows, config.columns)

        # Score every candidate column, looking N_STEPS ahead.
        col_scores = {}
        for c in open_cols:
            col_scores[c] = score_move(board_2d, c, obs.mark, config, N_STEPS)

        # With no open column there is no best score; the empty candidate list
        # then makes random.choice raise, as before.
        top_score = max(col_scores.values(), default=None)
        best_cols = [c for c, s in col_scores.items() if s == top_score]
        return random.choice(best_cols)

    def check_winning_move_grid(grid, config, col, piece):
        """Report whether dropping `piece` into `col` of `grid` yields a win.

        The board returned by drop_piece is scanned for any straight run of
        ``config.inarow`` cells that all hold `piece` (horizontal, vertical and
        both diagonals).

        Returns:
            True if such a run exists, False otherwise.
        """
        board = drop_piece(grid, col, piece, config)
        span = config.inarow

        # (row step, column step) per direction:
        # horizontal, vertical, positive diagonal, negative diagonal.
        steps = ((0, 1), (1, 0), (1, 1), (-1, 1))

        for d_row, d_col in steps:
            for start_row in range(config.rows):
                # Skip starting rows whose window would leave the board.
                last_row = start_row + (span - 1) * d_row
                if not 0 <= last_row < config.rows:
                    continue
                # The column step is 0 or 1, so this bounds the window on the right.
                for start_col in range(config.columns - (span - 1) * d_col):
                    if all(
                        board[start_row + k * d_row][start_col + k * d_col] == piece
                        for k in range(span)
                    ):
                        return True
        return False

    # Returns True if dropping piece in column results in game win
    def check_winning_move(obs, config, col, piece):
        """Report whether dropping `piece` into `col` wins on the observed board.

        Reshapes the flat ``obs.board`` to ``(config.rows, config.columns)`` and
        delegates to check_winning_move_grid.
        """
        board_shape = (config.rows, config.columns)
        board_2d = np.asarray(obs.board).reshape(board_shape)
        return check_winning_move_grid(board_2d, config, col, piece)


    def check_action(obs, config, action):
        """Vet the model's proposed column and override it when it is clearly bad.

        Decision order:
          1. Play a column that wins immediately, if one exists.
          2. Otherwise play the column in which the opponent would win
             immediately (block it).
          3. Otherwise keep `action` if it is playable and does not let the
             opponent win by stacking on top of our piece.
          4. Otherwise fall back to get_minimax_move, accepting its answer only
             if it also passes the check in step 3 whenever some column does.

        Args:
            obs: Observation with ``board`` (flat list) and ``mark`` (our piece).
            config: Game configuration (``rows``, ``columns`` are read here).
            action: Column proposed by the model.

        Returns:
            The column to play.
        """
        valid_moves = [col for col in range(config.columns) if obs.board[col] == 0]
        # mark % 2 + 1 turns 1 into 2 and 2 into 1, i.e. the opponent's piece.
        opponent = obs.mark % 2 + 1

        # There is a winning move, don't wait for the model.
        for col in valid_moves:
            if check_winning_move(obs, config, col, obs.mark):
                return col

        # No winning move for us: block the opponent's winning move if any.
        for col in valid_moves:
            if check_winning_move(obs, config, col, opponent):
                return col

        # The model proposed an unplayable column: let minimax choose.
        if action not in valid_moves:
            return get_minimax_move(obs, config)
        action = int(action)

        grid = np.asarray(obs.board).reshape(config.rows, config.columns)

        def hands_opponent_win(col):
            # The opponent had no immediate win above, so the only square our
            # move can open up for them is the one on top of our piece.
            after_move = drop_piece(grid, col, obs.mark, config)
            # A column is playable only while its top cell is empty (the same
            # test minimax uses); if our piece filled it, nothing can be stacked.
            if after_move[0][col] != 0:
                return False
            return check_winning_move_grid(after_move, config, col, opponent)

        if not hands_opponent_win(action):
            return action

        # The proposed move gives the game away; look for columns that do not.
        safe_moves = [
            col for col in valid_moves
            if col != action and not hands_opponent_win(col)
        ]
        if not safe_moves:
            # Every playable column loses; the proposal is as good as any.
            return action

        # get_minimax_move looks at the whole board and may suggest the rejected
        # column again, so its answer is only used when it is among the safe ones.
        fallback = get_minimax_move(obs, config)
        if fallback in safe_moves:
            return int(fallback)
        return random.choice(safe_moves)

    # Build the network and load its weights. `Net` and `state_dict` come from
    # outside this section.
    # NOTE(review): these statements sit in the body of the enclosing function,
    # so the model appears to be rebuilt on every call — confirm and consider caching.
    model = Net()
    model = model.float()
    model.load_state_dict(state_dict)
    model = model.to('cpu')
    model = model.eval()

    # Reshape the flat board to (1, 1, rows, columns) as float input
    # (presumably batch and channel dimensions — TODO confirm against Net).
    obs_tensor = tensor(obs['board']).reshape(1, 1, config.rows, config.columns).float()
    obs_tensor = obs_tensor #/ 2 # Is this right? (scaling is disabled, so this line is currently a no-op)
    # The model output is used directly as the proposed column
    # (assumes Net's forward pass returns a single column index — TODO confirm).
    action = model(obs_tensor)

    # Let the rule-based checks override the proposal when needed.
    action = check_action(obs, config, action)
    
    return int(action)

Appending to submission.py


# Validate Submission & Battle Against Other Top Agents

In [27]:
# Load submission.py and execute it in this notebook's namespace
# (presumably this defines the `agent` evaluated below).
# NOTE: exec runs whatever the file contains; only use it on the file this
# notebook wrote itself.
with open(agent_path) as f:
    source = f.read()
exec(source)

In [29]:
# Evaluate against negamax: 100 rounds versus the "negamax" opponent.
evaluate_agent(agent1=agent, agent2="negamax", n_rounds=100)

Agent 1 Win Percentage: 0.58
Agent 2 Win Percentage: 0.36
Number of Invalid Plays by Agent 1: 0
Number of Invalid Plays by Agent 2: 0


In [30]:
# Evaluate against the previous best minimax agent (my_minimax_agent) over 100 rounds.
evaluate_agent(agent1=agent, agent2=my_minimax_agent, n_rounds=100)

Agent 1 Win Percentage: 0.32
Agent 2 Win Percentage: 0.68
Number of Invalid Plays by Agent 1: 0
Number of Invalid Plays by Agent 2: 0


In [31]:
# Evaluate against random: 100 rounds versus the "random" opponent.
evaluate_agent(agent1=agent, agent2="random", n_rounds=100)

Agent 1 Win Percentage: 1.0
Agent 2 Win Percentage: 0.0
Number of Invalid Plays by Agent 1: 0
Number of Invalid Plays by Agent 2: 0


# Congratulations!

You have completed the course, and it's time to put your new skills to work!  

The next step is to apply what you've learned to a **[more complex game: Halite](https://www.kaggle.com/c/halite)**.  For a step-by-step tutorial on how to make your first submission to this competition, **[check out the bonus lesson](https://www.kaggle.com/alexisbcook/getting-started-with-halite)**!

You can find more games as they're released on the **[Kaggle Simulations page](https://www.kaggle.com/simulations)**.

As we did in the course, we recommend that you start simple, with an agent that follows your precise instructions.  This will allow you to learn more about the mechanics of the game and to build intuition for what makes a good agent.  Then, gradually increase the complexity of your agents to climb the leaderboard!