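- Train Against: an older, periodically refreshed frozen snapshot of the agent itself (self-play)  <br>
- Evaluate Against: a mixed opponent that, on each of its turns, plays either the one-move-lookahead heuristic or a random available column with equal probability <br>
- Network A & B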

In [3]:
from ConnectFourBoard import ConnectFourBoard
from QLearningAgent import DQNAgent
import numpy as np
import random
import torch
import copy
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class HeuristicOpponent:
    """Rule-based opponent that looks exactly one move ahead.

    The opponent is hard-wired to be player ``-1`` and treats player ``1`` as
    the side it has to block. All look-ahead happens on deep copies of the
    environment, so the environment passed in is never stepped by this class.
    """

    def __init__(self, env):
        # Live environment that is queried (and copied) on every decision.
        self.env = env

    def choose_action(self):
        """Pick a column for player ``-1``.

        Priority order:
            1. a move after which the simulated board reports winner ``-1``;
            2. a move that would make player ``1`` the winner if player ``1``
               played it (i.e. the square to block);
            3. column ``3`` if it is still available;
            4. a random available column.

        Returns:
            The chosen action, taken from ``env.get_available_actions()``.
        """
        # Each scenario is (player forced to move in the simulation, winner we look for).
        # ``None`` keeps whoever is to move in the real env - the original "win now" probe.
        scenarios = ((None, -1), (1, 1))
        for forced_mover, wanted_winner in scenarios:
            for column in self.env.get_available_actions():
                simulated = copy.deepcopy(self.env)
                if forced_mover is not None:
                    # Pretend it is the other side's turn to see where it would win.
                    simulated.current_player = forced_mover
                simulated.step(column)
                if simulated.winner == wanted_winner:
                    return column

        # No immediate win or threat: prefer the centre column when it is open.
        centre_is_open = 3 in self.env.get_available_actions()
        if centre_is_open:
            return 3

        # Nothing better to do - play any available column.
        remaining = self.env.get_available_actions()
        return random.choice(remaining)

In [4]:
def _snapshot_opponent(agent):
    """Return a frozen, purely greedy copy of ``agent`` that plays on the agent's own env."""
    opponent_agent = copy.deepcopy(agent)
    opponent_agent.epsilon = 0.0  # the snapshot never explores
    # deepcopy also cloned the environment; rebind so both players share the live board.
    opponent_agent.env = agent.env
    return opponent_agent


def _evaluate_vs_mixed(agent, heuristic_opponent, eval_games):
    """Play ``eval_games`` games with exploration off and return the agent's win percentage.

    On each opponent turn a fair coin decides between the heuristic move and a
    random available column. The agent's ``epsilon`` is restored afterwards,
    even if a game raises.
    """
    env = agent.env
    epsilon_temp = agent.epsilon
    agent.epsilon = 0.0
    win_count = 0
    try:
        for _ in range(eval_games):
            state, _ = env.reset()
            done = False
            while not done:
                if env.current_player == 1:
                    action = agent.select_action(state)
                elif np.random.rand() < 0.5:
                    action = heuristic_opponent.choose_action()
                else:
                    action = np.random.choice(env.get_available_actions())
                state, reward, done, _, _ = env.step(action)

            if env.winner == 1:
                win_count += 1
    finally:
        # Never leave the agent stuck in greedy mode if evaluation fails midway.
        agent.epsilon = epsilon_temp

    return (win_count / eval_games) * 100


def train_agent(agent, episodes=5000, eval_every=1000, eval_games=100, opponent_update_freq=1000):
    """Train ``agent`` by self-play against a periodically refreshed frozen copy of itself.

    The agent moves whenever ``env.current_player == 1``; the frozen copy makes
    every other move. After each episode epsilon is decayed, the target network
    is synced every ``agent.target_update_freq`` episodes, and the frozen
    opponent is replaced by a fresh copy every ``opponent_update_freq`` episodes.
    Every ``eval_every`` episodes the agent is evaluated against the mixed
    (heuristic / random) opponent and the win percentage is printed. A win-rate
    curve is plotted at the end.

    Args:
        agent: Agent exposing ``env``, ``select_action``, ``store_transition``,
            ``update``, ``epsilon``, ``epsilon_end``, ``epsilon_decay``,
            ``target_update_freq``, ``q_net`` and ``target_net``.
        episodes: Number of training episodes.
        eval_every: Evaluate after every this many episodes.
        eval_games: Number of games per evaluation.
        opponent_update_freq: Refresh the frozen opponent every this many episodes.

    Returns:
        ``(rewards, win_percentages)``. ``rewards`` holds ``env.winner`` at the
        end of each training episode (the game outcome, not a summed reward);
        ``win_percentages`` holds one value per evaluation round.
    """
    rewards = []
    win_percentages = []

    # Create an opponent agent as a frozen copy of the main agent
    opponent_agent = _snapshot_opponent(agent)

    # The heuristic only needs a handle on the shared env, so build it once
    # instead of once per episode.
    heuristic_opponent = HeuristicOpponent(agent.env)

    for ep in range(1, episodes + 1):
        state, _ = agent.env.reset()
        done = False

        while not done:
            if agent.env.current_player == 1:
                action = agent.select_action(state)
                next_state, reward, done, _, _ = agent.env.step(action)
                agent.store_transition(state, action, reward, next_state, done)
                agent.update()
                state = next_state
            else:
                # NOTE(review): the reward/done produced by the opponent's move are
                # not stored anywhere, so a game that the opponent ends never shows
                # up as a transition for the agent - confirm this is intended.
                # NOTE(review): the opponent receives the same `state` the agent sees;
                # whether select_action accounts for the side to move is not visible
                # here - verify against DQNAgent.
                action = opponent_agent.select_action(state)
                state, reward, done, _, _ = agent.env.step(action)

        # Record the game outcome as reported by the environment.
        rewards.append(agent.env.winner)

        agent.epsilon = max(agent.epsilon_end, agent.epsilon * agent.epsilon_decay)

        # Update target network
        if ep % agent.target_update_freq == 0:
            agent.target_net.load_state_dict(agent.q_net.state_dict())

        # Update opponent agent (self-play reference)
        if ep % opponent_update_freq == 0:
            opponent_agent = _snapshot_opponent(agent)

        # Evaluation vs mixed (heuristic / random) opponent
        if ep % eval_every == 0:
            win_percentage = _evaluate_vs_mixed(agent, heuristic_opponent, eval_games)
            win_percentages.append(win_percentage)
            print(f"Episode {ep}: Agent won {win_percentage:.2f}% vs Mixed Opponent")

    # Plotting
    plt.plot(range(eval_every, episodes + 1, eval_every), win_percentages)
    plt.xlabel("Training Episodes")
    # The evaluation opponent is the mixed one, not a pure random baseline.
    plt.ylabel("Win % vs Mixed Opponent")
    plt.title("DQN Agent Self-Play Evaluation")
    plt.grid(True)
    plt.show()

    return rewards, win_percentages


In [None]:
# Experiment configuration.
SEED = 42
eval_every = 1000
episodes = 50000

# Seed every RNG this notebook draws from so that Restart & Run All reproduces the
# same curve: `random` (heuristic fallback move), `np.random` (mixed-opponent coin
# flip and random move) and torch.
# NOTE(review): whether DQNAgent uses additional RNG sources is not visible here,
# and GPU kernels may still be non-deterministic - confirm if exact reproducibility matters.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = ConnectFourBoard()
# Not used by train_agent (it builds its own heuristic opponent on agent.env);
# kept because later cells may reference it.
opponent = HeuristicOpponent(env)
agent = DQNAgent(env, device="cuda" if torch.cuda.is_available() else "cpu", network="B")
rewards, win_percentages = train_agent(
    agent, episodes=episodes, eval_every=eval_every, eval_games=100
)

Episode 1000: Agent won 39.00% vs Mixed Opponent
Episode 2000: Agent won 36.00% vs Mixed Opponent
Episode 3000: Agent won 42.00% vs Mixed Opponent


In [None]:
# Persist the per-episode outcomes and the evaluation win percentages to a
# single .npz archive (keys: "rewards", "win_percentages").
np.savez(
    "training_results_selfplay_mixed_B.npz",
    rewards=np.asarray(rewards),
    win_percentages=np.asarray(win_percentages),
)