In [1]:
!pip install pettingzoo

Collecting pettingzoo
  Downloading pettingzoo-1.25.0-py3-none-any.whl.metadata (8.9 kB)
Downloading pettingzoo-1.25.0-py3-none-any.whl (852 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m852.5/852.5 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pettingzoo
Successfully installed pettingzoo-1.25.0


In [2]:
import functools

import gymnasium
import numpy as np
from gymnasium.spaces import Discrete, Tuple
from gymnasium.utils import seeding
from pettingzoo import AECEnv
from pettingzoo.utils import wrappers
from pettingzoo.utils.agent_selector import agent_selector
import random
import matplotlib.pyplot as plt

In [13]:
class RockPaperScissorsEnv(AECEnv):
    """Single-round, two-player Rock-Paper-Scissors as a turn-based AEC environment.

    Actions: 0 = Rock, 1 = Paper, 2 = Scissors.
    Observations: 0-2 = the move the opponent has already played this round,
    3 = the opponent has not moved yet.

    NOTE(review): agents act one after the other and ``observe`` exposes the
    move already recorded for the opponent, so ``player_1`` sees ``player_0``'s
    move before choosing its own. Confirm this sequential (non-simultaneous)
    variant is the intended game.
    """
    metadata = {"name":"RockPaperScissorsEnv_v0"}

    # (player_0 move, player_1 move) pairs in which player_1 wins:
    # rock < paper, paper < scissors, scissors < rock.
    _PLAYER_1_WINS = ((0, 1), (1, 2), (2, 0))

    def __init__(self):
        super().__init__()
        self.possible_agents = ["player_0","player_1"]
        # Maps agent name -> index into possible_agents (used to find the opponent).
        self.agent_name_mapping = dict(zip(self.possible_agents,list(range(len(self.possible_agents)))))
        self._action_spaces = {agent: Discrete(3) for agent in self.possible_agents} #0 for Rock, 1 for Paper, 2 for Scissors
        # 0-2 for the opponent's move, 3 when the opponent has not moved yet.
        # Stored under the private name that observation_space() reads; the
        # public attribute is kept as an alias for existing callers.
        self._observation_spaces = {agent: Discrete(4) for agent in self.possible_agents}
        self.observation_spaces = self._observation_spaces
        # Moves played so far in the current round, keyed by agent name.
        self.agents_moves = {}

    def observe(self,agent):
        """Return the opponent's recorded move, or 3 if the opponent has not moved yet."""
        opponent = self.possible_agents[1-self.agent_name_mapping[agent]]
        return self.agents_moves.get(opponent,3)

    def reset(self,seed=None,options=None):
        """Start a new round and return ``(observation, info)`` for the first agent.

        ``seed`` and ``options`` are accepted for API compatibility but are not used.
        """
        self.agents = self.possible_agents[:]
        self.rewards = {agent:0 for agent in self.agents}
        self._cumulative_rewards = {agent:0 for agent in self.agents}
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent:{} for agent in self.agents}

        self.agents_moves.clear()

        self._agent_selector = agent_selector(self.agents)
        self.agent_selection = self._agent_selector.next()

        observation = self.observe(self.agent_selection)
        return observation, self.infos[self.agent_selection]

    def step(self,action):
        """Record the current agent's move; once both have moved, score the round.

        After the last agent moves, the winner gets +1, the loser -1 (0 each on
        a tie) and every agent is marked terminated. Stepping an agent that is
        already finished is delegated to ``_was_dead_step``.
        """
        current_agent = self.agent_selection

        if self.terminations[current_agent] or self.truncations[current_agent]:
            self._was_dead_step(action)
            return

        self.agents_moves[current_agent] = action

        if self._agent_selector.is_last(): #Resolve round if last agent has moved
            p0_move = self.agents_moves["player_0"]
            p1_move = self.agents_moves["player_1"]

            if p0_move == p1_move: #Tie
                p0_reward = 0
            elif (p0_move, p1_move) in self._PLAYER_1_WINS:
                p0_reward = -1
            else: #Not a tie and not a player_1 win, so player_0 wins
                p0_reward = 1

            # Zero-sum game: player_1 always receives the negated reward.
            self.rewards["player_0"] = p0_reward
            self.rewards["player_1"] = -p0_reward

            # The round is a single step, so the cumulative reward is just
            # this round's reward (copied directly instead of accumulated).
            self._cumulative_rewards = self.rewards.copy()
            self.terminations = {a: True for a in self.agents}

        self.agent_selection = self._agent_selector.next() #Switch to next agent

    def action_space(self,agent):
        """Return the action space (``Discrete(3)``) for ``agent``."""
        return self._action_spaces[agent]

    def observation_space(self,agent):
        """Return the observation space (``Discrete(4)``) for ``agent``."""
        return self._observation_spaces[agent]

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import torch.nn.functional as F

# Step size passed to the Adam optimizer of every DQNAgent.
LEARNING_RATE = 0.001
# Discount factor applied to the bootstrapped next-state value in DQNAgent.learn.
GAMMA = 0.99

# Number of training episodes run by the training loop.
NUM_EPISODES = 20000
# Epsilon-greedy exploration schedule: epsilon starts at EPSILON_START, is
# multiplied by EPSILON_DECAY on each decay_epsilon() call, and never drops
# below EPSILON_END.
EPSILON_START = 1
EPSILON_END = 0.01
EPSILON_DECAY = 0.9995

In [5]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise CPU.
gpu_ready = torch.cuda.is_available()
device = torch.device("cuda" if gpu_ready else "cpu")

if not gpu_ready:
    print("GPU not available, running on CPU.")
else:
    print("GPU is available!")
    print("Device Name:", torch.cuda.get_device_name(0))

print(f"Using device: {device}")

GPU is available!
Device Name: Tesla T4
Using device: cuda


In [6]:
class QNetwork(nn.Module):
    """Fully connected Q-value estimator with two 128-unit ReLU hidden layers.

    Args:
        input_size: Number of input features.
        output_size: Number of outputs (one Q-value per action).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_units = 128
        # Layers are created in forward order so parameter initialisation
        # draws from the RNG in the same sequence as the layer stack.
        layers = [
            nn.Linear(input_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_size),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        q_values = self.net(x)
        return q_values

class DQNAgent:
    """Epsilon-greedy Q-learning agent over a discrete, one-hot encoded state.

    Learns online from single transitions (no replay buffer or target network)
    using a QNetwork placed on the module-level ``device``.

    Args:
        input_size: Number of discrete states (width of the one-hot encoding).
        output_size: Number of discrete actions.
    """

    def __init__(self, input_size, output_size):
        self.action_size = output_size
        self.observation_size = input_size
        self.q_network = QNetwork(input_size=input_size, output_size=output_size).to(device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=LEARNING_RATE)
        self.loss_fn = nn.MSELoss()
        self.epsilon = EPSILON_START

    def _one_hot(self, state_index):
        """Encode one integer state as a ``(1, observation_size)`` float tensor on ``device``."""
        index = torch.tensor([state_index], dtype=torch.long)
        encoded = F.one_hot(index, num_classes=self.observation_size).float()
        return encoded.to(device)

    def choose_action(self, state):
        """Return a random action with probability ``epsilon``, else the greedy one."""
        explore = random.random() < self.epsilon
        if explore:
            return random.randint(0, self.action_size - 1)  # uniform random action

        with torch.no_grad():
            q_values = self.q_network(self._one_hot(state))
        return q_values.argmax().item()

    def learn(self, state, action, reward, next_state, done):
        """Run one gradient step on a single ``(s, a, r, s', done)`` transition.

        The target is ``r + GAMMA * max_a' Q(s', a')``; the bootstrap term is
        dropped when ``done`` is true.
        """
        state_vec = self._one_hot(state)
        next_state_vec = self._one_hot(next_state)
        action_idx = torch.tensor([action], dtype=torch.long, device=device)
        reward_val = torch.tensor([reward], dtype=torch.float32, device=device)
        done_flag = torch.tensor([done], dtype=torch.bool, device=device)

        # Q(s, a) for the action that was actually taken.
        all_q = self.q_network(state_vec)
        predicted_q = all_q.gather(1, action_idx.unsqueeze(1)).squeeze(1)

        # Bootstrapped target, computed without tracking gradients.
        with torch.no_grad():
            best_next_q = self.q_network(next_state_vec).max(dim=1).values
            target_q = reward_val + (~done_flag) * GAMMA * best_next_q

        loss = self.loss_fn(predicted_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``EPSILON_DECAY``, never going below ``EPSILON_END``."""
        decayed = self.epsilon * EPSILON_DECAY
        self.epsilon = decayed if decayed > EPSILON_END else EPSILON_END

In [7]:
!pip install tqdm



In [8]:
from tqdm.notebook import tqdm
import time

In [14]:
if __name__ == "__main__":
    # Number of episodes between progress reports. It is also the divisor for
    # the reported average reward, so the two can never drift apart.
    LOG_INTERVAL = 1000

    env = RockPaperScissorsEnv()
    # One independent DQN per player: 4 possible observations, 3 possible actions.
    agents = {agent_id: DQNAgent(input_size=4, output_size=3) for agent_id in env.possible_agents}
    # Reward summed over the current reporting window, per player.
    total_rewards = {agent_id: 0 for agent_id in env.possible_agents}

    print("Start training")
    progress_bar = tqdm(range(NUM_EPISODES), desc="Training Progress")

    for episode in progress_bar:
        observation, info = env.reset()
        # Last (state, action) chosen by each agent this episode, kept until
        # the round is resolved and the reward is known.
        experience_buffer = {}
        episode_rewards = {agent_id: 0 for agent_id in env.possible_agents}

        while env.agents:
            agent_id = env.agent_selection
            observation, reward, termination, truncation, info = env.last()

            if termination or truncation:
                # Round resolved: learn from the stored decision with the
                # final reward, then step with None to retire the agent.
                exp = experience_buffer[agent_id]
                agents[agent_id].learn(exp['state'], exp['action'], reward, observation, True)
                episode_rewards[agent_id] = reward
                env.step(None)
                continue

            current_agent = agents[agent_id]
            action = current_agent.choose_action(observation)
            experience_buffer[agent_id] = {"state": observation, "action": action}
            env.step(action)

        # Exploration is annealed once per episode for every agent.
        for agent in agents.values():
            agent.decay_epsilon()

        for agent_id in env.possible_agents:
            total_rewards[agent_id] += episode_rewards[agent_id]

        if (episode + 1) % LOG_INTERVAL == 0:
            avg_reward_p0 = total_rewards['player_0'] / LOG_INTERVAL
            avg_reward_p1 = total_rewards['player_1'] / LOG_INTERVAL
            progress_bar.set_description(f"Ep {episode+1} | P0 R: {avg_reward_p0:.3f} | P1 R: {avg_reward_p1:.3f}")
            # Start a fresh reporting window.
            total_rewards = {agent_id: 0 for agent_id in env.possible_agents}

Start training


Training Progress:   0%|          | 0/20000 [00:00<?, ?it/s]

In [19]:
from tqdm.notebook import tqdm

num_games = 10000

# Evaluate greedily: switch exploration off for every trained agent.
for agent in agents.values():
    agent.epsilon = 0

stats = {"player_0_wins": 0, "player_1_wins": 0, "ties": 0}

# (player_0 move, player_1 move) pairs won by player_1:
# rock vs paper, paper vs scissors, scissors vs rock.
player_1_winning_pairs = ((0, 1), (1, 2), (2, 0))

print("\n--- Starting Final Evaluation ---")
for _ in tqdm(range(num_games), desc="Evaluating"):
    observation, info = env.reset()
    moves = {}

    while env.agents:
        agent_id = env.agent_selection
        observation, reward, termination, truncation, info = env.last()

        if not (termination or truncation):
            action = agents[agent_id].choose_action(observation)
            moves[agent_id] = action
            env.step(action)
        else:
            # Finished agents are stepped with None until the env is empty.
            env.step(None)

    # Classify the round from the two recorded moves.
    matchup = (moves["player_0"], moves["player_1"])
    if matchup[0] == matchup[1]:
        outcome = "ties"
    elif matchup in player_1_winning_pairs:
        outcome = "player_1_wins"
    else:
        outcome = "player_0_wins"
    stats[outcome] += 1

print(f"\n--- Final Evaluation Results ({num_games} games) ---")
print(f"Player 0 Wins: {stats['player_0_wins']} ({stats['player_0_wins']/num_games:.2%})")
print(f"Player 1 Wins: {stats['player_1_wins']} ({stats['player_1_wins']/num_games:.2%})")
print(f"Ties: {stats['ties']} ({stats['ties']/num_games:.2%})")


--- Starting Final Evaluation ---


Evaluating:   0%|          | 0/10000 [00:00<?, ?it/s]


--- Final Evaluation Results (10000 games) ---
Player 0 Wins: 0 (0.00%)
Player 1 Wins: 10000 (100.00%)
Ties: 0 (0.00%)


In [20]:
import matplotlib.pyplot as plt
import numpy as np

# Inspect player_1's greedy policy: with epsilon at 0, choose_action always
# returns the argmax action for the given observation.
agent = agents['player_1']
agent.epsilon = 0

states = [0, 1, 2, 3]
state_labels = ["Opponent threw ROCK", "Opponent threw PAPER", "Opponent threw SCISSORS", "FIRST MOVE"]
move_names = ["Rock", "Paper", "Scissors"]

# Map each observation label to the move the agent picks for it.
policy = {
    label: move_names[agent.choose_action(state)]
    for state, label in zip(states, state_labels)
}

print("\n--- Agent's Deterministic Policy ---")
for situation, move in policy.items():
    print(f"If... {situation}, Then I will play... {move}")


--- Agent's Deterministic Policy ---
If... Opponent threw ROCK, Then I will play... Paper
If... Opponent threw PAPER, Then I will play... Scissors
If... Opponent threw SCISSORS, Then I will play... Rock
If... FIRST MOVE, Then I will play... Rock
