In [1]:
import numpy as np
import random
from collections import defaultdict
from pettingzoo.classic import texas_holdem_v4

In [2]:
# Create Texas Hold'em environment
env = texas_holdem_v4.env(num_players=6)


In [3]:
def policy_epsilon_greedy(Q, state, actions, epsilon=0.1):
    state = tuple(state.items()) if isinstance(state, dict) else state  # Convert dict to tuple
    if random.uniform(0, 1) < epsilon:
        return random.choice(actions)  # Explore
    return max(actions, key=lambda a: Q[state][a])  # Exploit

In [4]:
# Monte Carlo with Exploring Starts
class MCAgent:
    def __init__(self, epsilon=0.1, gamma=1.0):
        self.Q = defaultdict(lambda: defaultdict(float))  # Q[state][action] = value
        self.returns = defaultdict(lambda: defaultdict(list))  # Track returns
        self.epsilon = epsilon
        self.gamma = gamma  # Discount factor

    def update_policy(self, episode):
        G = 0
        visited_states = set()
        for state, action, reward in reversed(episode):
            state = tuple(state.items()) if isinstance(state, dict) else state  # Convert dict to tuple
            G = self.gamma * G + reward
            if (state, action) not in visited_states:
                self.returns[state][action].append(G)
                self.Q[state][action] = np.mean(self.returns[state][action])
                visited_states.add((state, action))

    def train(self, num_episodes=10000):
        for episode_num in range(num_episodes):
            env.reset()
            done = False
            episode = []
            for agent in env.agent_iter():
                state, reward, termination, truncation, _ = env.last()
                state = tuple(state.items()) if isinstance(state, dict) else state  # Convert dict to tuple
                done = termination or truncation
                available_actions = list(range(env.action_space(agent).n))
                action = policy_epsilon_greedy(self.Q, state, available_actions, self.epsilon) if not done else None
                env.step(action)
                episode.append((state, action, reward))
                if done:
                    break
            self.update_policy(episode)

    def play(self):
        env.reset()
        done = False
        for agent in env.agent_iter():
            state, _, termination, truncation, _ = env.last()
            state = tuple(state.items()) if isinstance(state, dict) else state  # Convert dict to tuple
            done = termination or truncation
            if done:
                break
            action = max(self.Q[state], key=self.Q[state].get, default=random.choice(range(env.action_space(agent).n)))
            env.step(action)

In [5]:
agent = MCAgent()

In [6]:
agent.train(10000)

TypeError: unhashable type: 'numpy.ndarray'