In [2]:
import numpy as np
import random
from collections import defaultdict
from pettingzoo.classic import texas_holdem_v4

In [3]:
# Create Texas Hold'em environment
env = texas_holdem_v4.env(num_players=6)


In [4]:
def policy_epsilon_greedy(Q, state, actions, epsilon=0.1):
    state = tuple(state.items()) if isinstance(state, dict) else state  # Convert dict to tuple
    if random.uniform(0, 1) < epsilon:
        return random.choice(actions)  # Explore
    return max(actions, key=lambda a: Q[state][a])  # Exploit

In [5]:
# Monte Carlo with Exploring Starts
class MCAgent:
    def __init__(self, epsilon=0.1, gamma=1.0):
        self.Q = defaultdict(lambda: defaultdict(float))  # Q[state][action] = value
        self.returns = defaultdict(lambda: defaultdict(list))  # Track returns
        self.epsilon = epsilon
        self.gamma = gamma  # Discount factor

    def update_policy(self, episode):
        G = 0
        visited_states = set()
        for state, action, reward in reversed(episode):
            state = tuple(state.items()) if isinstance(state, dict) else state  # Convert dict to tuple
            G = self.gamma * G + reward
            if (state, action) not in visited_states:
                self.returns[state][action].append(G)
                self.Q[state][action] = np.mean(self.returns[state][action])
                visited_states.add((state, action))

    def train(self, num_episodes=10000):
        for episode_num in range(num_episodes):
            env.reset()
            done = False
            episode = []
            for agent in env.agent_iter():
                state, reward, termination, truncation, _ = env.last()
                state = tuple(state.items()) if isinstance(state, dict) else state  # Convert dict to tuple
                done = termination or truncation
                available_actions = list(range(env.action_space(agent).n))
                action = policy_epsilon_greedy(self.Q, state, available_actions, self.epsilon) if not done else None
                env.step(action)
                episode.append((state, action, reward))
                if done:
                    break
            self.update_policy(episode)

    def play(self):
        env.reset()
        done = False
        for agent in env.agent_iter():
            state, _, termination, truncation, _ = env.last()
            state = tuple(state.items()) if isinstance(state, dict) else state  # Convert dict to tuple
            done = termination or truncation
            if done:
                break
            action = max(self.Q[state], key=self.Q[state].get, default=random.choice(range(env.action_space(agent).n)))
            env.step(action)

In [6]:
agent = MCAgent()

In [None]:
import numpy as np
import random

# Actions
actions = ["rock", "paper", "scissors"]
num_actions = len(actions)

# Q-table (3x3, initialized to 0)
Q_table = np.zeros((num_actions, num_actions))

# Learning parameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate

# Reward function
def get_reward(agent_action, opponent_action):
    if agent_action == opponent_action:
        return 0  # Draw
    elif (agent_action - opponent_action) % 3 == 1:
        return 1  # Win
    else:
        return -1  # Lose

# Training loop
episodes = 1000
for _ in range(episodes):
    # Agent chooses action (exploration-exploitation)
    if random.random() < epsilon:
        agent_action = random.randint(0, 2)  # Explore
    else:
        agent_action = np.argmax(Q_table[:, 0])  # Exploit best known action

    # Opponent (random strategy)
    opponent_action = 2

    # Get reward
    reward = get_reward(agent_action, opponent_action)

    # Q-value update (Q-learning formula)
    best_future_q = np.max(Q_table[opponent_action])
    Q_table[agent_action, opponent_action] += alpha * (reward + gamma * np.max(Q_table[opponent_action]) - Q_table[agent_action, opponent_action])

# Test the trained agent
print("Final Q-table:\n", Q_table)

# Making a decision based on learned Q-table
best_action = np.argmax(Q_table[:, 0])
print(f"Best action learned: {actions[best_action]}")
Q_table[:,0]


Final Q-table:
 [[ 0.          0.          1.        ]
 [ 0.          0.         -0.97972444]
 [ 0.          0.          0.        ]]
Best action learned: rock


array([0., 0., 0.])