In [8]:
import rlcard
from rlcard.envs import make
from rlcard.games.nolimitholdem import Judger
from rlcard.games.limitholdem.utils import Hand
import collections
import numpy as np
import random

In [9]:
class PokerRLAgent:
    """Holds a two-player no-limit hold'em environment plus lookup tables.

    The tables are plain ``defaultdict`` containers; nothing in this class
    fills them, so their key layout is decided by the calling code.
    """

    def __init__(self):
        # Environment owned by this agent.
        self.env = make('no-limit-holdem', config={'game_num_players': 2})

        # Seat id stored for the agent; fixed to 0 rather than taken from reset().
        self.player_id = 0

        # Reset exactly once. Previously the environment was reset twice and
        # the first result was thrown away.
        # NOTE: `state` intentionally keeps the whole (state, player_id) pair
        # returned by reset(), because callers index it as `agent.state[0]`.
        self.state = self.env.reset()

        # key -> float, missing keys read as 0.0.
        self.rewards = collections.defaultdict(float)
        # key -> Counter; presumably transition counts -- confirm against the
        # code that populates it.
        self.transits = collections.defaultdict(
            collections.Counter)
        # key -> float, missing keys read as 0.0.
        self.values = collections.defaultdict(float)

In [10]:
# resolves unhashable objects to hashable ones for rewards, transits and values

def make_hashable(obj):
    """Recursively convert ``obj`` into a hashable equivalent.

    Conversion rules:
      * list / tuple  -> tuple of converted items
      * dict          -> frozenset of (converted key, converted value) pairs
      * numpy.ndarray -> flat tuple of its elements (the shape is NOT kept,
        so arrays with the same elements but different shapes collide)
      * anything else -> returned unchanged

    Tuples are walked as well as lists, so a tuple that contains a list,
    dict or array also becomes hashable instead of being returned as-is.

    Args:
        obj: arbitrary, possibly nested, Python object.

    Returns:
        A hashable object usable as a dict key, provided every leaf value is
        itself hashable.
    """
    if isinstance(obj, (list, tuple)):
        return tuple(make_hashable(item) for item in obj)
    if isinstance(obj, dict):
        return frozenset((make_hashable(k), make_hashable(v)) for k, v in obj.items())
    if isinstance(obj, np.ndarray):
        return tuple(obj.flatten())  # or obj.tobytes() if you want strict matching
    return obj

def unhash(obj):
    """Best-effort reversal of a hashable tuple / frozenset structure.

    The reconstruction is heuristic and lossy:
      * a tuple whose items are all numbers (Python ``int``/``float`` or
        numpy numeric scalars) becomes a 1-D ``numpy.ndarray``; note that
        this includes the empty tuple;
      * any other tuple becomes a list of recursively converted items;
      * a frozenset of ``(key, value)`` pairs becomes a dict;
      * everything else is returned unchanged.

    Dict keys are kept exactly as stored. Keys must stay hashable, and
    converting a tuple key into a list or array would make building the
    dict fail with ``TypeError``.

    Args:
        obj: value built from tuples, frozensets and leaf values.

    Returns:
        The reconstructed list / dict / array structure, or ``obj`` itself.
    """
    if isinstance(obj, tuple):
        # Heuristic: an all-numeric tuple is treated as a flattened array.
        # np.number is included because numpy integer scalars are not
        # subclasses of Python's int.
        if all(isinstance(x, (int, float, np.number)) for x in obj):
            return np.array(obj)
        return [unhash(x) for x in obj]
    if isinstance(obj, frozenset):
        return {k: unhash(v) for k, v in obj}
    return obj

In [11]:
# The agent builds its own internal environment; `env` below is a second,
# independent two-player no-limit hold'em environment.
agent = PokerRLAgent()

holdem_config = {'game_num_players': 2}
env = make('no-limit-holdem', config=holdem_config)

In [12]:
# Play one hand in `env` with every seat acting uniformly at random, and log
# the [observation, action] pairs of a single tracked seat.
state, player_id = env.reset()
sequence = []

# NOTE(review): the tracked seat is read from the agent's *own* environment
# state rather than from `env` -- confirm this is the seat that was intended.
currentplayer = agent.state[0]['raw_obs']['current_player']

done = False
while not done:
    # Decide before stepping whether this move belongs to the tracked seat.
    tracked_turn = state['raw_obs']['current_player'] == currentplayer
    if tracked_turn:
        oldstate = state['obs']

    # Both seats pick a random legal action; only the logging differs.
    action = random.choice(state['raw_legal_actions'])
    state, player_id = env.step(action)

    if tracked_turn:
        sequence.append([oldstate, action])

    done = env.is_over()

# Store the tracked seat's payoff under the hashable form of its whole
# observation/action history.
payoffs = env.get_payoffs()
key = make_hashable(sequence)
agent.rewards[key] = payoffs[currentplayer]


In [14]:
def compute_q_values_from_trajectory(trajectory, gamma=0.99):
    """Compute the discounted return-to-go for every step of a trajectory.

    The return is accumulated backwards from the final step using
    ``G_t = reward_t + gamma * G_{t+1}``, with ``G = 0`` after the last step.

    Args:
        trajectory: sequence of ``(state, action, reward)`` triples in
            chronological order.
        gamma: discount factor applied once per step.

    Returns:
        A list of ``(state, action, G_t)`` triples in the same order as
        ``trajectory``. An empty trajectory gives an empty list.
    """
    returns_backwards = []
    G = 0
    for state, action, reward in reversed(trajectory):
        G = reward + gamma * G
        returns_backwards.append((state, action, G))
    # Append-then-reverse is linear; prepending with insert(0, ...) on every
    # step was quadratic in the trajectory length.
    returns_backwards.reverse()
    return returns_backwards

In [16]:
# Toy three-step episode, assembled column-wise: states, actions, rewards.
trajectory = list(zip(
    ("s0", "s1", "s2"),
    ("a0", "a1", "a2"),
    (-20.0, -20.0, 40.0),
))

# gamma=1.0 means no discounting, so each value is the plain sum of the
# rewards from that step onward. Left as the last expression so the cell
# displays the result.
compute_q_values_from_trajectory(trajectory, gamma=1.0)

[('s0', 'a0', 0.0), ('s1', 'a1', 20.0), ('s2', 'a2', 40.0)]