In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns
sns.set()

%matplotlib inline
%config InlineBackend.figure_format="retina"

In [27]:
np.set_printoptions(suppress=True)

Based on *An Introduction to Counterfactual Regret Minimization* (Neller, Lanctot)

- `0`: no reward
- `1`: positive reward
- `-1`: negative reward

In [36]:
# Actions
ROCK, PAPER, SCISSORS = 0, 1, 2
num_actions = 3

# Stategy
opp_strategy = [0.4, 0.3, 0.3]

# Normal form (Payoff)
payoff = np.array([
    [[ 0,  0], [-1,  0], [ 1, -1]],
    [[ 1, -1], [ 0,  0], [-1,  1]],
    [[-1,  1], [ 1, -1], [ 0,  0]],
])

def value(x, y):
    return payoff[x, y, 0]

In [34]:
# Examples
print("Rock vs Rock    :", value(ROCK, ROCK))
print("Paper vs Rock   :", value(PAPER, ROCK))
print("Scissors vs Rock:", value(SCISSORS, ROCK))

Rock vs Rock    : 0
Paper vs Rock   : 1
Scissors vs Rock: -1


In [3]:
def normalize(strategy):
    normalizing_sum = np.sum(strategy)
    if normalizing_sum > 0:
        return strategy / normalizing_sum
    else:
        return np.ones(strategy.shape[0]) / strategy.shape[0]

def get_strategy(regret_sum):
    # remove negative regrets
    strategy = np.maximum(regret_sum, 0)
    return normalize(strategy)

def avg_strategy(strategy_sum):
    return normalize(strategy)

def get_action(strategy):
    strategy = strategy / np.sum(strategy)
    return np.searchsorted(np.cumsum(strategy), random.random())

In [35]:
# Training one player
action_utility = np.zeros(num_actions, dtype=float)
regret_sum     = np.zeros(num_actions, dtype=float)
strategy_sum   = np.zeros(num_actions, dtype=float)
random.seed(None)
for _ in range(100_000):
    # Get regret-matched mixed-strategy actions
    strategy = get_strategy(regret_sum)
    strategy_sum += strategy
    
    # Play game
    my_action  = get_action(strategy)
    opp_action = get_action(opp_strategy)
    
    # Compute action utilities (for rock paper scissors)
    action_utility = payoff[:, opp_action, 0]
    
    # Accumulate action regrets
    regret_sum += action_utility - action_utility[my_action]
avg_strategy(regret_sum)

array([1., 0., 0.])