In [1]:
import os, sys

# find repo root (looks for liars_poker/ or pyproject.toml)
def find_repo_root(start_dir: str, max_depth: int = 6) -> str:
    """Locate the repository root by walking upward from ``start_dir``.

    A directory is accepted as the root when it contains a ``liars_poker/``
    directory or a ``pyproject.toml`` file.

    Args:
        start_dir: Directory to start the search from (made absolute first).
        max_depth: How many directories to inspect, counting ``start_dir``
            itself. Defaults to 6.

    Returns:
        The absolute path of the first directory that matches. If nothing
        matches within ``max_depth`` levels, or the filesystem root is reached
        first, the absolute path two levels above ``start_dir`` is returned
        as a best-effort fallback.
    """
    cur = os.path.abspath(start_dir)
    for _ in range(max_depth):
        has_package = os.path.isdir(os.path.join(cur, "liars_poker"))
        has_pyproject = os.path.exists(os.path.join(cur, "pyproject.toml"))
        if has_package or has_pyproject:
            return cur
        parent = os.path.dirname(cur)
        if parent == cur:
            # dirname() of the filesystem root is itself: nowhere left to climb
            break
        cur = parent
    # No marker found: fall back to two levels above the starting directory.
    return os.path.abspath(os.path.join(start_dir, "..", ".."))

# Resolve the repository root from the notebook's working directory and put it
# on sys.path (presumably so the `liars_poker` package imports from this checkout).
NB_DIR = os.getcwd()
REPO_ROOT = find_repo_root(NB_DIR)
if REPO_ROOT not in sys.path:
    # insert at the front so this checkout is searched before other entries
    sys.path.insert(0, REPO_ROOT)

# <repo>/artifacts is created up front; exist_ok makes re-running the cell safe.
ARTIFACTS_ROOT = os.path.join(REPO_ROOT, "artifacts")
os.makedirs(ARTIFACTS_ROOT, exist_ok=True)

# Echo the resolved paths so a wrong root is noticed immediately.
print("repo root   :", REPO_ROOT)
print("artifacts   :", ARTIFACTS_ROOT)


repo root   : /root/liars_poker
artifacts   : /root/liars_poker/artifacts


In [2]:
import random
from pprint import pprint

from liars_poker import (
    GameSpec, Env, InfoSet, Rules,
    best_response_mc,
    Policy, TabularPolicy, CommitOnceMixture, RandomPolicy,
    eval_both_seats
)

from liars_poker.algo.br_mc import efficient_best_response_mc_v3, efficient_best_response_mc_v2
from typing import List, Tuple

# Seed Python's global `random` module for this session.
SEED = 42
random.seed(SEED)

# small game; P1 always starts by design
# NOTE(review): parameter meanings (4 ranks, 2 suits, 2-card hands, two claim
# kinds, suit symmetry on) are read off the keyword names — confirm against GameSpec.
spec = GameSpec(ranks=4, suits=2, hand_size=2, claim_kinds=("RankHigh", "Pair"), suit_symmetry=True)
# Rules object built from the spec; policies are bound to it in later cells.
rules = Rules(spec)


In [3]:
def flatten_commit_once(policy: Policy) -> List[Tuple[Policy, float]]:
    """Return ``policy`` as a list of ``(component, weight)`` pairs.

    A ``CommitOnceMixture`` is expanded into its ``policies`` paired with its
    ``weights``; any other policy becomes a single pair with weight 1.0.
    """
    if not isinstance(policy, CommitOnceMixture):
        return [(policy, 1.0)]
    return [(component, weight) for component, weight in zip(policy.policies, policy.weights)]

def mix_policies(base_policy: Policy, br_policy: Policy, eta: float, rng: random.Random | None = None) -> CommitOnceMixture:
    """Blend two policies into a single flat ``CommitOnceMixture``.

    Every component of ``base_policy`` has its weight scaled by ``1 - eta`` and
    every component of ``br_policy`` has its weight scaled by ``eta``. If either
    argument is itself a ``CommitOnceMixture`` its components are spliced in
    directly, so the result is never a mixture of mixtures.

    Args:
        base_policy: Policy (or mixture) that keeps ``1 - eta`` of the mass.
        br_policy: Policy (or mixture) that receives ``eta`` of the mass.
        eta: Mixing coefficient applied to ``br_policy``.
        rng: Optional random source forwarded to ``CommitOnceMixture``.

    Returns:
        The new mixture, bound to the rules object held by ``base_policy``.
    """
    keep = 1.0 - eta

    # (component, rescaled weight) pairs: base components first, then the BR's.
    weighted_parts = [
        (member, keep * share) for member, share in flatten_commit_once(base_policy)
    ]
    weighted_parts += [
        (member, eta * share) for member, share in flatten_commit_once(br_policy)
    ]

    members = [member for member, _ in weighted_parts]
    shares = [share for _, share in weighted_parts]

    blended = CommitOnceMixture(members, shares, rng=rng)
    # Reuse the rules the base policy is already bound to.
    blended.bind_rules(base_policy._rules)
    return blended

In [4]:
# Starting point of the averaging loop: a RandomPolicy bound to this game's rules.
a0 = RandomPolicy()
a0.bind_rules(rules=rules)

# a0.store_efficiently('/root/liars_poker/artifacts/runs/run_temp')

In [5]:
# Run history, appended to by the training loop:
#   all_averages[k] is the average policy after k mixing steps (index 0 = a0)
#   all_brs holds the best responses trained along the way
all_averages = [a0]
all_brs = []

# The current average policy; the training loop replaces it after every mix.
curr_av = a0

In [6]:
# Disabled benchmark: times the three best-response implementations against a0
# (10k training episodes each) and evaluates each result over 1000 episodes.
# Uncomment to re-run the comparison.

# import time

# start = time.time()
# br1 = best_response_mc(spec, a0, episodes=10_000)
# end = time.time()
# print(f"best_response_mc took {end - start:.3f} seconds")
# print(eval_both_seats(spec, br1, a0, 1000, seed=0))

# start = time.time()
# br2 = efficient_best_response_mc_v2(spec, a0, episodes=10_000)
# end = time.time()
# print(f"efficient_best_response_mc_v2 took {end - start:.3f} seconds")
# print(eval_both_seats(spec, br2, a0, 1000, seed=0))

# start = time.time()
# br3 = efficient_best_response_mc_v3(spec, a0, episodes=10_000)
# end = time.time()
# print(f"efficient_best_response_mc_v3 took {end - start:.3f} seconds")
# print(eval_both_seats(spec, br3, a0, 1000, seed=0))





In [7]:
import math 

# --- Iterated best-response averaging ---------------------------------------
# Each iteration trains a Monte-Carlo best response b_i against the current
# average policy, measures how it fares against that average, then mixes it
# into the average with step size eta = 1 / (i + 2). With that step size every
# component of the mixture ends up with equal weight.
#
# The iteration index is derived from len(all_averages) rather than always
# starting at 0, so re-running this cell after an interrupt resumes the
# eta / seed schedule. Restarting at i = 0 would use eta = 1/2 and hand half of
# an already well-averaged mixture to a single new best response. On a fresh
# kernel (all_averages == [a0]) the loop is exactly range(200).
# NOTE: the episode budget still restarts at 100_000 each time the cell is run.
episodes = 100_000
last_exploitablity = 1

# Re-sync with the recorded history in case a previous run was interrupted.
curr_av = all_averages[-1]

for i in range(len(all_averages) - 1, 200):
    print(i)
    # eta = min(math.sqrt(1/(i+2)), 0.5)
    eta = 1 / (i+2)

    # Once the best response's share of evaluation games drops below 0.65,
    # grow the training budget by 1.5x per iteration, capped at 5M episodes.
    if last_exploitablity < 0.65:
        episodes = min(int(episodes*1.5), 5_000_000)
    # elif last_exploitablity > 0.6:
    #     episodes = int(episodes*0.8)
    #     episodes = max(100, episodes)

    print(episodes)

    # Alternative implementations (efficient_best_response_mc_v2, best_response_mc)
    # were previously swapped in here with the same keyword arguments.
    b_i = efficient_best_response_mc_v3(spec=spec, opponent=curr_av, episodes=episodes, epsilon=0.1, min_visits_per_action=0, annotate='none', seed=i)
    eval_results = eval_both_seats(spec, b_i, curr_av, episodes=10_000)

    # Fraction of evaluation games credited to 'A' — presumably the first
    # policy passed (b_i); confirm against eval_both_seats. Used as a proxy
    # for how exploitable the current average is.
    last_exploitablity = eval_results['A'] / eval_results['total']
    print(last_exploitablity)
    print()

    # Build the new average first, then commit all bookkeeping together so an
    # interrupt cannot leave all_brs / all_averages / curr_av out of step.
    new_av = mix_policies(curr_av, b_i, eta)
    all_brs.append(b_i)
    all_averages.append(new_av)
    curr_av = new_av




0
100000


0.8056

1
100000
0.7749

2
100000
0.7293

3
100000
0.6994

4
100000
0.695

5
100000
0.6607

6
100000
0.6788

7
100000
0.653

8
100000
0.6523

9
100000
0.6342

10
150000
0.6281

11
225000
0.6375

12
337500
0.6389

13
506250
0.6449

14
759375
0.6328

15
1139062
0.6292

16
1708593
0.6308

17
2562889
0.6249

18
3844333
0.6162

19
5000000
0.6171

20
5000000
0.6083

21
5000000
0.602

22
5000000
0.5975

23
5000000
0.5936

24
5000000
0.5931

25
5000000
0.5781

26
5000000
0.5805

27
5000000
0.5781

28
5000000
0.5709

29
5000000
0.5762

30
5000000
0.5709

31
5000000
0.568

32
5000000
0.5636

33
5000000
0.563

34
5000000
0.5681

35
5000000
0.5638

36
5000000
0.5541

37
5000000
0.5503

38
5000000
0.549

39
5000000
0.5525

40
5000000
0.5481

41
5000000
0.5491

42
5000000
0.5455

43
5000000
0.5449

44
5000000
0.5441

45
5000000
0.5483

46
5000000
0.5496

47
5000000
0.5396

48
5000000


KeyboardInterrupt: 

In [None]:
# Inspect the averaged policy's action distribution at one information set.
# NOTE(review): spec was built with hand_size=2 but this hand holds a single
# card — confirm the InfoSet encoding. A perfectly uniform distribution here
# may just be a fallback for an infoset the policy has never seen.
iset = InfoSet(pid=1, hand=(2, ), history=(0, ))

print(curr_av.prob_dist_at_infoset(iset))




{-1: 0.08333333333333333, 1: 0.08333333333333333, 2: 0.08333333333333333, 3: 0.08333333333333333, 4: 0.08333333333333333, 5: 0.08333333333333333, 6: 0.08333333333333333, 7: 0.08333333333333333, 8: 0.08333333333333333, 9: 0.08333333333333333, 10: 0.08333333333333333, 11: 0.08333333333333333}


In [None]:
# Persist the current average policy under the repo's artifacts directory.
# The path is built from ARTIFACTS_ROOT (set in the first cell) instead of a
# hard-coded absolute path, so the notebook also works from checkouts that do
# not live at /root/liars_poker.
curr_av.store_efficiently(os.path.join(ARTIFACTS_ROOT, 'runs', 'run_temp_18'))

In [15]:
# Mixture weights of the current average policy. With eta = 1/(i+2) at step i,
# every component should carry (approximately) the same weight.
curr_av.weights

[0.020408163265306114,
 0.020408163265306114,
 0.020408163265306114,
 0.020408163265306114,
 0.020408163265306114,
 0.020408163265306114,
 0.020408163265306114,
 0.02040816326530611,
 0.02040816326530611,
 0.020408163265306114,
 0.020408163265306114,
 0.020408163265306114,
 0.020408163265306114,
 0.020408163265306114,
 0.020408163265306097,
 0.02040816326530611,
 0.02040816326530611,
 0.020408163265306114,
 0.020408163265306114,
 0.020408163265306103,
 0.020408163265306114,
 0.020408163265306097,
 0.020408163265306114,
 0.020408163265306103,
 0.020408163265306103,
 0.02040816326530612,
 0.020408163265306103,
 0.02040816326530611,
 0.020408163265306103,
 0.02040816326530611,
 0.020408163265306103,
 0.020408163265306103,
 0.020408163265306103,
 0.020408163265306103,
 0.02040816326530611,
 0.020408163265306114,
 0.020408163265306114,
 0.020408163265306103,
 0.020408163265306097,
 0.020408163265306114,
 0.02040816326530612,
 0.02040816326530612,
 0.02040816326530612,
 0.02040816326530611,


In [10]:
# Train one more best response against the final average policy (1M episodes)
# to gauge how exploitable the result still is.
# NOTE(review): no seed is passed here, unlike the calls in the training loop —
# confirm whether the default makes this reproducible.
b_final = efficient_best_response_mc_v3(spec, curr_av, episodes=1_000_000)

In [16]:
# Head-to-head result of the final best response against the average policy
# over 10k evaluation episodes.
eval_both_seats(spec, b_final, curr_av, episodes=10_000)

{'A': 5483, 'B': 4517, 'total': 10000}