# Eval Seeds Investigation

Okay this is a bit of a sidetrack but we should have some sense of how evaluation sample sizes affect the results. In particular, these are the variables:
  1. Which program. We'll generate $P$ weighted random programs from the PCFG
  2. How many different values to seed the fallback policy with, $Q$
  3. For each fallback policy, how many different environment seeds to use, $R$

In [1]:
from typing import Iterable
from multiprocessing import Pool

import numpy as np
from tqdm import tqdm

from nsai_experiments.zoning_game.zg_cfg import ZONING_GAME_GRAMMAR, generate_one_probabilistic, format_ruleset
from eval_seeds_investigation_helper import _my_evaluate_policies_for_seed
from nsai_experiments.zoning_game.zg_policy import create_policy_indiv_greedy
from nsai_experiments.zoning_game.zg_gym import ZoningGameEnv

In [2]:
def my_evaluate_ruleset(ruleset, policy_seeds, env_seeds):
    env = ZoningGameEnv()

    env_seeds_is_2d = isinstance(env_seeds[0], Iterable)
    if env_seeds_is_2d:
        if not len(env_seeds) == len(policy_seeds):
            raise ValueError("If env_seeds is 2D, its length must match that of policy_seeds")

    with Pool() as pool:
        results = pool.starmap(
            _my_evaluate_policies_for_seed,
            [(policy_seed, ruleset, create_policy_indiv_greedy, env_seeds[i] if env_seeds_is_2d else env_seeds, env, None)
             for (i, policy_seed) in enumerate(policy_seeds)]
        )
    
    flat_scores = [score for sublist in results for score in sublist[0]]
    return np.array(flat_scores, dtype=np.float32)


In [3]:
def simple_evaluate_rulesets(programs, seed, n):
    policy_seeds = list(range(seed, seed + n))
    env_seeds = list(range(seed + 1000, seed + 1000 + n))

    all_scores = []
    for program in tqdm(programs):
        scores = my_evaluate_ruleset(program, policy_seeds, env_seeds)
        all_scores.append(scores)
    return all_scores

In [4]:
n_programs = 30
pbase = 10_000
n_sqrt_per_program = 30
result_seed = 47

programs = [format_ruleset(generate_one_probabilistic(ZONING_GAME_GRAMMAR, seed=pbase + i)) for i in range(n_programs)]
programs[3]

'COMMERCIAL must be_within 2 tiles_of COMMERCIAL ;\nDOWNTOWN must ( be_within 1 tiles_of PARK and ( ( form_fewer_than 2 separate_clusters and ( not be_within 5 tiles_of BOARD_HORIZONTAL_MEDIAN ) ) and ( not ( not form_cluster_with_fewer_than 5 tiles ) ) ) ) ;\nRESIDENTIAL must be_within 1 tiles_of BOARD_CENTER ;\n'

In [5]:
results = simple_evaluate_rulesets(programs, result_seed, n_sqrt_per_program)

100%|██████████| 30/30 [03:26<00:00,  6.88s/it]


In [6]:
sem_ns = [1, 3, 5, 10, 25, 50, 75, 100, 200, 400, 900]

for i, scores in enumerate(results):
    print(f"Program {i}: mean = {np.mean(scores):.3f}, std = {np.std(scores):.3f}, n = {len(scores)}")
    sems = [np.std(scores, ddof=1) / np.sqrt(sem_n) for sem_n in sem_ns]
    print("\tSEMs:", " ".join(f"{sem_n}: {sem:.2f}" for sem_n, sem in zip(sem_ns, sems)))

avg_sems = np.mean([[np.std(scores, ddof=1) / np.sqrt(sem_n) for sem_n in sem_ns] for scores in results], axis=0)
print("\nAverage SEMs across all programs:\n\t", " ".join(f"{sem_n}: {sem:.2f}" for sem_n, sem in zip(sem_ns, avg_sems)))

avg_sems_pct = np.mean([[(np.std(scores, ddof=1) / np.sqrt(sem_n)) / np.mean(scores) for sem_n in sem_ns] for scores in results], axis=0)
print("Average SEMs as percentage of mean across all programs:\n\t", " ".join(f"{sem_n}: {sem_pct:.2%}" for sem_n, sem_pct in zip(sem_ns, avg_sems_pct)))

Program 0: mean = 61.999, std = 27.141, n = 900
	SEMs: 1: 27.16 3: 15.68 5: 12.14 10: 8.59 25: 5.43 50: 3.84 75: 3.14 100: 2.72 200: 1.92 400: 1.36 900: 0.91
Program 1: mean = 64.790, std = 26.695, n = 900
	SEMs: 1: 26.71 3: 15.42 5: 11.94 10: 8.45 25: 5.34 50: 3.78 75: 3.08 100: 2.67 200: 1.89 400: 1.34 900: 0.89
Program 2: mean = 59.811, std = 26.779, n = 900
	SEMs: 1: 26.79 3: 15.47 5: 11.98 10: 8.47 25: 5.36 50: 3.79 75: 3.09 100: 2.68 200: 1.89 400: 1.34 900: 0.89
Program 3: mean = 60.127, std = 27.093, n = 900
	SEMs: 1: 27.11 3: 15.65 5: 12.12 10: 8.57 25: 5.42 50: 3.83 75: 3.13 100: 2.71 200: 1.92 400: 1.36 900: 0.90
Program 4: mean = 65.519, std = 26.929, n = 900
	SEMs: 1: 26.94 3: 15.56 5: 12.05 10: 8.52 25: 5.39 50: 3.81 75: 3.11 100: 2.69 200: 1.91 400: 1.35 900: 0.90
Program 5: mean = 64.282, std = 26.577, n = 900
	SEMs: 1: 26.59 3: 15.35 5: 11.89 10: 8.41 25: 5.32 50: 3.76 75: 3.07 100: 2.66 200: 1.88 400: 1.33 900: 0.89
Program 6: mean = 65.519, std = 26.929, n = 900
	SEM