# 导入必要库

In [1]:
from pathlib import Path
from datetime import datetime, timezone
import os

from core import RLEnv
from core import BaseAgent
from core.schemas import PiecewizeMethod  # noqa
from greedy import EpsilonDecreasingConfig, GreedyAgent, GreedyAlgorithm, GreedyType
from ucb1 import UCBAgent, UCB1Algorithm
from thompson_sampling import TSAgent, TSAlgorithm
from shemas import DynamicMethod

from train import batch_train
from utils import (
    plot_metrics_history,
    save_experiment_data,
    ProcessDataLogger,
    clear_var,
    ExperimentMeta,
)

In [2]:
# ---------------------------------------------------------------------------
# Experiment-wide configuration shared by every run in this notebook.
# ---------------------------------------------------------------------------

# Training budget and parallelism.
STEPS: int = 100_000
RUN_COUNT: int = 500
NUM_WORKERS: int = 10
GRID_SIZE: int = 1000  # handed to ProcessDataLogger as ``grid_size``

# Seed reused for both the environment and the agents, and the arm count.
SEED: int = 42
MACHINE_COUNT: int = 10

# Convergence detection settings forwarded to every agent.
CONVERGENCE_THRESHOLD: float = 0.9
CONVERGENCE_MIN_STEPS: int = 100

# Optimistic initialisation, consumed by the greedy agent only.
ENABLE_OPTIMISTIC: bool = True
OPTIMISTIC_TIMES: int = 1

# Per-algorithm hyper-parameters.
CONSTANT_STEPSIZE: float = 0.1
UCB_CONSTANT_STEPSIZE: float = 0.1
# NOTE(review): the Thompson Sampling run recorded in this notebook stops with
# ``ValueError: a <= 0``. A discount factor this small may be driving the
# posterior parameters to zero -- confirm against TSAlgorithm before reuse.
DISCOUNT_FACTOR: float = 0.1

# Output location; DATE (UTC) names the per-session sub-directory.
EXPERIMENT_DATA_DIR: Path = Path.cwd() / "experiment_data"
DATE = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

# Non-stationary environment settings.
DYNAMIC_METHOD: DynamicMethod = DynamicMethod.PIECEWISE
PIECEWISE_INTERNAL: int = 10_000
PIECEWISE_METHOD: PiecewizeMethod = PiecewizeMethod.UPSIDE_DOWN
RANDOM_WALK_INTERNAL: int = 1
RANDOM_WALK_MACHINE_NUM: int = 1

# Template environment; each run below works on ``ENV.clone()``.
# Declared once so the annotation is not repeated in every branch.
ENV: RLEnv
if DYNAMIC_METHOD == DynamicMethod.PIECEWISE:
    ENV = RLEnv(
        machine_count=MACHINE_COUNT,
        piecewise_internal=PIECEWISE_INTERNAL,
        piecewize_method=PIECEWISE_METHOD,
        seed=SEED,
    )
elif DYNAMIC_METHOD == DynamicMethod.RANDOM_WALK:
    ENV = RLEnv(
        machine_count=MACHINE_COUNT,
        random_walk_internal=RANDOM_WALK_INTERNAL,
        random_walk_machine_num=RANDOM_WALK_MACHINE_NUM,
        seed=SEED,
    )
else:
    # Only the two dynamic methods above are supported by this notebook.
    raise ValueError(f"Invalid dynamic method: {DYNAMIC_METHOD}")

# Default-constructed epsilon schedule passed to the greedy agent factory.
EPSILON_CONFIG: EpsilonDecreasingConfig = EpsilonDecreasingConfig()

# 工厂函数

In [3]:
def create_greedy_agent(
    env: RLEnv,
    epsilon_config: EpsilonDecreasingConfig,
    optimistic_init: bool,
    optimistic_times: int,
    convergence_threshold: float,
    convergence_min_steps: int,
    constant_stepsize: float,
    seed: int,
) -> BaseAgent:
    """Build a ``GreedyAgent`` driven by a ``GreedyType.GREEDY`` algorithm.

    Args:
        env: Environment handed to the agent.
        epsilon_config: Epsilon configuration forwarded to the agent.
        optimistic_init: Forwarded to ``GreedyAlgorithm`` as ``optimistic_init``.
        optimistic_times: Forwarded to ``GreedyAlgorithm`` as
            ``optimistic_times``.
        convergence_threshold: Forwarded to the agent unchanged.
        convergence_min_steps: Forwarded to the agent unchanged.
        constant_stepsize: Forwarded to the agent unchanged.
        seed: Forwarded to the agent unchanged.

    Returns:
        The newly constructed ``GreedyAgent``.
    """
    greedy_kind = GreedyType.GREEDY
    algorithm = GreedyAlgorithm(
        greedy_type=greedy_kind,
        optimistic_init=optimistic_init,
        optimistic_times=optimistic_times,
    )
    agent = GreedyAgent(
        # The enum member itself (not its ``.value``) is used as the name.
        name=greedy_kind,
        env=env,
        algorithm=algorithm,
        epsilon_config=epsilon_config,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
        constant_stepsize=constant_stepsize,
        seed=seed,
    )
    return agent

In [4]:
def create_ucb1_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    constant_stepsize: float,
    seed: int,
) -> BaseAgent:
    """Build a ``UCBAgent`` named ``"UCB1"`` with a default ``UCB1Algorithm``.

    Args:
        env: Environment handed to the agent.
        convergence_threshold: Forwarded to the agent unchanged.
        convergence_min_steps: Forwarded to the agent unchanged.
        constant_stepsize: Forwarded to the agent unchanged.
        seed: Forwarded to the agent unchanged.

    Returns:
        The newly constructed ``UCBAgent``.
    """
    algorithm = UCB1Algorithm()
    agent = UCBAgent(
        name="UCB1",
        env=env,
        algorithm=algorithm,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
        constant_stepsize=constant_stepsize,
        seed=seed,
    )
    return agent

In [5]:
def create_ts_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    discount_factor: float,
    seed: int,
) -> BaseAgent:
    """Build a ``TSAgent`` named ``"Thompson Sampling"``.

    Args:
        env: Environment handed to the agent.
        convergence_threshold: Forwarded to the agent unchanged.
        convergence_min_steps: Forwarded to the agent unchanged.
        discount_factor: Forwarded to the agent unchanged.
        seed: Forwarded to the agent unchanged.

    Returns:
        The newly constructed ``TSAgent`` using a default ``TSAlgorithm``.
    """
    algorithm = TSAlgorithm()
    agent = TSAgent(
        name="Thompson Sampling",
        env=env,
        algorithm=algorithm,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
        discount_factor=discount_factor,
        seed=seed,
    )
    return agent

In [6]:
def create_meta(
    run_id: str,
    agent_algorithm: str,
    is_static: bool,
) -> ExperimentMeta:
    """Assemble the ``ExperimentMeta`` record describing one run.

    Apart from the three arguments, every field is read from the module-level
    configuration constants of this notebook.

    Args:
        run_id: Identifier stored as ``run_id``.
        agent_algorithm: Stored as ``agent_algorithm``.
        is_static: When truthy the record's ``environment_type`` is
            ``"static"``, otherwise ``"dynamic"``.

    Returns:
        The populated ``ExperimentMeta`` instance.
    """
    if is_static:
        environment_type = "static"
    else:
        environment_type = "dynamic"

    return ExperimentMeta(
        # Run identity.
        run_id=run_id,
        run_date=DATE,
        # Agent settings.
        agent_algorithm=agent_algorithm,
        agent_seed=SEED,
        agent_runs=RUN_COUNT,
        total_steps=STEPS,
        num_arms=MACHINE_COUNT,
        # Convergence and initialisation.
        convergence_threshold=CONVERGENCE_THRESHOLD,
        min_convergence_steps=CONVERGENCE_MIN_STEPS,
        optimistic_initialization_enabled=ENABLE_OPTIMISTIC,
        optimistic_initialization_value=OPTIMISTIC_TIMES,
        # Hyper-parameters (the same constants are recorded for every algorithm).
        constant_alpha=CONSTANT_STEPSIZE,
        discount_factor=DISCOUNT_FACTOR,
        # Environment description.
        environment_type=environment_type,
        environment_seed=SEED,
        environment_dynamic_method=DYNAMIC_METHOD.value,
        piecewise_stationary_interval=PIECEWISE_INTERNAL,
        piecewise_stationary_method=PIECEWISE_METHOD.value,
        random_walk_interval=RANDOM_WALK_INTERNAL,
        random_walk_affected_arms=RANDOM_WALK_MACHINE_NUM,
    )

# 训练

## 普通贪婪算法

In [7]:
# Greedy experiment: train, plot, persist results, then release the big objects.
run_id = GreedyType.GREEDY.value
file_name: Path = EXPERIMENT_DATA_DIR / DATE / f"{run_id}.png"
# Ensure the per-session output directory exists before anything is written.
file_name.parent.mkdir(parents=True, exist_ok=True)

process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)
# Work on a clone so the template ENV is left untouched by training.
env = ENV.clone()
# NOTE(review): ENV is always built with a dynamic method in this notebook, yet
# is_static=True makes the metadata say "static" -- confirm this is intended.
meta = create_meta(run_id, GreedyType.GREEDY, is_static=True)

agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_greedy_agent,
    env=env,
    epsilon_config=EPSILON_CONFIG,
    steps=STEPS,
    seed=SEED,
    optimistic_init=ENABLE_OPTIMISTIC,
    optimistic_times=OPTIMISTIC_TIMES,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
    constant_stepsize=CONSTANT_STEPSIZE,
    num_workers=NUM_WORKERS,
)
print(metrics)
print(reward)

# Same figure twice: first with the default x axis, then with x_log=True.
plot_title = f"贪婪算法 CONSTANT_STEPSIZE= {CONSTANT_STEPSIZE}"
for use_log_x in (False, True):
    plot_metrics_history(agents, plot_title, file_name, x_log=use_log_x)

save_experiment_data(reward, metrics, meta, file_name)
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
keys = list(dump.points[0].data.keys())

# Reference figures are read from the template ENV, not from the trained clone.
print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

clear_var(env, agents, reward, metrics, process_logger, dump)

达到收敛时的步数: 100
达到收敛时的步数: 100达到收敛时的步数: 100达到收敛时的步数: 130达到收敛时的步数: 100



达到收敛时的步数: 100
达到收敛时的步数: 170
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 110
达到收敛时的步数: 110
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 120
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 120
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 140
达到收敛时的步数: 110
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 120
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 120
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 110
达到收敛时的步数: 100
达到收敛时的步数: 110
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 110
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 10000
达到收敛时的步数: 140
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 110
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛

## UCB1算法

In [8]:
from ucb1.schemas import UCB1AlgorithmType


# UCB1 experiment: train, plot, persist results, then release the big objects.
run_id = UCB1AlgorithmType.UCB1.value
file_name: Path = EXPERIMENT_DATA_DIR / DATE / f"{run_id}.png"
# Ensure the per-session output directory exists before anything is written.
file_name.parent.mkdir(parents=True, exist_ok=True)

process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)
# Work on a clone so the template ENV is left untouched by training.
env = ENV.clone()
# NOTE(review): ENV is always built with a dynamic method in this notebook, yet
# is_static=True makes the metadata say "static" -- confirm this is intended.
meta = create_meta(run_id, UCB1AlgorithmType.UCB1, is_static=True)

agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_ucb1_agent,
    env=env,
    steps=STEPS,
    seed=SEED,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
    constant_stepsize=UCB_CONSTANT_STEPSIZE,
    num_workers=NUM_WORKERS,
)
print(metrics)
print(reward)

# Same figure twice: first with the default x axis, then with x_log=True.
plot_title = f"UCB1 算法 CONSTANT_STEPSIZE= {UCB_CONSTANT_STEPSIZE}"
for use_log_x in (False, True):
    plot_metrics_history(agents, plot_title, file_name, x_log=use_log_x)

save_experiment_data(reward, metrics, meta, file_name)
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
keys = list(dump.points[0].data.keys())

# Reference figures are read from the template ENV, not from the trained clone.
print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

clear_var(env, agents, reward, metrics, process_logger, dump)

avg_regret=5511.868 avg_regret_rate=0.06124297777777778 avg_total_reward=84488.132 avg_optimal_rate=0.32154397999999995 avg_convergence_steps=0.0 avg_convergence_rate=0.0
values=[8835.968, 8348.072, 8310.656, 8324.872, 9234.76, 8262.506, 8156.488, 8236.7, 8388.61, 8389.5] counts=[10465.592, 9847.68, 9845.132, 9883.738, 10849.308, 9808.538, 9685.046, 9720.996, 9935.638, 9958.332]
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/20250924_023658/ucb1.png
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/20250924_023658/ucb1_x_log.png
✅ 实验结果数据已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/20250924_023658/ucb1.json
✅ 过程数据已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/20250924_023658/ucb1process.json
909.090909090909
0.

## Thompson Sampling 算法

In [9]:
from thompson_sampling.schemas import TSAlgorithmType


# Thompson Sampling experiment: train, plot, persist results, then clean up.
# NOTE(review): the recorded output of this cell ends with
# ``ValueError: a <= 0``, so the statements after batch_train may never have
# run. Check how DISCOUNT_FACTOR interacts with TSAlgorithm's parameters.
run_id = TSAlgorithmType.TS.value
file_name: Path = EXPERIMENT_DATA_DIR / DATE / f"{run_id}.png"
# Ensure the per-session output directory exists before anything is written.
file_name.parent.mkdir(parents=True, exist_ok=True)

process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)
# Work on a clone so the template ENV is left untouched by training.
env = ENV.clone()
# NOTE(review): ENV is always built with a dynamic method in this notebook, yet
# is_static=True makes the metadata say "static" -- confirm this is intended.
meta = create_meta(run_id, TSAlgorithmType.TS, is_static=True)

agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_ts_agent,
    env=env,
    steps=STEPS,
    seed=SEED,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
    discount_factor=DISCOUNT_FACTOR,
    num_workers=NUM_WORKERS,
)
print(metrics)
print(reward)

# Same figure twice: first with the default x axis, then with x_log=True.
plot_title = f"TS 算法 DISCOUNT_FACTOR= {DISCOUNT_FACTOR}"
for use_log_x in (False, True):
    plot_metrics_history(agents, plot_title, file_name, x_log=use_log_x)

save_experiment_data(reward, metrics, meta, file_name)
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
keys = list(dump.points[0].data.keys())

# Reference figures are read from the template ENV, not from the trained clone.
print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

clear_var(env, agents, reward, metrics, process_logger, dump)

达到收敛时的步数: 2560
达到收敛时的步数: 4110
达到收敛时的步数: 6940
达到收敛时的步数: 5010
达到收敛时的步数: 1450
达到收敛时的步数: 8390
达到收敛时的步数: 3990
达到收敛时的步数: 2550
达到收敛时的步数: 4680
达到收敛时的步数: 3580
达到收敛时的步数: 5640
达到收敛时的步数: 3130
达到收敛时的步数: 2770
达到收敛时的步数: 5240
达到收敛时的步数: 6530
达到收敛时的步数: 6100
达到收敛时的步数: 6460
达到收敛时的步数: 1620
达到收敛时的步数: 880
达到收敛时的步数: 760
达到收敛时的步数: 6370
达到收敛时的步数: 1620
达到收敛时的步数: 1330
达到收敛时的步数: 1070
达到收敛时的步数: 4810
达到收敛时的步数: 9210
达到收敛时的步数: 4800
达到收敛时的步数: 4210
达到收敛时的步数: 8520
达到收敛时的步数: 3830
达到收敛时的步数: 2190
达到收敛时的步数: 7430
达到收敛时的步数: 1740
达到收敛时的步数: 2350
达到收敛时的步数: 2380
达到收敛时的步数: 1390
达到收敛时的步数: 9220
达到收敛时的步数: 6260
达到收敛时的步数: 5240
达到收敛时的步数: 4700
达到收敛时的步数: 1130


ValueError: a <= 0