# 导入必要库

In [1]:
from pathlib import Path
from datetime import datetime, timezone
import os
import time

from core import RLEnv
from core import BaseAgent
from core.schemas import PiecewizeMethod  # noqa
from greedy import EpsilonDecreasingConfig, GreedyAgent, GreedyAlgorithm, GreedyType
from ucb1 import UCBAgent, UCB1Algorithm
from thompson_sampling import TSAgent, TSAlgorithm
from shemas import DynamicMethod

from train import batch_train
from utils import (
    plot_metrics_history,
    save_experiment_data,
    ProcessDataLogger,
    clear_var,
    ExperimentMeta,
)

In [2]:
# Experiment-wide constants
STEPS: int = 100_000
GRID_SIZE: int = 500
NUM_WORKERS: int = 10
SEED: int = 42
MACHINE_COUNT: int = 10
RUN_COUNT: int = 500
EXPERIMENT_DATA_DIR: Path = Path.cwd().joinpath("experiment_data")
# UTC timestamp captured once when this cell runs; the training cells use it
# as the sub-directory name under EXPERIMENT_DATA_DIR.
DATE = f"{datetime.now(timezone.utc):%Y%m%d_%H%M%S}"

# Convergence parameters
CONVERGENCE_THRESHOLD: float = 0.9
CONVERGENCE_MIN_STEPS: int = 100

# Optimistic initialisation
OPTIMISTIC_TIMES: int = 1
ENABLE_OPTIMISTIC: bool = True

# Agent-side parameters for adapting to a dynamic environment
CONSTANT_STEPSIZE: float = 0.999
UCB_CONSTANT_STEPSIZE: float = 0.999
DISCOUNT_FACTOR: float = 0.999

# Dynamic-environment configuration
# NOTE(review): "INTERNAL" here (and in the matching RLEnv keyword names) looks
# like it means "interval" — confirm before renaming anything.
ENABLE_DYNAMIC: bool = True
RANDOM_WALK_INTERNAL: int = 1
RANDOM_WALK_MACHINE_NUM: int = 1
PIECEWISE_INTERNAL: int = 10_000
PIECEWISE_METHOD: PiecewizeMethod = PiecewizeMethod.UPSIDE_DOWN
DYNAMIC_METHOD: DynamicMethod = DynamicMethod.PIECEWISE

# Build the template environment. Every variant shares machine_count and seed;
# the dynamic variants add their own keyword arguments on top.
_env_kwargs: dict = {"machine_count": MACHINE_COUNT, "seed": SEED}
if ENABLE_DYNAMIC:
    if DYNAMIC_METHOD == DynamicMethod.PIECEWISE:
        _env_kwargs.update(
            enable_dynamic=ENABLE_DYNAMIC,
            piecewise_internal=PIECEWISE_INTERNAL,
            piecewize_method=PIECEWISE_METHOD,
        )
    elif DYNAMIC_METHOD == DynamicMethod.RANDOM_WALK:
        _env_kwargs.update(
            enable_dynamic=ENABLE_DYNAMIC,
            random_walk_internal=RANDOM_WALK_INTERNAL,
            random_walk_machine_num=RANDOM_WALK_MACHINE_NUM,
        )
    else:
        # Only reachable when dynamics are enabled with an unknown method.
        raise ValueError(f"Invalid dynamic method: {DYNAMIC_METHOD}")
ENV: RLEnv = RLEnv(**_env_kwargs)

EPSILON_CONFIG: EpsilonDecreasingConfig = EpsilonDecreasingConfig()

# 工厂函数

In [3]:
def create_greedy_agent(
    env: RLEnv,
    epsilon_config: EpsilonDecreasingConfig,
    optimistic_init: bool,
    optimistic_times: int,
    convergence_threshold: float,
    convergence_min_steps: int,
    constant_stepsize: float,
    seed: int,
) -> BaseAgent:
    """Build a ``GreedyAgent`` configured with the ``GreedyType.GREEDY`` variant.

    Args:
        env: Environment handed to the agent.
        epsilon_config: Forwarded unchanged to ``GreedyAgent``.
        optimistic_init: Forwarded to ``GreedyAlgorithm``.
        optimistic_times: Forwarded to ``GreedyAlgorithm``.
        convergence_threshold: Forwarded unchanged to ``GreedyAgent``.
        convergence_min_steps: Forwarded unchanged to ``GreedyAgent``.
        constant_stepsize: Forwarded unchanged to ``GreedyAgent``.
        seed: Forwarded unchanged to ``GreedyAgent``.

    Returns:
        The newly constructed ``GreedyAgent``.
    """
    # The same enum member is used both as the algorithm variant and as the
    # agent's name.
    variant = GreedyType.GREEDY
    greedy_algorithm = GreedyAlgorithm(
        greedy_type=variant,
        optimistic_init=optimistic_init,
        optimistic_times=optimistic_times,
    )
    agent = GreedyAgent(
        name=variant,
        env=env,
        algorithm=greedy_algorithm,
        epsilon_config=epsilon_config,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
        constant_stepsize=constant_stepsize,
        seed=seed,
    )
    return agent

In [4]:
def create_ucb1_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    constant_stepsize: float,
    seed: int,
) -> BaseAgent:
    """Build a ``UCBAgent`` named ``"UCB1"`` around a default ``UCB1Algorithm``.

    Args:
        env: Environment handed to the agent.
        convergence_threshold: Forwarded unchanged to ``UCBAgent``.
        convergence_min_steps: Forwarded unchanged to ``UCBAgent``.
        constant_stepsize: Forwarded unchanged to ``UCBAgent``.
        seed: Forwarded unchanged to ``UCBAgent``.

    Returns:
        The newly constructed ``UCBAgent``.
    """
    ucb_algorithm = UCB1Algorithm()
    agent = UCBAgent(
        name="UCB1",
        env=env,
        algorithm=ucb_algorithm,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
        constant_stepsize=constant_stepsize,
        seed=seed,
    )
    return agent

In [5]:
def create_ts_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    discount_factor: float,
    seed: int,
) -> BaseAgent:
    """Build a ``TSAgent`` named ``"Thompson Sampling"`` around a default ``TSAlgorithm``.

    Args:
        env: Environment handed to the agent.
        convergence_threshold: Forwarded unchanged to ``TSAgent``.
        convergence_min_steps: Forwarded unchanged to ``TSAgent``.
        discount_factor: Forwarded unchanged to ``TSAgent``.
        seed: Forwarded unchanged to ``TSAgent``.

    Returns:
        The newly constructed ``TSAgent``.
    """
    ts_algorithm = TSAlgorithm()
    agent = TSAgent(
        name="Thompson Sampling",
        env=env,
        algorithm=ts_algorithm,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
        discount_factor=discount_factor,
        seed=seed,
    )
    return agent

In [6]:
def create_meta(
    run_id: str,
    agent_algorithm: str,
    is_dynamic: bool,
) -> ExperimentMeta:
    """Assemble the ``ExperimentMeta`` record for one training run.

    Only the three arguments vary per run; every other field is read from the
    notebook-level configuration constants.

    Args:
        run_id: Identifier of the run, stored as ``run_id``.
        agent_algorithm: Algorithm label, stored as ``agent_algorithm``.
        is_dynamic: Selects ``"dynamic"`` or ``"static"`` for ``environment_type``.

    Returns:
        The populated ``ExperimentMeta`` instance.
    """
    if is_dynamic:
        environment_type = "dynamic"
    else:
        environment_type = "static"

    return ExperimentMeta(
        # Run identity
        run_id=run_id,
        run_date=DATE,
        # Agent / training setup
        total_steps=STEPS,
        num_arms=MACHINE_COUNT,
        agent_seed=SEED,
        agent_algorithm=agent_algorithm,
        agent_runs=RUN_COUNT,
        convergence_threshold=CONVERGENCE_THRESHOLD,
        optimistic_initialization_enabled=ENABLE_OPTIMISTIC,
        optimistic_initialization_value=OPTIMISTIC_TIMES,
        # Environment setup
        environment_type=environment_type,
        environment_dynamic_method=DYNAMIC_METHOD.value,
        piecewise_stationary_interval=PIECEWISE_INTERNAL,
        piecewise_stationary_method=PIECEWISE_METHOD.value,
        environment_seed=SEED,
        min_convergence_steps=CONVERGENCE_MIN_STEPS,
        # NOTE(review): always records CONSTANT_STEPSIZE, although the UCB1 run
        # trains with UCB_CONSTANT_STEPSIZE. The two are equal at present;
        # confirm this is still right if they ever diverge.
        constant_alpha=CONSTANT_STEPSIZE,
        discount_factor=DISCOUNT_FACTOR,
        random_walk_interval=RANDOM_WALK_INTERNAL,
        random_walk_affected_arms=RANDOM_WALK_MACHINE_NUM,
    )

# 训练

## 普通贪婪算法

In [7]:
# Run id = algorithm name + current Unix time truncated to whole seconds.
run_id = f"{GreedyType.GREEDY.value}_{int(time.time())}"
file_name: Path = EXPERIMENT_DATA_DIR / DATE / f"{run_id}.png"
# Make sure the per-session output directory exists before anything is saved.
file_name.parent.mkdir(parents=True, exist_ok=True)

process_logger = ProcessDataLogger(run_id=run_id, total_steps=STEPS, grid_size=GRID_SIZE)
# Train on a clone so the template ENV object is not the one passed to training.
env = ENV.clone()
meta = create_meta(run_id, GreedyType.GREEDY, is_dynamic=ENABLE_DYNAMIC)

agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    steps=STEPS,
    seed=SEED,
    num_workers=NUM_WORKERS,
    env=env,
    agent_factory=create_greedy_agent,
    process_logger=process_logger,
    # Arguments consumed by create_greedy_agent
    epsilon_config=EPSILON_CONFIG,
    optimistic_init=ENABLE_OPTIMISTIC,
    optimistic_times=OPTIMISTIC_TIMES,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    constant_stepsize=CONSTANT_STEPSIZE,
)
print(metrics)
print(reward)

# Same figure twice: linear x-axis first, then logarithmic x-axis.
plot_title = f"贪婪算法 CONSTANT_STEPSIZE= {CONSTANT_STEPSIZE}"
for use_log_x in (False, True):
    plot_metrics_history(agents, plot_title, file_name, x_log=use_log_x)

save_experiment_data(reward, metrics, meta, file_name)
process_path = file_name.with_stem(f"process_{file_name.stem}")
process_logger.save(process_path, total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
keys = [*dump.points[0].data.keys()]

# These prints read the template ENV, not the clone that was trained on.
print(ENV.best_reward(1000))
for machine in ENV.machines:
    print(machine.reward_probability)

# presumably releases the large per-run objects — verify against utils.clear_var
clear_var(env, agents, reward, metrics, process_logger, dump)

达到收敛时的步数: 160达到收敛时的步数: 170达到收敛时的步数: 110


达到收敛时的步数: 370达到收敛时的步数: 200

达到收敛时的步数: 450
达到收敛时的步数: 620
达到收敛时的步数: 1190
达到收敛时的步数: 770
达到收敛时的步数: 6030
达到收敛时的步数: 6020
达到收敛时的步数: 590
达到收敛时的步数: 1330
达到收敛时的步数: 1780
达到收敛时的步数: 170
达到收敛时的步数: 2350
达到收敛时的步数: 300
达到收敛时的步数: 3600
达到收敛时的步数: 3990
达到收敛时的步数: 3670
达到收敛时的步数: 130
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 4960
达到收敛时的步数: 2390
达到收敛时的步数: 8400
达到收敛时的步数: 130
达到收敛时的步数: 5920
达到收敛时的步数: 630
达到收敛时的步数: 580
达到收敛时的步数: 9380
达到收敛时的步数: 2230
达到收敛时的步数: 1580
达到收敛时的步数: 1820
达到收敛时的步数: 340
达到收敛时的步数: 3060
达到收敛时的步数: 360
达到收敛时的步数: 3300
达到收敛时的步数: 1250
达到收敛时的步数: 290
达到收敛时的步数: 100
达到收敛时的步数: 710
达到收敛时的步数: 2000
达到收敛时的步数: 1080
达到收敛时的步数: 560
达到收敛时的步数: 250
达到收敛时的步数: 3410
达到收敛时的步数: 380
达到收敛时的步数: 5060
达到收敛时的步数: 100
达到收敛时的步数: 1240
达到收敛时的步数: 2990
达到收敛时的步数: 100
达到收敛时的步数: 290
达到收敛时的步数: 100
达到收敛时的步数: 2520
达到收敛时的步数: 4290
达到收敛时的步数: 480
达到收敛时的步数: 170
达到收敛时的步数: 2950
达到收敛时的步数: 1530
达到收敛时的步数: 210
达到收敛时的步数: 850
达到收敛时的步数: 760
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 380
达到收敛时的步数: 430
达到收敛时的步数: 3420
达到收

## UCB1算法

In [8]:
from ucb1.schemas import UCB1AlgorithmType

# Run id = algorithm name + current Unix time truncated to whole seconds.
run_id = f"{UCB1AlgorithmType.UCB1.value}_{int(time.time())}"
file_name: Path = EXPERIMENT_DATA_DIR / DATE / f"{run_id}.png"
# Make sure the per-session output directory exists before anything is saved.
file_name.parent.mkdir(parents=True, exist_ok=True)

process_logger = ProcessDataLogger(run_id=run_id, total_steps=STEPS, grid_size=GRID_SIZE)
# Train on a clone so the template ENV object is not the one passed to training.
env = ENV.clone()
meta = create_meta(run_id, UCB1AlgorithmType.UCB1, is_dynamic=ENABLE_DYNAMIC)

agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    steps=STEPS,
    seed=SEED,
    num_workers=NUM_WORKERS,
    env=env,
    agent_factory=create_ucb1_agent,
    process_logger=process_logger,
    # Arguments consumed by create_ucb1_agent
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    constant_stepsize=UCB_CONSTANT_STEPSIZE,
)
print(metrics)
print(reward)

# Same figure twice: linear x-axis first, then logarithmic x-axis.
plot_title = f"UCB1 算法 CONSTANT_STEPSIZE= {UCB_CONSTANT_STEPSIZE}"
for use_log_x in (False, True):
    plot_metrics_history(agents, plot_title, file_name, x_log=use_log_x)

save_experiment_data(reward, metrics, meta, file_name)
process_path = file_name.with_stem(f"process_{file_name.stem}")
process_logger.save(process_path, total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
keys = [*dump.points[0].data.keys()]

# These prints read the template ENV, not the clone that was trained on.
print(ENV.best_reward(1000))
for machine in ENV.machines:
    print(machine.reward_probability)

# presumably releases the large per-run objects — verify against utils.clear_var
clear_var(env, agents, reward, metrics, process_logger, dump)

avg_regret=43543.358 avg_regret_rate=0.4838150888888889 avg_total_reward=46456.642 avg_optimal_rate=0.09984514 avg_convergence_steps=0.0 avg_convergence_rate=0.0
values=[4653.576, 4652.404, 4640.136, 4646.236, 4659.766, 4637.578, 4636.938, 4644.48, 4643.682, 4641.846] counts=[9999.596, 10003.122, 9997.946, 10000.266, 10002.854, 9998.058, 9996.834, 10000.98, 10000.494, 9999.85]
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/20250925_125315/ucb1_1758804828.png
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/20250925_125315/ucb1_1758804828_x_log.png
✅ 实验结果数据已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/20250925_125315/ucb1_1758804828.json
✅ 过程数据已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/20250925_125315/proce

## Thompson Sampling 算法

In [9]:
from thompson_sampling.schemas import TSAlgorithmType

# Run id = algorithm name + current Unix time truncated to whole seconds.
run_id = f"{TSAlgorithmType.TS.value}_{int(time.time())}"
file_name: Path = EXPERIMENT_DATA_DIR / DATE / f"{run_id}.png"
# Make sure the per-session output directory exists before anything is saved.
file_name.parent.mkdir(parents=True, exist_ok=True)

process_logger = ProcessDataLogger(run_id=run_id, total_steps=STEPS, grid_size=GRID_SIZE)
# Train on a clone so the template ENV object is not the one passed to training.
env = ENV.clone()
meta = create_meta(run_id, TSAlgorithmType.TS, is_dynamic=ENABLE_DYNAMIC)

agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    steps=STEPS,
    seed=SEED,
    num_workers=NUM_WORKERS,
    env=env,
    agent_factory=create_ts_agent,
    process_logger=process_logger,
    # Arguments consumed by create_ts_agent
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    discount_factor=DISCOUNT_FACTOR,
)
print(metrics)
print(reward)

# Same figure twice: linear x-axis first, then logarithmic x-axis.
plot_title = f"TS 算法 DISCOUNT_FACTOR= {DISCOUNT_FACTOR}"
for use_log_x in (False, True):
    plot_metrics_history(agents, plot_title, file_name, x_log=use_log_x)

save_experiment_data(reward, metrics, meta, file_name)
process_path = file_name.with_stem(f"process_{file_name.stem}")
process_logger.save(process_path, total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
keys = [*dump.points[0].data.keys()]

# These prints read the template ENV, not the clone that was trained on.
print(ENV.best_reward(1000))
for machine in ENV.machines:
    print(machine.reward_probability)

# presumably releases the large per-run objects — verify against utils.clear_var
clear_var(env, agents, reward, metrics, process_logger, dump)

达到收敛时的步数: 250
达到收敛时的步数: 720
达到收敛时的步数: 850
达到收敛时的步数: 840
达到收敛时的步数: 890
达到收敛时的步数: 1400达到收敛时的步数: 1860

达到收敛时的步数: 2180
达到收敛时的步数: 2360
达到收敛时的步数: 3970
达到收敛时的步数: 1000
达到收敛时的步数: 2270
达到收敛时的步数: 1290
达到收敛时的步数: 1770
达到收敛时的步数: 3070
达到收敛时的步数: 600
达到收敛时的步数: 640
达到收敛时的步数: 550
达到收敛时的步数: 960
达到收敛时的步数: 1020
达到收敛时的步数: 1220
达到收敛时的步数: 550
达到收敛时的步数: 440
达到收敛时的步数: 2370
达到收敛时的步数: 1260
达到收敛时的步数: 3290
达到收敛时的步数: 4940
达到收敛时的步数: 2720
达到收敛时的步数: 1460
达到收敛时的步数: 410
达到收敛时的步数: 790
达到收敛时的步数: 1570
达到收敛时的步数: 2000
达到收敛时的步数: 1970
达到收敛时的步数: 4300
达到收敛时的步数: 610
达到收敛时的步数: 2570
达到收敛时的步数: 970
达到收敛时的步数: 2180
达到收敛时的步数: 740
达到收敛时的步数: 1090
达到收敛时的步数: 1930
达到收敛时的步数: 2110
达到收敛时的步数: 1350
达到收敛时的步数: 880
达到收敛时的步数: 220
达到收敛时的步数: 1860
达到收敛时的步数: 1370
达到收敛时的步数: 1110
达到收敛时的步数: 940
达到收敛时的步数: 1450
达到收敛时的步数: 610
达到收敛时的步数: 1050
达到收敛时的步数: 3220
达到收敛时的步数: 1600
达到收敛时的步数: 2160
达到收敛时的步数: 2320
达到收敛时的步数: 220
达到收敛时的步数: 560
达到收敛时的步数: 220
达到收敛时的步数: 4180
达到收敛时的步数: 2580
达到收敛时的步数: 410
达到收敛时的步数: 990
达到收敛时的步数: 3590
达到收敛时的步数: 860
达到收敛时的步数: 200
达到收敛时的步数: 1420
达到收敛时的步