# 导入必要库

In [1]:
from pathlib import Path
import time

from core import RLEnv
from core import BaseAgent
from core.schemas import PiecewizeMethod  # noqa
from greedy import EpsilonDecreasingConfig, GreedyAgent, GreedyAlgorithm, GreedyType
from ucb1 import UCBAgent, UCB1Algorithm
from thompson_sampling import TSAgent, TSAlgorithm

from train import batch_train
from utils import plot_metrics_history, save_experiment_data, ProcessDataLogger, clear_var

In [2]:
# --- Experiment scale ---
STEPS: int = 100_000  # passed as `steps` to batch_train and `total_steps` to the logger
RUN_COUNT: int = 5  # passed as `count` to batch_train
MACHINE_COUNT: int = 10  # number of machines in the environment
GRID_SIZE: int = 500  # passed as `grid_size` to ProcessDataLogger

# --- Reproducibility ---
SEED: int = 42

# --- Convergence settings (forwarded to every training run below) ---
CONVERGENCE_THRESHOLD: float = 0.9
CONVERGENCE_MIN_STEPS: int = 100

# --- Optimistic initialisation (only the greedy run passes these on;
#     OPTIMISTIC_TIMES is also embedded in every output file name) ---
ENABLE_OPTIMISTIC: bool = True
OPTIMISTIC_TIMES: int = 1

# --- Per-algorithm knobs ---
CONSTANT_STEPSIZE: float = 0  # forwarded to the greedy and UCB1 runs
DISCOUNT_FACTOR: float = 0  # forwarded to the Thompson Sampling run

# --- Output location (relative to the directory the kernel was started in) ---
EXPERIMENT_DATA_DIR: Path = Path.cwd() / "experiment_data"

# Shared environment template; each training cell works on ENV.clone().
ENV: RLEnv = RLEnv(
    machine_count=MACHINE_COUNT,
    # Optional environment settings, currently disabled:
    # random_walk_internal=1,
    # random_walk_machine_num=1,
    # piecewise_internal=1,
    # piecewize_method=PiecewizeMethod.UPSIDE_DOWN,
    seed=SEED,
)
EPSILON_CONFIG: EpsilonDecreasingConfig = EpsilonDecreasingConfig()

# 工厂函数

In [3]:
def get_run_id(agent_name: str) -> str:
    """Return ``agent_name`` followed by ``_`` and the current Unix timestamp.

    Args:
        agent_name: Prefix identifying the agent/algorithm of the run.

    Returns:
        A string of the form ``"<agent_name>_<time.time()>"``.
    """
    # Plain concatenation is kept on purpose: callers also pass enum members
    # (e.g. GreedyType.GREEDY), and string formatting of an enum member may
    # render differently from concatenating it.
    timestamp_suffix = "_" + str(time.time())
    return agent_name + timestamp_suffix

In [4]:
def create_greedy_agent(
    env: RLEnv,
    epsilon_config: EpsilonDecreasingConfig,
    optimistic_init: bool,
    optimistic_times: int,
    convergence_threshold: float,
    convergence_min_steps: int,
    constant_stepsize: float,
    seed: int,
) -> BaseAgent:
    """Build a ``GreedyAgent`` that uses the ``GreedyType.GREEDY`` strategy.

    Args:
        env: Environment handed to the agent.
        epsilon_config: Forwarded to ``GreedyAgent`` as ``epsilon_config``.
        optimistic_init: Forwarded to ``GreedyAlgorithm`` as ``optimistic_init``.
        optimistic_times: Forwarded to ``GreedyAlgorithm`` as ``optimistic_times``.
        convergence_threshold: Forwarded to ``GreedyAgent``.
        convergence_min_steps: Forwarded to ``GreedyAgent``.
        constant_stepsize: Forwarded to ``GreedyAgent``.
        seed: Forwarded to ``GreedyAgent``.

    Returns:
        The newly constructed ``GreedyAgent``.
    """
    # The same enum member serves as both the agent name and the algorithm type.
    strategy = GreedyType.GREEDY
    greedy_algorithm = GreedyAlgorithm(
        greedy_type=strategy,
        optimistic_init=optimistic_init,
        optimistic_times=optimistic_times,
    )
    agent = GreedyAgent(
        name=strategy,
        env=env,
        algorithm=greedy_algorithm,
        epsilon_config=epsilon_config,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
        constant_stepsize=constant_stepsize,
        seed=seed,
    )
    return agent

In [5]:
def create_ucb1_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    constant_stepsize: float,
    seed: int,
) -> BaseAgent:
    """Build a ``UCBAgent`` named ``"UCB1"`` backed by a fresh ``UCB1Algorithm``.

    Args:
        env: Environment handed to the agent.
        convergence_threshold: Forwarded to ``UCBAgent``.
        convergence_min_steps: Forwarded to ``UCBAgent``.
        constant_stepsize: Forwarded to ``UCBAgent``.
        seed: Forwarded to ``UCBAgent``.

    Returns:
        The newly constructed ``UCBAgent``.
    """
    ucb1_algorithm = UCB1Algorithm()
    agent = UCBAgent(
        name="UCB1",
        env=env,
        algorithm=ucb1_algorithm,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
        constant_stepsize=constant_stepsize,
        seed=seed,
    )
    return agent

In [6]:
def create_ts_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    discount_factor: float,
    seed: int,
) -> BaseAgent:
    """Build a ``TSAgent`` named ``"Thompson Sampling"`` with a fresh ``TSAlgorithm``.

    Args:
        env: Environment handed to the agent.
        convergence_threshold: Forwarded to ``TSAgent``.
        convergence_min_steps: Forwarded to ``TSAgent``.
        discount_factor: Forwarded to ``TSAgent``.
        seed: Forwarded to ``TSAgent``.

    Returns:
        The newly constructed ``TSAgent``.
    """
    # Caller-supplied settings that are passed through unchanged.
    passthrough_settings = {
        "convergence_threshold": convergence_threshold,
        "convergence_min_steps": convergence_min_steps,
        "discount_factor": discount_factor,
        "seed": seed,
    }
    return TSAgent(
        name="Thompson Sampling",
        env=env,
        algorithm=TSAlgorithm(),
        **passthrough_settings,
    )

# 训练

## 普通贪婪算法

In [7]:
# Plain greedy run: build a unique run id and the base path that every
# artifact written by this cell is derived from.
run_id = get_run_id(GreedyType.GREEDY)
file_name: Path = EXPERIMENT_DATA_DIR / (
    f"{run_id}_T={STEPS}_K={MACHINE_COUNT}_Q_0={OPTIMISTIC_TIMES}.png"
)
process_logger = ProcessDataLogger(run_id=run_id, total_steps=STEPS, grid_size=GRID_SIZE)
# Hand batch_train a clone rather than the shared ENV object itself.
env = ENV.clone()

agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_greedy_agent,
    env=env,
    epsilon_config=EPSILON_CONFIG,
    steps=STEPS,
    seed=SEED,
    optimistic_init=ENABLE_OPTIMISTIC,
    optimistic_times=OPTIMISTIC_TIMES,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
    constant_stepsize=CONSTANT_STEPSIZE,
)
print(metrics)
print(reward)

# Plot twice from the same data: first with x_log=False, then with x_log=True.
for use_log_x in (False, True):
    plot_metrics_history(agents, "贪婪算法", file_name, x_log=use_log_x)
save_experiment_data(reward, metrics, file_name)

# The per-step process data is stored next to the figure, with "process"
# appended to the file stem.
process_path = file_name.with_stem(file_name.stem + "process")
process_logger.save(process_path, total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# NOTE(review): `keys` is not read anywhere else in this cell.
keys = list(dump.points[0].data.keys())

# Reference values from the shared (un-cloned) environment.
print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

clear_var(env, agents, reward, metrics, process_logger, dump)

达到收敛时的步数: 310
达到收敛时的步数: 280
达到收敛时的步数: 300
avg_regret=3688.6909090909116 avg_regret_rate=0.04057560000000003 avg_total_reward=87220.4 avg_optimal_rate=0.599844 avg_convergence_steps=178.0 avg_convergence_rate=0.6
values=[0.6, 4.4, 17.8, 0.8, 0.6, 0.8, 0.4, 0.8, 54495.4, 32698.8] counts=[1.6, 6.4, 23.8, 1.8, 1.6, 1.8, 1.4, 1.8, 59984.4, 39975.4]
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/greedy_1758175778.051391_T=100000_K=10_Q_0=1.png
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/greedy_1758175778.051391_T=100000_K=10_Q_0=1_x_log.png
✅ 实验结果数据已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/greedy_1758175778.051391_T=100000_K=10_Q_0=1.json
✅ 过程数据已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/greedy_175817577

## UCB1 算法

In [8]:
# UCB1 run: build a unique run id and the base path that every artifact
# written by this cell is derived from.
# NOTE(review): the file name embeds Q_0=OPTIMISTIC_TIMES although this cell
# passes no optimistic_* setting to batch_train — confirm this is intended.
run_id = get_run_id("ucb1")
file_name: Path = EXPERIMENT_DATA_DIR / (
    f"{run_id}_T={STEPS}_K={MACHINE_COUNT}_Q_0={OPTIMISTIC_TIMES}.png"
)
process_logger = ProcessDataLogger(run_id=run_id, total_steps=STEPS, grid_size=GRID_SIZE)
# Hand batch_train a clone rather than the shared ENV object itself.
env = ENV.clone()

# NOTE(review): the output recorded for this cell reports
# counts=[100000.0, 0.0, ...] and avg_optimal_rate=0.0, i.e. only the first
# machine was ever selected. Verify the UCB1 implementation and whether
# constant_stepsize=0 is handled as intended.
agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_ucb1_agent,
    env=env,
    steps=STEPS,
    seed=SEED,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
    constant_stepsize=CONSTANT_STEPSIZE,
)
print(metrics)
print(reward)

# Plot twice from the same data: first with x_log=False, then with x_log=True.
for use_log_x in (False, True):
    plot_metrics_history(agents, "UCB1 算法", file_name, x_log=use_log_x)
save_experiment_data(reward, metrics, file_name)

# The per-step process data is stored next to the figure, with "process"
# appended to the file stem.
process_path = file_name.with_stem(file_name.stem + "process")
process_logger.save(process_path, total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# NOTE(review): `keys` is not read anywhere else in this cell.
keys = list(dump.points[0].data.keys())

# Reference values from the shared (un-cloned) environment.
print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

clear_var(env, agents, reward, metrics, process_logger, dump)

avg_regret=72681.09090909091 avg_regret_rate=0.7994920000000001 avg_total_reward=18228.0 avg_optimal_rate=0.0 avg_convergence_steps=0.0 avg_convergence_rate=0.0
values=[18228.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] counts=[100000.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/ucb1_1758175780.738071_T=100000_K=10_Q_0=1.png
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/ucb1_1758175780.738071_T=100000_K=10_Q_0=1_x_log.png
✅ 实验结果数据已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/ucb1_1758175780.738071_T=100000_K=10_Q_0=1.json
✅ 过程数据已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/ucb1_1758175780.738071_T=100000_K=10_Q_0=1process.json
909.090909090909
0.18181818

## Thompson Sampling 算法

In [9]:
# Thompson Sampling run: build a unique run id and the base path that every
# artifact written by this cell is derived from.
# NOTE(review): the file name embeds Q_0=OPTIMISTIC_TIMES although this cell
# passes no optimistic_* setting to batch_train — confirm this is intended.
run_id = get_run_id("thompson_sampling")
file_name: Path = EXPERIMENT_DATA_DIR / (
    f"{run_id}_T={STEPS}_K={MACHINE_COUNT}_Q_0={OPTIMISTIC_TIMES}.png"
)
process_logger = ProcessDataLogger(run_id=run_id, total_steps=STEPS, grid_size=GRID_SIZE)
# Hand batch_train a clone rather than the shared ENV object itself.
env = ENV.clone()

agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_ts_agent,
    env=env,
    steps=STEPS,
    seed=SEED,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
    discount_factor=DISCOUNT_FACTOR,
)
print(metrics)
print(reward)

# Plot twice from the same data: first with x_log=False, then with x_log=True.
for use_log_x in (False, True):
    plot_metrics_history(agents, "TS 算法", file_name, x_log=use_log_x)
save_experiment_data(reward, metrics, file_name)

# The per-step process data is stored next to the figure, with "process"
# appended to the file stem.
process_path = file_name.with_stem(file_name.stem + "process")
process_logger.save(process_path, total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# NOTE(review): `keys` is not read anywhere else in this cell.
keys = list(dump.points[0].data.keys())

# Reference values from the shared (un-cloned) environment.
print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

clear_var(env, agents, reward, metrics, process_logger, dump)

达到收敛时的步数: 1810
达到收敛时的步数: 1550
达到收敛时的步数: 960
达到收敛时的步数: 1110
达到收敛时的步数: 730
avg_regret=78.69090909091173 avg_regret_rate=0.0008656000000000291 avg_total_reward=90830.4 avg_optimal_rate=0.9967940000000001 avg_convergence_steps=1232.0 avg_convergence_rate=1.0
values=[1.0, 26.2, 34.0, 3.4, 1.8, 8.4, 0.6, 10.0, 90602.6, 142.4] counts=[6.0, 38.4, 46.4, 9.4, 7.2, 16.6, 6.0, 17.2, 99679.4, 173.4]
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/thompson_sampling_1758175783.2109652_T=100000_K=10_Q_0=1.png
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/thompson_sampling_1758175783.2109652_T=100000_K=10_Q_0=1_x_log.png
✅ 实验结果数据已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/thompson_sampling_1758175783.2109652_T=100000_K=10_Q_0=1.json
✅ 过程数据已保存至 /home/