# 导入必要库

In [8]:
from pathlib import Path
import gc
import time

from core import RLEnv
from core.agent import BaseAgent
from core.schemas import PiecewizeMethod # noqa
from greedy import (
    EpsilonDecreasingConfig,
    GreedyAgent,
    greedy_average,
    epsilon_average,
    epsilon_decreasing_average,
)
from ucb1 import UCBAgent, ucb1
from thompson_sampling import TSAgent

from train import batch_train
from utils import plot_metrics_history, save_experiment_data, ProcessDataLogger

In [9]:
# Experiment configuration shared by the training cells below.

# Passed as `steps` to batch_train and as `total_steps` to ProcessDataLogger.
STEPS: int = 100_000
# Passed as `grid_size` to ProcessDataLogger.
GRID_SIZE: int = 500

# Seed handed to both the environment constructor and batch_train.
SEED: int = 42
# Number of machines the environment is created with (`machine_count`).
MACHINE_COUNT: int = 10
# Passed as `count` to batch_train.
# NOTE(review): presumably the number of independent runs per algorithm — confirm in train.batch_train.
RUN_COUNT: int = 500
# Convergence settings passed to batch_train in every training cell; the agent
# factories take parameters of the same names and hand them to the agent constructors.
CONVERGENCE_THRESHOLD: float = 0.9
CONVERGENCE_MIN_STEPS: int = 100
# Optimistic-initialisation settings; only the greedy-family factories accept them.
# OPTIMISTIC_TIMES is also embedded in every output file name as `Q_0`.
OPTIMISTIC_TIMES: int = 1
ENABLE_OPTIMISTIC: bool = True
# Output location, resolved against the working directory when this cell runs.
EXPERIMENT_DATA_DIR: Path = Path.cwd() / "experiment_data"

# One environment instance, passed to every batch_train call in the visible cells.
# NOTE(review): the meaning of the random_walk_* arguments is defined by RLEnv — confirm there.
ENV: RLEnv = RLEnv(
    machine_count=MACHINE_COUNT,
    random_walk_internal=1,
    random_walk_machine_num=1,
    # Alternative piecewise configuration, currently disabled. In the visible cells
    # this is the only place the PiecewizeMethod import is referenced.
    # piecewise_internal=1,
    # piecewize_method=PiecewizeMethod.UPSIDE_DOWN,
    seed=SEED
)
# Default-constructed EpsilonDecreasingConfig; passed to batch_train in the greedy cell.
EPSILON_CONFIG: EpsilonDecreasingConfig = EpsilonDecreasingConfig()

# 工厂函数

In [10]:
def get_run_id(agent_name: str) -> str:
    """Return ``agent_name`` joined by an underscore to the current Unix timestamp.

    The timestamp is the float returned by ``time.time()`` rendered with its
    default string form, so consecutive calls normally yield different ids.
    """
    timestamp = time.time()
    return f"{agent_name}_{timestamp}"

In [11]:
def create_greedy_agent(
    env: RLEnv,
    epsilon_config: EpsilonDecreasingConfig,
    optimistic_init: bool,
    optimistic_times: int,
    convergence_threshold: float,
    convergence_min_steps: int,
    seed: int,
) -> BaseAgent:
    """Build a ``GreedyAgent`` driven by the ``greedy_average`` algorithm.

    Every argument is handed unchanged to the ``GreedyAgent`` constructor
    under the same keyword; the agent's name is the algorithm function's
    ``__name__``.
    """
    # Bind once so the name and the algorithm can never drift apart.
    algorithm = greedy_average
    agent: BaseAgent = GreedyAgent(
        name=algorithm.__name__,
        greedy_algorithm=algorithm,
        env=env,
        seed=seed,
        epsilon_config=epsilon_config,
        optimistic_init=optimistic_init,
        optimistic_times=optimistic_times,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
    )
    return agent


def create_epsilon_agent(
    env: RLEnv,
    epsilon_config: EpsilonDecreasingConfig,
    optimistic_init: bool,
    optimistic_times: int,
    convergence_threshold: float,
    convergence_min_steps: int,
    seed: int,
) -> BaseAgent:
    """Build a ``GreedyAgent`` driven by the ``epsilon_average`` algorithm.

    Every argument is handed unchanged to the ``GreedyAgent`` constructor
    under the same keyword; the agent's name is the algorithm function's
    ``__name__``.
    """
    # Bind once so the name and the algorithm can never drift apart.
    algorithm = epsilon_average
    agent: BaseAgent = GreedyAgent(
        name=algorithm.__name__,
        greedy_algorithm=algorithm,
        env=env,
        seed=seed,
        epsilon_config=epsilon_config,
        optimistic_init=optimistic_init,
        optimistic_times=optimistic_times,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
    )
    return agent


def create_decreasing_agent(
    env: RLEnv,
    epsilon_config: EpsilonDecreasingConfig,
    optimistic_init: bool,
    optimistic_times: int,
    convergence_threshold: float,
    convergence_min_steps: int,
    seed: int,
) -> BaseAgent:
    """Build a ``GreedyAgent`` driven by ``epsilon_decreasing_average``.

    Every argument is handed unchanged to the ``GreedyAgent`` constructor
    under the same keyword; the agent's name is the algorithm function's
    ``__name__``.
    """
    # Bind once so the name and the algorithm can never drift apart.
    algorithm = epsilon_decreasing_average
    agent: BaseAgent = GreedyAgent(
        name=algorithm.__name__,
        greedy_algorithm=algorithm,
        env=env,
        seed=seed,
        epsilon_config=epsilon_config,
        optimistic_init=optimistic_init,
        optimistic_times=optimistic_times,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
    )
    return agent

In [12]:
def create_ucb1_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    seed: int,
) -> BaseAgent:
    """Build a ``UCBAgent`` driven by the ``ucb1`` algorithm.

    The environment, convergence settings and seed are handed unchanged to
    the ``UCBAgent`` constructor; the agent's name is ``ucb1.__name__``.
    """
    # Bind once so the name and the algorithm can never drift apart.
    algorithm = ucb1
    agent: BaseAgent = UCBAgent(
        name=algorithm.__name__,
        ucb1_algorithm=algorithm,
        env=env,
        seed=seed,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
    )
    return agent

In [13]:
def create_ts_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    seed: int,
) -> BaseAgent:
    """Build a ``TSAgent`` bound to ``env``.

    The environment, convergence settings and seed are handed unchanged to
    the ``TSAgent`` constructor; the agent's name is the class name
    (``TSAgent.__name__``).
    """
    agent_cls = TSAgent
    agent: BaseAgent = agent_cls(
        name=agent_cls.__name__,
        env=env,
        seed=seed,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
    )
    return agent

# 训练

## 普通贪婪算法

In [14]:
# Batch experiment for the plain greedy algorithm: train, report, plot, persist, clean up.

# The run id embeds the current timestamp (see get_run_id).
run_id = get_run_id(greedy_average.__name__)
# Base output path; the name encodes steps (T), machine count (K) and OPTIMISTIC_TIMES (Q_0).
file_name: Path = (
    EXPERIMENT_DATA_DIR
    / f"{run_id}_T={STEPS}_K={MACHINE_COUNT}_Q_0={OPTIMISTIC_TIMES}.png"
)
process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)

# NOTE(review): batch_train presumably forwards the extra keyword arguments to
# agent_factory — they match create_greedy_agent's parameter names. Confirm in train.py.
agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_greedy_agent,
    env=ENV,
    epsilon_config=EPSILON_CONFIG,
    steps=STEPS,
    seed=SEED,
    optimistic_init=ENABLE_OPTIMISTIC,
    optimistic_times=OPTIMISTIC_TIMES,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
)
print(metrics)
print(reward)

# Both plot calls receive the same path; only x_log differs.
# NOTE(review): confirm plot_metrics_history derives distinct file names for the
# two variants, otherwise the second image overwrites the first.
plot_metrics_history(agents, "贪婪算法", file_name, x_log=False)
plot_metrics_history(agents, "贪婪算法", file_name, x_log=True)
save_experiment_data(reward, metrics, file_name)
# The process log is saved under the same name with "process" appended to the stem
# (no separator); Path.with_stem requires Python 3.9+.
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# NOTE(review): `keys` is not read again in the visible cells and is not released
# below — presumably kept for interactive inspection. Raises IndexError if
# dump.points is empty.
keys = list(dump.points[0].data.keys())

print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

# Drop this run's result objects before the next experiment cell. As the last
# expression of the cell, gc.collect()'s return value is shown as the cell output.
del agents, reward, metrics, process_logger, dump
gc.collect()

达到收敛时的步数: 290
达到收敛时的步数: 180
达到收敛时的步数: 420
达到收敛时的步数: 474
达到收敛时的步数: 942
达到收敛时的步数: 170
达到收敛时的步数: 300
达到收敛时的步数: 170
达到收敛时的步数: 485
达到收敛时的步数: 390
达到收敛时的步数: 230
达到收敛时的步数: 190
达到收敛时的步数: 260
达到收敛时的步数: 280
达到收敛时的步数: 3100
达到收敛时的步数: 240
达到收敛时的步数: 860
达到收敛时的步数: 73791
达到收敛时的步数: 3435
达到收敛时的步数: 31197
达到收敛时的步数: 1487
达到收敛时的步数: 240
达到收敛时的步数: 700
达到收敛时的步数: 920
达到收敛时的步数: 460
达到收敛时的步数: 600
达到收敛时的步数: 469
达到收敛时的步数: 74355
达到收敛时的步数: 200
达到收敛时的步数: 120
达到收敛时的步数: 1618
达到收敛时的步数: 1110
达到收敛时的步数: 190
达到收敛时的步数: 240
达到收敛时的步数: 160
达到收敛时的步数: 180
达到收敛时的步数: 110
达到收敛时的步数: 500
达到收敛时的步数: 2209
达到收敛时的步数: 140
达到收敛时的步数: 1213
达到收敛时的步数: 110
达到收敛时的步数: 1024
达到收敛时的步数: 120
达到收敛时的步数: 8240
达到收敛时的步数: 380
达到收敛时的步数: 240
达到收敛时的步数: 651
达到收敛时的步数: 260
达到收敛时的步数: 180
达到收敛时的步数: 200
达到收敛时的步数: 240
达到收敛时的步数: 260
达到收敛时的步数: 230
达到收敛时的步数: 120
达到收敛时的步数: 230
达到收敛时的步数: 140
达到收敛时的步数: 120
达到收敛时的步数: 120
达到收敛时的步数: 824
达到收敛时的步数: 340
达到收敛时的步数: 110
达到收敛时的步数: 130
达到收敛时的步数: 1300
达到收敛时的步数: 220
达到收敛时的步数: 150
达到收敛时的步数: 110
达到收敛时的步数: 361
达到收敛时的步数: 1836
达到收敛时的步数: 290
达到收

183

## UCB1 算法

In [15]:
# Batch experiment for the UCB1 algorithm: train, report, plot, persist, clean up.

# The run id embeds the current timestamp (see get_run_id).
run_id = get_run_id(ucb1.__name__)
# Base output path; the name encodes steps (T), machine count (K) and OPTIMISTIC_TIMES (Q_0).
# NOTE(review): Q_0 is kept in the name although create_ucb1_agent takes no
# optimistic-initialisation parameters — confirm this is intended.
file_name: Path = (
    EXPERIMENT_DATA_DIR
    / f"{run_id}_T={STEPS}_K={MACHINE_COUNT}_Q_0={OPTIMISTIC_TIMES}.png"
)
process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)

# NOTE(review): batch_train presumably forwards the extra keyword arguments to
# agent_factory — they match create_ucb1_agent's parameter names. Confirm in train.py.
agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_ucb1_agent,
    env=ENV,
    steps=STEPS,
    seed=SEED,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
)
print(metrics)
print(reward)

# Both plot calls receive the same path; only x_log differs.
# NOTE(review): confirm plot_metrics_history derives distinct file names for the
# two variants, otherwise the second image overwrites the first.
plot_metrics_history(agents, "UCB1 算法", file_name, x_log=False)
plot_metrics_history(agents, "UCB1 算法", file_name, x_log=True)
save_experiment_data(reward, metrics, file_name)
# The process log is saved under the same name with "process" appended to the stem
# (no separator); Path.with_stem requires Python 3.9+.
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# NOTE(review): `keys` is not read again in the visible cells and is not released
# below — presumably kept for interactive inspection. Raises IndexError if
# dump.points is empty.
keys = list(dump.points[0].data.keys())

print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

# Drop this run's result objects before the next experiment cell. As the last
# expression of the cell, gc.collect()'s return value is shown as the cell output.
del agents, reward, metrics, process_logger, dump
gc.collect()

达到收敛时的步数: 18820
达到收敛时的步数: 8590
达到收敛时的步数: 6660
达到收敛时的步数: 6590
达到收敛时的步数: 5550
达到收敛时的步数: 3950
达到收敛时的步数: 22390
达到收敛时的步数: 22450
达到收敛时的步数: 15410
达到收敛时的步数: 4970
达到收敛时的步数: 17210
达到收敛时的步数: 7960
达到收敛时的步数: 4060
达到收敛时的步数: 12460
达到收敛时的步数: 10420
达到收敛时的步数: 18040
达到收敛时的步数: 6360
达到收敛时的步数: 3000
达到收敛时的步数: 20530
达到收敛时的步数: 29139
达到收敛时的步数: 7210
达到收敛时的步数: 8960
达到收敛时的步数: 2810
达到收敛时的步数: 14020
达到收敛时的步数: 5430
达到收敛时的步数: 10331
达到收敛时的步数: 8890
达到收敛时的步数: 6340
达到收敛时的步数: 10120
达到收敛时的步数: 7000
达到收敛时的步数: 21370
avg_regret=2372.679894591704 avg_regret_rate=0.02738743219228435 avg_total_reward=84261.214 avg_optimal_rate=0.1074760800000001 avg_convergence_steps=694.08 avg_convergence_rate=0.062
values=[9013.516, 8583.156, 8159.86, 8107.784, 9129.988, 8706.508, 8225.34, 7849.928, 8313.128, 8172.006] counts=[10747.608, 10128.056, 9683.048, 9656.954, 10804.894, 10352.168, 9747.86, 9328.366, 9887.268, 9663.778]
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/

33

## Thompson Sampling 算法

In [16]:
# Batch experiment for Thompson Sampling: train, report, plot, persist, clean up.

# The run id embeds the current timestamp (see get_run_id). Unlike the other cells,
# the prefix is a literal rather than an algorithm function's __name__.
run_id = get_run_id("thompson_sampling")
# Base output path; the name encodes steps (T), machine count (K) and OPTIMISTIC_TIMES (Q_0).
# NOTE(review): Q_0 is kept in the name although create_ts_agent takes no
# optimistic-initialisation parameters — confirm this is intended.
file_name: Path = (
    EXPERIMENT_DATA_DIR
    / f"{run_id}_T={STEPS}_K={MACHINE_COUNT}_Q_0={OPTIMISTIC_TIMES}.png"
)
process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)

# NOTE(review): batch_train presumably forwards the extra keyword arguments to
# agent_factory — they match create_ts_agent's parameter names. Confirm in train.py.
agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_ts_agent,
    env=ENV,
    steps=STEPS,
    seed=SEED,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
)
print(metrics)
print(reward)

# Both plot calls receive the same path; only x_log differs.
# NOTE(review): confirm plot_metrics_history derives distinct file names for the
# two variants, otherwise the second image overwrites the first.
plot_metrics_history(agents, "TS 算法", file_name, x_log=False)
plot_metrics_history(agents, "TS 算法", file_name, x_log=True)
save_experiment_data(reward, metrics, file_name)
# The process log is saved under the same name with "process" appended to the stem
# (no separator); Path.with_stem requires Python 3.9+.
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# NOTE(review): `keys` is not read again in the visible cells and is not released
# below — presumably kept for interactive inspection. Raises IndexError if
# dump.points is empty.
keys = list(dump.points[0].data.keys())

print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

# Drop this run's result objects. As the last expression of the cell,
# gc.collect()'s return value is shown as the cell output.
del agents, reward, metrics, process_logger, dump
gc.collect()

达到收敛时的步数: 830
达到收敛时的步数: 6060
达到收敛时的步数: 1090
达到收敛时的步数: 1590
达到收敛时的步数: 1190
达到收敛时的步数: 21900
达到收敛时的步数: 1660
达到收敛时的步数: 4830
达到收敛时的步数: 8860
达到收敛时的步数: 150
达到收敛时的步数: 290
达到收敛时的步数: 930
达到收敛时的步数: 1010
达到收敛时的步数: 200
达到收敛时的步数: 2030
达到收敛时的步数: 330
达到收敛时的步数: 380
达到收敛时的步数: 6330
达到收敛时的步数: 520
达到收敛时的步数: 400
达到收敛时的步数: 3490
达到收敛时的步数: 280
达到收敛时的步数: 220
达到收敛时的步数: 410
达到收敛时的步数: 690
达到收敛时的步数: 690
达到收敛时的步数: 1580
达到收敛时的步数: 15743
达到收敛时的步数: 250
达到收敛时的步数: 160
达到收敛时的步数: 2660
达到收敛时的步数: 590
达到收敛时的步数: 2830
达到收敛时的步数: 820
达到收敛时的步数: 66628
达到收敛时的步数: 460
达到收敛时的步数: 1070
达到收敛时的步数: 2940
达到收敛时的步数: 600
达到收敛时的步数: 730
达到收敛时的步数: 800
达到收敛时的步数: 790
达到收敛时的步数: 76084
达到收敛时的步数: 450
达到收敛时的步数: 3985
达到收敛时的步数: 280
达到收敛时的步数: 1500
达到收敛时的步数: 3370
达到收敛时的步数: 1151
达到收敛时的步数: 1526
达到收敛时的步数: 530
达到收敛时的步数: 1310
达到收敛时的步数: 430
达到收敛时的步数: 880
达到收敛时的步数: 1170
达到收敛时的步数: 8539
达到收敛时的步数: 3340
达到收敛时的步数: 2370
达到收敛时的步数: 1590
达到收敛时的步数: 605
达到收敛时的步数: 5286
达到收敛时的步数: 320
达到收敛时的步数: 360
达到收敛时的步数: 1580
达到收敛时的步数: 11250
达到收敛时的步数: 24730
达到收敛时的步数: 4310
达到收敛时的步数: 330
达到收敛时的

2280886