# 导入必要库

In [1]:
from pathlib import Path
import gc
import time

from core import RLEnv
from core.agent import BaseAgent
from greedy import (
    EpsilonDecreasingConfig,
    GreedyAgent,
    greedy_average,
    epsilon_average,
    epsilon_decreasing_average,
)
from ucb1 import UCBAgent, ucb1
from thompson_sampling import TSAgent

from train import batch_train
from utils import plot_metrics_history, save_experiment_data, ProcessDataLogger

In [2]:
# --- Experiment scale ---------------------------------------------------
# Handed to batch_train as `steps` and to ProcessDataLogger as `total_steps`.
STEPS: int = 100_000
# Handed to ProcessDataLogger as `grid_size`.
GRID_SIZE: int = 500
# Handed to batch_train as `count`.
RUN_COUNT: int = 50

# --- Environment --------------------------------------------------------
SEED: int = 42
MACHINE_COUNT: int = 10

# --- Agent settings (forwarded to the agent factories via batch_train) ---
CONVERGENCE_THRESHOLD: float = 0.9
CONVERGENCE_MIN_STEPS: int = 100
ENABLE_OPTIMISTIC: bool = True
OPTIMISTIC_TIMES: int = 1

# --- Output location ----------------------------------------------------
# Resolved against the working directory at the time this cell runs.
EXPERIMENT_DATA_DIR: Path = Path.cwd().joinpath("experiment_data")

# --- Shared objects -----------------------------------------------------
# A single environment instance is reused by every experiment cell below.
ENV: RLEnv = RLEnv(machine_count=MACHINE_COUNT, seed=SEED)
EPSILON_CONFIG: EpsilonDecreasingConfig = EpsilonDecreasingConfig()

# 工厂函数

In [3]:
def get_run_id(agent_name: str) -> str:
    """Return a run identifier of the form ``<agent_name>_<timestamp>``.

    The timestamp is the current Unix time as an integer number of
    nanoseconds. An integer is used on purpose: the run id is embedded in
    the middle of output file names that end in ``.png``, and the ``.`` of a
    float timestamp would add a second dot to those names, which can
    confuse suffix-based path handling.

    Args:
        agent_name: Label placed in front of the timestamp.

    Returns:
        ``agent_name``, an underscore, and the integer timestamp.
    """
    return f"{agent_name}_{time.time_ns()}"

In [4]:
def create_greedy_agent(
    env: RLEnv,
    epsilon_config: EpsilonDecreasingConfig,
    optimistic_init: bool,
    optimistic_times: int,
    convergence_threshold: float,
    convergence_min_steps: int,
    seed: int,
) -> BaseAgent:
    """Build a ``GreedyAgent`` that uses ``greedy_average``.

    The agent is named after the algorithm function. Every argument is
    forwarded unchanged to the ``GreedyAgent`` constructor under the same
    keyword.

    Returns:
        The newly constructed agent.
    """
    algorithm = greedy_average
    agent: BaseAgent = GreedyAgent(
        name=algorithm.__name__,
        greedy_algorithm=algorithm,
        env=env,
        seed=seed,
        epsilon_config=epsilon_config,
        optimistic_init=optimistic_init,
        optimistic_times=optimistic_times,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
    )
    return agent


def create_epsilon_agent(
    env: RLEnv,
    epsilon_config: EpsilonDecreasingConfig,
    optimistic_init: bool,
    optimistic_times: int,
    convergence_threshold: float,
    convergence_min_steps: int,
    seed: int,
) -> BaseAgent:
    """Build a ``GreedyAgent`` that uses ``epsilon_average``.

    The agent is named after the algorithm function. Every argument is
    forwarded unchanged to the ``GreedyAgent`` constructor under the same
    keyword.

    Returns:
        The newly constructed agent.
    """
    algorithm = epsilon_average
    agent: BaseAgent = GreedyAgent(
        name=algorithm.__name__,
        greedy_algorithm=algorithm,
        env=env,
        seed=seed,
        epsilon_config=epsilon_config,
        optimistic_init=optimistic_init,
        optimistic_times=optimistic_times,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
    )
    return agent


def create_decreasing_agent(
    env: RLEnv,
    epsilon_config: EpsilonDecreasingConfig,
    optimistic_init: bool,
    optimistic_times: int,
    convergence_threshold: float,
    convergence_min_steps: int,
    seed: int,
) -> BaseAgent:
    """Build a ``GreedyAgent`` that uses ``epsilon_decreasing_average``.

    The agent is named after the algorithm function. Every argument is
    forwarded unchanged to the ``GreedyAgent`` constructor under the same
    keyword.

    Returns:
        The newly constructed agent.
    """
    algorithm = epsilon_decreasing_average
    agent: BaseAgent = GreedyAgent(
        name=algorithm.__name__,
        greedy_algorithm=algorithm,
        env=env,
        seed=seed,
        epsilon_config=epsilon_config,
        optimistic_init=optimistic_init,
        optimistic_times=optimistic_times,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
    )
    return agent

In [5]:
def create_ucb1_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    seed: int,
) -> BaseAgent:
    """Build a ``UCBAgent`` that uses the ``ucb1`` function.

    The agent is named after the algorithm function. Every argument is
    forwarded unchanged to the ``UCBAgent`` constructor under the same
    keyword.

    Returns:
        The newly constructed agent.
    """
    algorithm = ucb1
    agent: BaseAgent = UCBAgent(
        name=algorithm.__name__,
        ucb1_algorithm=algorithm,
        env=env,
        seed=seed,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
    )
    return agent

In [6]:
def create_ts_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    seed: int,
) -> BaseAgent:
    """Build a ``TSAgent`` named after its own class.

    Every argument is forwarded unchanged to the ``TSAgent`` constructor
    under the same keyword.

    Returns:
        The newly constructed agent.
    """
    agent_cls = TSAgent
    agent: BaseAgent = agent_cls(
        name=agent_cls.__name__,
        env=env,
        seed=seed,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
    )
    return agent

# 训练

## 普通贪婪算法

In [7]:
# Experiment cell: train with the create_greedy_agent factory, print the
# aggregated results, then write plots, data and the process log to disk.

# All outputs of this cell share one base path built from the run id and the
# main settings (T = steps, K = machine count, Q_0 = optimistic times).
run_id = get_run_id(greedy_average.__name__)
file_name: Path = (
    EXPERIMENT_DATA_DIR
    / f"{run_id}_T={STEPS}_K={MACHINE_COUNT}_Q_0={OPTIMISTIC_TIMES}.png"
)
process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)

# The keyword arguments here cover every parameter of create_greedy_agent;
# presumably batch_train forwards them to the factory — confirm in train.py.
agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_greedy_agent,
    env=ENV,
    epsilon_config=EPSILON_CONFIG,
    steps=STEPS,
    seed=SEED,
    optimistic_init=ENABLE_OPTIMISTIC,
    optimistic_times=OPTIMISTIC_TIMES,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
)
print(metrics)
print(reward)

# Both plot calls receive the same file_name; only x_log differs.
# NOTE(review): whether the second call overwrites the first depends on
# plot_metrics_history — confirm.
plot_metrics_history(agents, "贪婪算法", file_name, x_log=False)
plot_metrics_history(agents, "贪婪算法", file_name, x_log=True)
save_experiment_data(reward, metrics, file_name)
# The process log goes to a sibling path whose stem has "process" appended
# directly (no separator).
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# `keys` is not part of the `del` below, so it stays defined after this cell.
keys = list(dump.points[0].data.keys())

# Drop the references to this run's results and collect garbage before the
# next experiment cell. As the last expression of the cell, the return value
# of gc.collect() is what the notebook displays.
del agents, reward, metrics, process_logger, dump
gc.collect()

达到收敛时的步数: 190
达到收敛时的步数: 510
达到收敛时的步数: 250
达到收敛时的步数: 130
达到收敛时的步数: 560
达到收敛时的步数: 150
达到收敛时的步数: 160
达到收敛时的步数: 170
达到收敛时的步数: 150
达到收敛时的步数: 200
达到收敛时的步数: 250
达到收敛时的步数: 180
达到收敛时的步数: 440
达到收敛时的步数: 140
达到收敛时的步数: 190
达到收敛时的步数: 190
达到收敛时的步数: 1190
达到收敛时的步数: 390
达到收敛时的步数: 140
达到收敛时的步数: 190
达到收敛时的步数: 130
达到收敛时的步数: 210
达到收敛时的步数: 300
达到收敛时的步数: 210
达到收敛时的步数: 520
达到收敛时的步数: 210
达到收敛时的步数: 140
达到收敛时的步数: 520
达到收敛时的步数: 130
达到收敛时的步数: 270
达到收敛时的步数: 210
达到收敛时的步数: 160
达到收敛时的步数: 480
达到收敛时的步数: 410
avg_regret=3996.850909090912 avg_regret_rate=0.04396536000000002 avg_total_reward=86912.24 avg_optimal_rate=0.6798143999999999 avg_convergence_steps=193.4 avg_convergence_rate=0.68
values=[0.5, 8725.78, 2.62, 0.72, 0.26, 1.34, 16376.84, 61803.06, 0.14, 0.98] counts=[1.54, 12002.72, 4.06, 1.74, 1.26, 2.46, 20001.64, 67981.44, 1.14, 2.0]
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/experiment_data/greedy_average_1757851137

33

## UCB1算法

In [8]:
# Experiment cell: train with the create_ucb1_agent factory, print the
# aggregated results, then write plots, data and the process log to disk.

# All outputs of this cell share one base path built from the run id and the
# main settings.
# NOTE(review): the name still embeds Q_0=OPTIMISTIC_TIMES although no
# optimistic_* argument is passed to batch_train in this cell.
run_id = get_run_id(ucb1.__name__)
file_name: Path = (
    EXPERIMENT_DATA_DIR
    / f"{run_id}_T={STEPS}_K={MACHINE_COUNT}_Q_0={OPTIMISTIC_TIMES}.png"
)
process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)

# The keyword arguments here cover every parameter of create_ucb1_agent;
# presumably batch_train forwards them to the factory — confirm in train.py.
agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_ucb1_agent,
    env=ENV,
    steps=STEPS,
    seed=SEED,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
)
print(metrics)
print(reward)

# Both plot calls receive the same file_name; only x_log differs.
# NOTE(review): whether the second call overwrites the first depends on
# plot_metrics_history — confirm.
plot_metrics_history(agents, "UCB1 算法", file_name, x_log=False)
plot_metrics_history(agents, "UCB1 算法", file_name, x_log=True)
save_experiment_data(reward, metrics, file_name)
# The process log goes to a sibling path whose stem has "process" appended
# directly (no separator).
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# `keys` is not part of the `del` below, so it stays defined after this cell.
keys = list(dump.points[0].data.keys())

# Drop the references to this run's results and collect garbage before the
# next experiment cell. As the last expression of the cell, the return value
# of gc.collect() is what the notebook displays.
del agents, reward, metrics, process_logger, dump
gc.collect()

达到收敛时的步数: 20550
达到收敛时的步数: 25740
达到收敛时的步数: 24000
达到收敛时的步数: 24400
达到收敛时的步数: 24230
达到收敛时的步数: 33740
达到收敛时的步数: 24630
达到收敛时的步数: 29670
达到收敛时的步数: 26380
达到收敛时的步数: 19370
达到收敛时的步数: 23990
达到收敛时的步数: 23520
达到收敛时的步数: 25750
达到收敛时的步数: 22280
达到收敛时的步数: 27480
达到收敛时的步数: 25100
达到收敛时的步数: 22840
达到收敛时的步数: 24090
达到收敛时的步数: 24890
达到收敛时的步数: 24650
达到收敛时的步数: 20870
达到收敛时的步数: 24570
达到收敛时的步数: 25730
达到收敛时的步数: 22000
达到收敛时的步数: 24630
达到收敛时的步数: 26310
达到收敛时的步数: 27790
达到收敛时的步数: 30100
达到收敛时的步数: 31580
达到收敛时的步数: 20090
达到收敛时的步数: 27700
达到收敛时的步数: 20480
达到收敛时的步数: 19880
达到收敛时的步数: 20070
达到收敛时的步数: 25670
达到收敛时的步数: 22890
达到收敛时的步数: 24420
达到收敛时的步数: 19100
达到收敛时的步数: 24310
达到收敛时的步数: 23840
达到收敛时的步数: 22450
达到收敛时的步数: 20000
达到收敛时的步数: 23020
达到收敛时的步数: 21860
达到收敛时的步数: 26010
达到收敛时的步数: 23310
达到收敛时的步数: 28860
达到收敛时的步数: 19730
达到收敛时的步数: 25060
达到收敛时的步数: 27830
avg_regret=609.3109090909118 avg_regret_rate=0.00670242000000003 avg_total_reward=90299.78 avg_optimal_rate=0.9663803999999999 avg_convergence_steps=24349.2 avg_convergence_rate=1.0
values=[13.58, 436

68

## Thompson Sampling 算法

In [9]:
# Experiment cell: train with the create_ts_agent factory, print the
# aggregated results, then write plots, data and the process log to disk.

# All outputs of this cell share one base path built from the run id and the
# main settings. The run id label is a string literal here, not a
# function's __name__ as in the other experiment cells.
# NOTE(review): the name still embeds Q_0=OPTIMISTIC_TIMES although no
# optimistic_* argument is passed to batch_train in this cell.
run_id = get_run_id("thompson_sampling")
file_name: Path = (
    EXPERIMENT_DATA_DIR
    / f"{run_id}_T={STEPS}_K={MACHINE_COUNT}_Q_0={OPTIMISTIC_TIMES}.png"
)
process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)

# The keyword arguments here cover every parameter of create_ts_agent;
# presumably batch_train forwards them to the factory — confirm in train.py.
agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_ts_agent,
    env=ENV,
    steps=STEPS,
    seed=SEED,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
)
print(metrics)
print(reward)

# Both plot calls receive the same file_name; only x_log differs.
# NOTE(review): whether the second call overwrites the first depends on
# plot_metrics_history — confirm.
plot_metrics_history(agents, "TS 算法", file_name, x_log=False)
plot_metrics_history(agents, "TS 算法", file_name, x_log=True)
save_experiment_data(reward, metrics, file_name)
# The process log goes to a sibling path whose stem has "process" appended
# directly (no separator).
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# `keys` is not part of the `del` below, so it stays defined after this cell.
keys = list(dump.points[0].data.keys())

# Drop the references to this run's results and collect garbage. As the last
# expression of the cell, the return value of gc.collect() is what the
# notebook displays.
del agents, reward, metrics, process_logger, dump
gc.collect()

达到收敛时的步数: 2900
达到收敛时的步数: 980
达到收敛时的步数: 3950
达到收敛时的步数: 710
达到收敛时的步数: 610
达到收敛时的步数: 1100
达到收敛时的步数: 3640
达到收敛时的步数: 2750
达到收敛时的步数: 1430
达到收敛时的步数: 1020
达到收敛时的步数: 2010
达到收敛时的步数: 790
达到收敛时的步数: 1530
达到收敛时的步数: 2460
达到收敛时的步数: 520
达到收敛时的步数: 2370
达到收敛时的步数: 1230
达到收敛时的步数: 3250
达到收敛时的步数: 1840
达到收敛时的步数: 460
达到收敛时的步数: 1130
达到收敛时的步数: 1420
达到收敛时的步数: 1030
达到收敛时的步数: 600
达到收敛时的步数: 670
达到收敛时的步数: 2330
达到收敛时的步数: 610
达到收敛时的步数: 2110
达到收敛时的步数: 290
达到收敛时的步数: 1240
达到收敛时的步数: 1140
达到收敛时的步数: 1050
达到收敛时的步数: 1020
达到收敛时的步数: 810
达到收敛时的步数: 5240
达到收敛时的步数: 1340
达到收敛时的步数: 970
达到收敛时的步数: 1520
达到收敛时的步数: 2300
达到收敛时的步数: 420
达到收敛时的步数: 1600
达到收敛时的步数: 2230
达到收敛时的步数: 770
达到收敛时的步数: 670
达到收敛时的步数: 5620
达到收敛时的步数: 450
达到收敛时的步数: 1150
达到收敛时的步数: 2670
达到收敛时的步数: 390
达到收敛时的步数: 930
avg_regret=49.61090909091174 avg_regret_rate=0.0005457200000000289 avg_total_reward=90859.48 avg_optimal_rate=0.9971712000000003 avg_convergence_steps=1585.4 avg_convergence_rate=1.0
values=[2.6, 32.76, 15.46, 3.24, 1.12, 8.28, 128.64, 90662.26, 0.42, 4.7] counts=[8.4

228748