# 导入必要库

In [1]:
from pathlib import Path
import time

from core import RLEnv
from core import BaseAgent
from core.schemas import PiecewizeMethod  # noqa
from greedy import EpsilonDecreasingConfig, GreedyAgent, GreedyAlgorithm, GreedyType
from ucb1 import UCBAgent, UCB1Algorithm
from thompson_sampling import TSAgent, TSAlgorithm

from train import batch_train
from utils import plot_metrics_history, save_experiment_data, ProcessDataLogger, clear_var

In [2]:
# Shared experiment configuration read by the training cells below.

# Steps per training run; passed as `steps` to batch_train and as
# `total_steps` to ProcessDataLogger.
STEPS: int = 100_000
# Passed as `grid_size` to ProcessDataLogger.
GRID_SIZE: int = 500
# Passed as `num_workers` to batch_train.
NUM_WORKERS: int = 10

# Seed handed to both the environment and batch_train.
SEED: int = 42
# Number of machines (arms) in the environment.
MACHINE_COUNT: int = 10
# Passed as `count` to batch_train.
RUN_COUNT: int = 500
# Convergence settings forwarded unchanged to every agent factory.
CONVERGENCE_THRESHOLD: float = 0.9
CONVERGENCE_MIN_STEPS: int = 100
# Optimistic-initialisation settings; only the greedy experiment passes them.
OPTIMISTIC_TIMES: int = 1
ENABLE_OPTIMISTIC: bool = True
# Directory under the current working directory used to build output paths.
EXPERIMENT_DATA_DIR: Path = Path.cwd() / "experiment_data"
# Passed as `constant_stepsize` to the greedy and UCB1 experiments.
CONSTANT_STEPSIZE: float = 0.01
# Passed as `discount_factor` to the Thompson Sampling experiment.
DISCOUNT_FACTOR: float = 0.9

# Template environment; each experiment cell works on `ENV.clone()`.
ENV: RLEnv = RLEnv(
    machine_count=MACHINE_COUNT,
    # NOTE(review): presumably the random-walk interval and the number of
    # machines perturbed per walk -- confirm against RLEnv.
    random_walk_internal=1,
    random_walk_machine_num=1,
    # Alternative piecewise setting, currently disabled:
    # piecewise_internal=1,
    # piecewize_method=PiecewizeMethod.UPSIDE_DOWN,
    seed=SEED,
)
# Epsilon schedule built with its default arguments; used by the greedy experiment.
EPSILON_CONFIG: EpsilonDecreasingConfig = EpsilonDecreasingConfig()

# 工厂函数

In [3]:
def get_run_id(agent_name: str) -> str:
    """Return a run identifier of the form ``<agent_name>_<timestamp>``.

    The timestamp is the string form of ``time.time()`` taken at call time.

    Args:
        agent_name: Label placed before the underscore.

    Returns:
        The label, an underscore and the current timestamp, concatenated.
    """
    timestamp = str(time.time())
    # Plain ``+`` concatenation is kept so str-subclass labels behave as before.
    return agent_name + "_" + timestamp

In [4]:
def create_greedy_agent(
    env: RLEnv,
    epsilon_config: EpsilonDecreasingConfig,
    optimistic_init: bool,
    optimistic_times: int,
    convergence_threshold: float,
    convergence_min_steps: int,
    constant_stepsize: float,
    seed: int,
) -> BaseAgent:
    """Build a ``GreedyAgent`` that uses the ``GreedyType.GREEDY`` variant.

    Args:
        env: Environment the agent is attached to.
        epsilon_config: Epsilon configuration forwarded to the agent.
        optimistic_init: Forwarded to ``GreedyAlgorithm``.
        optimistic_times: Forwarded to ``GreedyAlgorithm``.
        convergence_threshold: Forwarded to the agent.
        convergence_min_steps: Forwarded to the agent.
        constant_stepsize: Forwarded to the agent.
        seed: Forwarded to the agent.

    Returns:
        The configured agent, named after ``GreedyType.GREEDY``.
    """
    variant = GreedyType.GREEDY
    greedy_algorithm = GreedyAlgorithm(
        greedy_type=variant,
        optimistic_init=optimistic_init,
        optimistic_times=optimistic_times,
    )
    agent = GreedyAgent(
        name=variant,
        env=env,
        algorithm=greedy_algorithm,
        epsilon_config=epsilon_config,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
        constant_stepsize=constant_stepsize,
        seed=seed,
    )
    return agent

In [5]:
def create_ucb1_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    constant_stepsize: float,
    seed: int,
) -> BaseAgent:
    """Build a ``UCBAgent`` named ``"UCB1"`` with a fresh ``UCB1Algorithm``.

    Args:
        env: Environment the agent is attached to.
        convergence_threshold: Forwarded to the agent.
        convergence_min_steps: Forwarded to the agent.
        constant_stepsize: Forwarded to the agent.
        seed: Forwarded to the agent.

    Returns:
        The configured agent.
    """
    ucb_algorithm = UCB1Algorithm()
    agent = UCBAgent(
        name="UCB1",
        env=env,
        algorithm=ucb_algorithm,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
        constant_stepsize=constant_stepsize,
        seed=seed,
    )
    return agent

In [6]:
def create_ts_agent(
    env: RLEnv,
    convergence_threshold: float,
    convergence_min_steps: int,
    discount_factor: float,
    seed: int,
) -> BaseAgent:
    """Build a ``TSAgent`` named ``"Thompson Sampling"`` with a fresh ``TSAlgorithm``.

    Args:
        env: Environment the agent is attached to.
        convergence_threshold: Forwarded to the agent.
        convergence_min_steps: Forwarded to the agent.
        discount_factor: Forwarded to the agent.
        seed: Forwarded to the agent.

    Returns:
        The configured agent.
    """
    ts_algorithm = TSAlgorithm()
    agent = TSAgent(
        name="Thompson Sampling",
        env=env,
        algorithm=ts_algorithm,
        convergence_threshold=convergence_threshold,
        convergence_min_steps=convergence_min_steps,
        discount_factor=discount_factor,
        seed=seed,
    )
    return agent

# 训练

## 普通贪婪算法

In [7]:
# Greedy experiment: train RUN_COUNT greedy agents, then plot and save the results.

# Run identifier and the output path derived from it.
run_id = get_run_id(GreedyType.GREEDY)
file_name: Path = (
    EXPERIMENT_DATA_DIR
    / f"{run_id}.png"
)
process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)
# Train on a clone of the shared ENV template rather than on ENV itself.
env = ENV.clone()

# Keyword arguments other than count/agent_factory/steps/num_workers/process_logger
# match the parameters of create_greedy_agent.
agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_greedy_agent,
    env=env,
    epsilon_config=EPSILON_CONFIG,
    steps=STEPS,
    seed=SEED,
    optimistic_init=ENABLE_OPTIMISTIC,
    optimistic_times=OPTIMISTIC_TIMES,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
    constant_stepsize=CONSTANT_STEPSIZE,
    num_workers=NUM_WORKERS,
)
print(metrics)
print(reward)

# NOTE(review): both plots receive the same file_name; confirm that
# plot_metrics_history derives distinct output paths for x_log=False/True,
# otherwise the second call may overwrite the first.
plot_metrics_history(agents, "贪婪算法", file_name, x_log=False)
plot_metrics_history(agents, "贪婪算法", file_name, x_log=True)
save_experiment_data(reward, metrics, file_name)
# Process data goes to the same path with "process" appended to the file stem.
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# NOTE(review): `keys` is not read again in this cell; confirm whether a later cell needs it.
keys = list(dump.points[0].data.keys())

# NOTE(review): these reads use the module-level ENV template, not the `env`
# clone passed to training -- confirm this is intended.
print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

# Hand this cell's large objects to the project helper clear_var
# (presumably to release memory before the next experiment -- confirm).
clear_var(env, agents, reward, metrics, process_logger, dump)

达到收敛时的步数: 100达到收敛时的步数: 100
达到收敛时的步数: 100达到收敛时的步数: 130达到收敛时的步数: 100



达到收敛时的步数: 100
达到收敛时的步数: 2121
达到收敛时的步数: 4172
达到收敛时的步数: 287
达到收敛时的步数: 239
达到收敛时的步数: 170
达到收敛时的步数: 20568
达到收敛时的步数: 7233
达到收敛时的步数: 11257
达到收敛时的步数: 24158
达到收敛时的步数: 100
达到收敛时的步数: 3075
达到收敛时的步数: 551
达到收敛时的步数: 23264
达到收敛时的步数: 100
达到收敛时的步数: 1455
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 64727
达到收敛时的步数: 100
达到收敛时的步数: 110
达到收敛时的步数: 5521
达到收敛时的步数: 3218
达到收敛时的步数: 6335
达到收敛时的步数: 3261
达到收敛时的步数: 10384
达到收敛时的步数: 2919
达到收敛时的步数: 110
达到收敛时的步数: 100
达到收敛时的步数: 2739
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 2241
达到收敛时的步数: 225
达到收敛时的步数: 58034
达到收敛时的步数: 100
达到收敛时的步数: 127
达到收敛时的步数: 100
达到收敛时的步数: 100
达到收敛时的步数: 6800
达到收敛时的步数: 100
达到收敛时的步数: 71164
达到收敛时的步数: 5398
达到收敛时的步数: 120
达到收敛时的步数: 8795
达到收敛时的步数: 5961
达到收敛时的步数: 100
达到收敛时的步数: 120
达到收敛时的步数: 4381
达到收敛时的步数: 18614
达到收敛时的步数: 10740
达到收敛时的步数: 140
达到收敛时的步数: 100
达到收敛时的步数: 584
达到收敛时的步数: 100
达到收敛时的步数: 13816
达到收敛时的步数: 120
达到收敛时的步数: 100
达到收敛时的步数: 110
达到收敛时的步数: 100
达到收敛时的步数: 55104
达到收敛时的步数: 116
达到收敛时的步数: 100
达到收敛时的步

## UCB1算法

In [8]:
# UCB1 experiment: train RUN_COUNT UCB1 agents, then plot and save the results.

# Run identifier and the output path derived from it.
run_id = get_run_id("ucb1")
file_name: Path = (
    EXPERIMENT_DATA_DIR
    / f"{run_id}.png"
)
process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)
# Train on a clone of the shared ENV template rather than on ENV itself.
env = ENV.clone()

# Keyword arguments other than count/agent_factory/steps/num_workers/process_logger
# match the parameters of create_ucb1_agent.
agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_ucb1_agent,
    env=env,
    steps=STEPS,
    seed=SEED,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
    constant_stepsize=CONSTANT_STEPSIZE,
    num_workers=NUM_WORKERS,
)
print(metrics)
print(reward)

# NOTE(review): both plots receive the same file_name; confirm that
# plot_metrics_history derives distinct output paths for x_log=False/True,
# otherwise the second call may overwrite the first.
plot_metrics_history(agents, "UCB1 算法", file_name, x_log=False)
plot_metrics_history(agents, "UCB1 算法", file_name, x_log=True)
save_experiment_data(reward, metrics, file_name)
# Process data goes to the same path with "process" appended to the file stem.
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# NOTE(review): `keys` is not read again in this cell; confirm whether a later cell needs it.
keys = list(dump.points[0].data.keys())

# NOTE(review): these reads use the module-level ENV template, not the `env`
# clone passed to training -- confirm this is intended.
print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

# Hand this cell's large objects to the project helper clear_var
# (presumably to release memory before the next experiment -- confirm).
clear_var(env, agents, reward, metrics, process_logger, dump)

达到收敛时的步数: 1980
达到收敛时的步数: 2220
达到收敛时的步数: 2026达到收敛时的步数: 1780

达到收敛时的步数: 2280
达到收敛时的步数: 2751
达到收敛时的步数: 2320
达到收敛时的步数: 2160
达到收敛时的步数: 1830
达到收敛时的步数: 1920
达到收敛时的步数: 2000
达到收敛时的步数: 2281
达到收敛时的步数: 2104
达到收敛时的步数: 2310
达到收敛时的步数: 2080
达到收敛时的步数: 11310
达到收敛时的步数: 2280
达到收敛时的步数: 1790
达到收敛时的步数: 2370
达到收敛时的步数: 2480
达到收敛时的步数: 2050
达到收敛时的步数: 1828
达到收敛时的步数: 2590
达到收敛时的步数: 2450
达到收敛时的步数: 2430
达到收敛时的步数: 2000
达到收敛时的步数: 2690
达到收敛时的步数: 1870
达到收敛时的步数: 3029
达到收敛时的步数: 1810
达到收敛时的步数: 8605
达到收敛时的步数: 2080
达到收敛时的步数: 2110
达到收敛时的步数: 2270
达到收敛时的步数: 1730
达到收敛时的步数: 2690
达到收敛时的步数: 1760
达到收敛时的步数: 1710
达到收敛时的步数: 2200
达到收敛时的步数: 1730
达到收敛时的步数: 2862
达到收敛时的步数: 1720
达到收敛时的步数: 2960
达到收敛时的步数: 2300
达到收敛时的步数: 1907
达到收敛时的步数: 2170
达到收敛时的步数: 2598
达到收敛时的步数: 1910
达到收敛时的步数: 1930
达到收敛时的步数: 1760
达到收敛时的步数: 1880
达到收敛时的步数: 2140
达到收敛时的步数: 2530
达到收敛时的步数: 2340
达到收敛时的步数: 1770
达到收敛时的步数: 1950
达到收敛时的步数: 1861
达到收敛时的步数: 1840
达到收敛时的步数: 2130
达到收敛时的步数: 2580
达到收敛时的步数: 2074
达到收敛时的步数: 1680
达到收敛时的步数: 2520
达到收敛时的步数: 2060
达到收敛时的步数: 1820
达到收敛时的步数: 2820
达到收敛时的步数:

## Thompson Sampling 算法

In [9]:
# Thompson Sampling experiment: train RUN_COUNT TS agents, then plot and save the results.

# Run identifier and the output path derived from it.
run_id = get_run_id("ts")
file_name: Path = (
    EXPERIMENT_DATA_DIR
    / f"{run_id}.png"
)
process_logger = ProcessDataLogger(
    run_id=run_id,
    total_steps=STEPS,
    grid_size=GRID_SIZE,
)
# Train on a clone of the shared ENV template rather than on ENV itself.
env = ENV.clone()

# Keyword arguments other than count/agent_factory/steps/num_workers/process_logger
# match the parameters of create_ts_agent (discount_factor instead of constant_stepsize).
agents, reward, metrics = batch_train(
    count=RUN_COUNT,
    agent_factory=create_ts_agent,
    env=env,
    steps=STEPS,
    seed=SEED,
    convergence_threshold=CONVERGENCE_THRESHOLD,
    convergence_min_steps=CONVERGENCE_MIN_STEPS,
    process_logger=process_logger,
    discount_factor=DISCOUNT_FACTOR,
    num_workers=NUM_WORKERS,
)
print(metrics)
print(reward)

# NOTE(review): both plots receive the same file_name; confirm that
# plot_metrics_history derives distinct output paths for x_log=False/True,
# otherwise the second call may overwrite the first.
plot_metrics_history(agents, "TS 算法", file_name, x_log=False)
plot_metrics_history(agents, "TS 算法", file_name, x_log=True)
save_experiment_data(reward, metrics, file_name)
# Process data goes to the same path with "process" appended to the file stem.
process_logger.save(file_name.with_stem(file_name.stem + "process"), total_steps=STEPS)
dump = process_logger.export(total_steps=STEPS)
# NOTE(review): `keys` is not read again in this cell; confirm whether a later cell needs it.
keys = list(dump.points[0].data.keys())

# NOTE(review): these reads use the module-level ENV template, not the `env`
# clone passed to training -- confirm this is intended.
print(ENV.best_reward(1000))
for m in ENV.machines:
    print(m.reward_probability)

# Hand this cell's large objects to the project helper clear_var
# (presumably to release memory before the next experiment -- confirm).
clear_var(env, agents, reward, metrics, process_logger, dump)

达到收敛时的步数: 580
达到收敛时的步数: 221
达到收敛时的步数: 2310
达到收敛时的步数: 450
达到收敛时的步数: 300
达到收敛时的步数: 210
达到收敛时的步数: 200
达到收敛时的步数: 860
达到收敛时的步数: 4400
达到收敛时的步数: 170
达到收敛时的步数: 730
达到收敛时的步数: 170
达到收敛时的步数: 100
达到收敛时的步数: 1410
达到收敛时的步数: 650
达到收敛时的步数: 740
达到收敛时的步数: 530
达到收敛时的步数: 660
达到收敛时的步数: 8500
达到收敛时的步数: 3180
达到收敛时的步数: 960
达到收敛时的步数: 580
达到收敛时的步数: 6480
达到收敛时的步数: 590
达到收敛时的步数: 250
达到收敛时的步数: 660
达到收敛时的步数: 400
达到收敛时的步数: 1250
达到收敛时的步数: 6160
达到收敛时的步数: 1680
达到收敛时的步数: 200
达到收敛时的步数: 430
达到收敛时的步数: 290
avg_regret=3042.3525985218284 avg_regret_rate=0.025344686070432103 avg_total_reward=86675.612 avg_optimal_rate=0.15593688 avg_convergence_steps=92.602 avg_convergence_rate=0.066
values=[8938.282, 8544.82, 8775.028, 8714.386, 8117.32, 9096.252, 8354.636, 9125.53, 8369.94, 8639.418] counts=[10272.94, 9860.322, 10140.022, 10040.71, 9352.564, 10506.758, 9659.37, 10510.814, 9695.54, 9960.96]
✅ 字体文件 /home/Jese__Ki/Projects/learn/Python/rl_atomic/bandit/assets/微软雅黑.ttf 已加载
✅ 图表已保存至 /home/Jese__Ki/Projects/learn/Python/rl_atomic/ba