# 贪婪算法的对比与优化

## 下面为每种算法创建一批 Agent，并统计平均总奖励、后悔值（regret）和最优动作命中率

In [1]:
from typing import Callable, List, Tuple

from core import EpsilonDecreasingConfig, GreedyAgent, Rewards, RLEnv
from train import train, AverageMetrics
from algorithms import (
    greedy_normal,
    epsilon_greedy,
    epsilon_decreasing_greedy,
    greedy_average,
    epsilon_average,
    epsilon_decreasing_average,
)

In [2]:
# Base seed: used for the environment and as the first agent seed in each batch.
SEED = 42
# Number of agents trained per algorithm.
COUNT = 50
# Number of steps passed to every training run.
STEPS = 1000

# A single environment instance, reused by all experiments in this notebook.
env = RLEnv(seed=SEED)

In [None]:
def batch_train(
    count: int,
    greedy_algorithm: Callable[..., int],
    env: RLEnv,
    epsilon_config: EpsilonDecreasingConfig,
    steps: int,
    seed: int,
) -> Tuple[List[GreedyAgent], Rewards, AverageMetrics]:
    """Create ``count`` agents that share one algorithm and train them together.

    Every agent receives the same environment, algorithm and epsilon
    configuration; only the seed differs (``seed``, ``seed + 1``, ...).

    Args:
        count: Number of agents to create.
        greedy_algorithm: Action-selection callable handed to each
            ``GreedyAgent``. Its ``__name__`` is used as the agent name; for
            callables without one (e.g. ``functools.partial`` objects) the
            name of the callable's type is used instead.
        env: Environment passed to every agent.
        epsilon_config: Epsilon configuration passed unchanged to every agent.
        steps: Number of steps forwarded to ``train``.
        seed: Base seed; agent ``i`` is created with ``seed + i``.

    Returns:
        The three values produced by ``train``: the trained agents, the
        averaged rewards and the averaged metrics.

    Raises:
        ValueError: If ``greedy_algorithm`` is not callable.
    """
    if not callable(greedy_algorithm):
        raise ValueError("算法必须传入一个函数")

    # callable() also accepts functools.partial objects and instances that
    # define __call__, neither of which is guaranteed to have __name__, so
    # fall back to the type name instead of raising AttributeError.
    agent_name = getattr(
        greedy_algorithm, "__name__", type(greedy_algorithm).__name__
    )

    candidates: List[GreedyAgent] = [
        GreedyAgent(
            name=agent_name,
            env=env,
            greedy_algorithm=greedy_algorithm,
            epsilon_config=epsilon_config,
            seed=seed + offset,  # distinct seed per agent
        )
        for offset in range(count)
    ]

    agents, reward, metrics = train(candidates, steps)

    return agents, reward, metrics

## 累积奖励

### 普通贪婪算法

In [4]:
# Plain greedy algorithm (cumulative-reward section):
# train COUNT agents with greedy_normal and print the averaged results.
agnts, reward, metrics = batch_train(
    greedy_algorithm=greedy_normal,
    env=env,
    epsilon_config=EpsilonDecreasingConfig(),
    count=COUNT,
    steps=STEPS,
    seed=SEED,
)

print(f"anget 名称: {agnts[0].name}\n平均奖励：{reward}\n指标：{metrics}")

anget 名称: greedy_normal
平均奖励：Rewards(values=[90.22, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], counts=[1000.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
指标：AverageMetrics(avg_regret=818.8709090909091, avg_regret_rate=0.9007579999999998, avg_total_reward=90.22, avg_optimal_rate=0.0)


### 随机探索贪婪算法

In [5]:
# Random-exploration greedy algorithm (cumulative-reward section):
# train COUNT agents with epsilon_greedy and print the averaged results.
agnts, reward, metrics = batch_train(
    greedy_algorithm=epsilon_greedy,
    env=env,
    epsilon_config=EpsilonDecreasingConfig(),
    count=COUNT,
    steps=STEPS,
    seed=SEED,
)

print(f"anget 名称: {agnts[0].name}\n平均奖励：{reward}\n指标：{metrics}")

anget 名称: epsilon_greedy
平均奖励：Rewards(values=[41.9, 4.9, 7.46, 35.92, 20.38, 23.6, 50.8, 46.7, 52.02, 91.64], counts=[445.28, 27.96, 27.84, 99.06, 46.16, 44.22, 80.8, 64.38, 63.72, 100.58])
指标：AverageMetrics(avg_regret=533.7709090909085, avg_regret_rate=0.5871479999999998, avg_total_reward=375.32, avg_optimal_rate=0.10057999999999997)


### 退火随机探索贪婪算法

In [6]:
# Annealed random-exploration greedy algorithm (cumulative-reward section):
# train COUNT agents with epsilon_decreasing_greedy and print the averaged results.
agnts, reward, metrics = batch_train(
    greedy_algorithm=epsilon_decreasing_greedy,
    env=env,
    epsilon_config=EpsilonDecreasingConfig(),
    count=COUNT,
    steps=STEPS,
    seed=SEED,
)

print(f"anget 名称: {agnts[0].name}\n平均奖励：{reward}\n指标：{metrics}")

anget 名称: epsilon_decreasing_greedy
平均奖励：Rewards(values=[2.04, 6.72, 13.56, 7.06, 30.56, 63.26, 104.98, 97.02, 201.04, 133.52], counts=[18.76, 36.82, 51.22, 20.02, 67.82, 115.8, 165.12, 132.28, 245.14, 147.02])
指标：AverageMetrics(avg_regret=249.33090909090876, avg_regret_rate=0.2742639999999999, avg_total_reward=659.76, avg_optimal_rate=0.14701999999999996)


## 平均奖励

### 普通贪婪算法

In [7]:
# Plain greedy algorithm (average-reward section):
# train COUNT agents with greedy_average and print the averaged results.
agnts, reward, metrics = batch_train(
    greedy_algorithm=greedy_average,
    env=env,
    epsilon_config=EpsilonDecreasingConfig(),
    count=COUNT,
    steps=STEPS,
    seed=SEED,
)

print(f"anget 名称: {agnts[0].name}\n平均奖励：{reward}\n指标：{metrics}")

anget 名称: greedy_average
平均奖励：Rewards(values=[91.42, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], counts=[1000.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
指标：AverageMetrics(avg_regret=817.6709090909089, avg_regret_rate=0.899438, avg_total_reward=91.42, avg_optimal_rate=0.0)


### 随机探索贪婪算法

In [8]:
# Random-exploration greedy algorithm (average-reward section):
# train COUNT agents with epsilon_average and print the averaged results.
agnts, reward, metrics = batch_train(
    greedy_algorithm=epsilon_average,
    env=env,
    epsilon_config=EpsilonDecreasingConfig(),
    count=COUNT,
    steps=STEPS,
    seed=SEED,
)

print(f"anget 名称: {agnts[0].name}\n平均奖励：{reward}\n指标：{metrics}")

anget 名称: epsilon_average
平均奖励：Rewards(values=[2.7, 1.86, 3.32, 4.82, 6.18, 7.66, 13.6, 27.66, 167.46, 585.36], counts=[29.44, 10.44, 11.0, 13.02, 13.28, 14.72, 21.62, 38.22, 203.9, 644.36])
指标：AverageMetrics(avg_regret=88.470909090909, avg_regret_rate=0.09731799999999996, avg_total_reward=820.62, avg_optimal_rate=0.64436)


### 退火随机探索贪婪算法

In [9]:
# Annealed random-exploration greedy algorithm (average-reward section):
# train COUNT agents with epsilon_decreasing_average and print the averaged results.
agnts, reward, metrics = batch_train(
    greedy_algorithm=epsilon_decreasing_average,
    env=env,
    epsilon_config=EpsilonDecreasingConfig(),
    count=COUNT,
    steps=STEPS,
    seed=SEED,
)

print(f"anget 名称: {agnts[0].name}\n平均奖励：{reward}\n指标：{metrics}")

anget 名称: epsilon_decreasing_average
平均奖励：Rewards(values=[1.62, 3.92, 4.8, 7.46, 9.02, 11.04, 14.76, 15.74, 91.86, 655.74], counts=[18.7, 20.7, 18.94, 20.3, 19.86, 20.1, 23.04, 22.5, 111.84, 724.02])
指标：AverageMetrics(avg_regret=93.13090909090901, avg_regret_rate=0.10244399999999995, avg_total_reward=815.96, avg_optimal_rate=0.72402)
