In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim

# ---------------------------
# 环境定义（固定10000步版本）
# ---------------------------
class UAVEnv:
    """Single-UAV alarm-servicing environment with a nominal 10000-tick horizon.

    The UAV moves between three locations (0 = L0/home, 1 = L1, 2 = L2).
    Travelling a distance ``d`` costs ``d`` units of battery and ``d`` ticks.
    Alarms appear at L1 and L2 at random; each alarm's age grows by one per
    tick until the UAV clears it (action 3) or it exceeds ``max_alarm_age``
    and is dropped with a penalty.

    The observation is a float32 vector of length 6:
    ``[pos one-hot (3), battery / capacity, age_L1 / max_age, age_L2 / max_age]``
    with both ages clipped to ``[0, 1]``.
    """

    def __init__(self):
        # Locations: 0 = L0 (home), 1 = L1, 2 = L2.
        # Physical parameters.
        self.d_L0_L1 = 10
        self.d_L0_L2 = 20
        self.d_L1_L2 = 15  # assumed symmetric distance between L1 and L2
        self.charging_rate = 20  # battery gained per tick while at L0
        self.remove_cost = 5     # battery spent to clear one alarm (tunable)
        self.battery_capacity = 80

        # Per-tick alarm generation probabilities (independent per location).
        self.p_L1 = 0.017
        self.p_L2 = 0.012

        # Penalty and reward shaping parameters.
        self.max_alarm_age = 100  # maximum tolerated waiting time
        self.anchor_penalty = 50  # soft anchoring penalty (episode continues)
        self.max_age_penalty = 50
        self.clear_reward_base = 10  # not read anywhere inside this class

        # Nominal horizon; see step() for why self.t can overshoot it.
        self.max_steps = 10000
        self.alarm_penalty_rate = 2

        self.reset()

    def reset(self):
        """Restore the start-of-episode state and return the first observation."""
        # Current tick.
        self.t = 0
        # The UAV starts at L0 (home) with a full battery.
        self.pos = 0
        self.battery = self.battery_capacity

        # Alarm age per location: 0 means "no alarm"; a new alarm starts at 1.
        self.alarm_L1 = 0
        self.alarm_L2 = 0

        # Accumulated age cost: age * (age + 1) / 2 is added whenever an alarm
        # is cleared or dropped for exceeding max_alarm_age.
        self.aoa_total = 0
        # Performance counters.
        self.alarm_count = 0
        self.anchor_count = 0        # number of anchoring events
        self.max_age_violations = 0  # number of alarms dropped for being too old
        self.alarms_cleared = 0      # number of alarms successfully cleared

        self.stop_reason = "NO"
        # Set to True by step() when the episode ends on an empty battery.
        # Initialised here so the flag exists before that happens and does not
        # carry over from a previous episode.
        self.anchored = False

        return self._get_state()

    def _get_state(self):
        """Build the normalised float32 observation vector (length 6)."""
        pos_onehot = [1 if self.pos == i else 0 for i in range(3)]
        norm_battery = self.battery / self.battery_capacity
        # Normalise ages by the maximum allowed age, clipped to [0, 1].
        norm_alarm1 = min(self.alarm_L1, self.max_alarm_age) / self.max_alarm_age
        norm_alarm2 = min(self.alarm_L2, self.max_alarm_age) / self.max_alarm_age
        state = np.array(pos_onehot + [norm_battery, norm_alarm1, norm_alarm2], dtype=np.float32)
        return state

    def _get_distance(self, from_pos, to_pos):
        """Return the symmetric distance between two locations (0 if unknown/same)."""
        if from_pos == to_pos:
            return 0
        if {from_pos, to_pos} == {0, 1}:
            return self.d_L0_L1
        if {from_pos, to_pos} == {0, 2}:
            return self.d_L0_L2
        if {from_pos, to_pos} == {1, 2}:
            return self.d_L1_L2
        return 0

    def _force_recharge(self):
        """Record an anchoring event, teleport home fully charged; return the reward delta."""
        self.anchor_count += 1
        self.stop_reason = "Anchored"
        self.pos = 0
        self.battery = self.battery_capacity
        return -self.anchor_penalty

    def _clear_alarm(self, alarm_attr):
        """Clear the alarm stored in attribute ``alarm_attr``; return the reward delta.

        If the battery cannot cover ``remove_cost`` this counts as an
        anchoring event instead and the alarm is left untouched.
        """
        if self.battery < self.remove_cost:
            return self._force_recharge()

        age = getattr(self, alarm_attr)
        self.battery -= self.remove_cost
        self.aoa_total += age * (age + 1) / 2
        setattr(self, alarm_attr, 0)
        self.alarms_cleared += 1
        # Bonus shrinks quadratically with the alarm's age, floored at -5.
        return max(-5, 60 - (age + 1) * (age + 2) / 40)

    def step(self, action):
        """Apply ``action`` and advance the simulation.

        Actions:
          0: move to L0 (if already at L0, stay and charge)
          1: move to L1
          2: move to L2
          3: clear the alarm at the current location (must be at L1/L2 with
             an active alarm, otherwise a -20 penalty is applied)
        Any other value is treated as illegal and also costs -20.

        A move takes as many ticks as the distance travelled; every other
        action takes one tick. Because a move is never interrupted, ``self.t``
        can end up larger than ``max_steps`` when the episode finishes.

        Returns:
            ``(state, reward, done, info)`` where ``info`` is an empty dict
            and ``done`` is True once ``self.t >= max_steps`` or the battery
            is at or below zero.
        """
        reward = 0.0
        delta_time = 1

        if action in (0, 1, 2):
            target = action
            if self.pos != target:
                dist = self._get_distance(self.pos, target)
                if self.battery < dist:
                    # Not enough energy for the trip: penalise, then recover
                    # at home with a full battery instead of ending the episode.
                    reward += self._force_recharge()
                else:
                    # Normal flight.
                    self.battery -= dist
                    self.pos = target
                # The trip's duration is charged in both cases.
                delta_time = dist

            # Charging happens whenever the UAV is at L0 after the move.
            if self.pos == 0:
                charge = self.charging_rate * delta_time
                self.battery = min(self.battery + charge, self.battery_capacity)

        elif action == 3:
            if self.pos == 1 and self.alarm_L1 > 0:
                reward += self._clear_alarm("alarm_L1")
            elif self.pos == 2 and self.alarm_L2 > 0:
                reward += self._clear_alarm("alarm_L2")
            else:
                # Clear requested where there is nothing to clear.
                reward -= 20
        else:
            # Illegal action id.
            reward -= 20

        # Advance time one tick at a time.
        for _ in range(delta_time):
            self.t += 1

            # Age the active alarms.
            if self.alarm_L1 > 0:
                self.alarm_L1 += 1
            if self.alarm_L2 > 0:
                self.alarm_L2 += 1

            # Per-tick penalty for every active alarm, growing with its age.
            alarm_penalty = 0
            if self.alarm_L1 > 0:
                alarm_penalty += self.alarm_penalty_rate*(0.8+ 1.5*self.alarm_L1/self.max_alarm_age)
            if self.alarm_L2 > 0:
                alarm_penalty += self.alarm_penalty_rate*(0.8+ 1.5*self.alarm_L2/self.max_alarm_age)
            reward -= alarm_penalty

            # Alarms older than max_alarm_age are penalised and dropped
            # (the episode is not terminated).
            if self.alarm_L1 > self.max_alarm_age:
                self.max_age_violations += 1
                reward -= self.max_age_penalty
                self.aoa_total += self.alarm_L1 * (self.alarm_L1 + 1) / 2
                self.alarm_L1 = 0

            if self.alarm_L2 > self.max_alarm_age:
                self.max_age_violations += 1
                reward -= self.max_age_penalty
                self.aoa_total += self.alarm_L2 * (self.alarm_L2 + 1) / 2
                self.alarm_L2 = 0

            # A new alarm can only appear where none is currently active.
            if self.alarm_L1 == 0 and random.random() < self.p_L1:
                self.alarm_L1 = 1
                self.alarm_count += 1
            if self.alarm_L2 == 0 and random.random() < self.p_L2:
                self.alarm_L2 = 1
                self.alarm_count += 1

        # Battery safety shaping: small bonus above 40, heavy penalty below 20.
        if self.battery > 40:
            reward += 0.5
        elif self.battery < 20:
            reward -= (self.battery_capacity - self.battery) + 60

        # Episode ends at the horizon or when the battery is empty.
        done = False
        if self.t >= self.max_steps or self.battery <= 0:
            done = True
            self.stop_reason = "✅"
            if self.battery <= 0:
                self.anchored = True
                self.stop_reason = "Anchored"
                reward -= self.anchor_penalty

        return self._get_state(), reward, done, {}
# ---------------------------
# 贪心策略函数（修正版）
# ---------------------------
def select_action_greedy(env):
    """Pick the next action with an age-first greedy rule.

    The policy reads ``env.pos``, ``env.battery``, ``env.alarm_L1``,
    ``env.alarm_L2``, ``env.remove_cost`` and ``env._get_distance``.

    Rules, in order:
      * No active alarm: return 0 (go to / stay at L0).
      * Consider active alarms from oldest to youngest (L1 first on ties).
      * If the UAV is already at the alarm's location: clear it (3) when the
        battery covers clearing plus the flight home, otherwise return 0.
      * Otherwise fly to that location when the battery covers the flight
        there, the clearing, and the flight home; if not, try the next alarm.
      * If nothing is affordable: return 0.
    """
    go_home, clear_here = 0, 3

    # (location, age) for every site that currently has an alarm.
    pending = [
        (site, age)
        for site, age in ((1, env.alarm_L1), (2, env.alarm_L2))
        if age > 0
    ]
    if not pending:
        return go_home

    # Oldest first; the sort is stable, so L1 stays ahead of L2 on equal ages.
    pending.sort(key=lambda entry: entry[1], reverse=True)

    for site, _age in pending:
        outbound = env._get_distance(env.pos, site)
        homebound = env._get_distance(site, 0)

        if env.pos == site:
            # Already on site: only clearing and the trip home remain.
            if env.battery >= env.remove_cost + homebound:
                return clear_here
            return go_home

        # Whole mission budget: fly there, clear, fly home.
        if env.battery >= outbound + env.remove_cost + homebound:
            return site

    return go_home

# ---------------------------
# 训练函数（使用贪心策略）
# ---------------------------
def train_greedy(num_episodes=1000):
    """Roll out the greedy policy for ``num_episodes`` episodes and log metrics.

    Each episode runs on one shared ``UAVEnv`` instance until ``step`` reports
    ``done``. A summary line is printed after every episode, and a 100-episode
    rolling summary is printed after every 100th episode.

    Returns:
        ``(episode_rewards, episode_aoa, episode_stats)``: the per-episode
        total reward, the per-episode average AoMA
        (``aoa_total / alarm_count``, or 0 when no alarm was generated), and
        a list of per-episode statistic dicts.
    """
    env = UAVEnv()

    episode_rewards, episode_aoa, episode_stats = [], [], []

    def window_mean(rows, field):
        # Mean of one statistic over a window of episode records.
        return np.mean([row[field] for row in rows])

    for ep in range(num_episodes):
        env.reset()
        total_reward = 0

        # The policy inspects the environment directly, so the observation
        # returned by step() is not needed here.
        while True:
            _, reward, done, _ = env.step(select_action_greedy(env))
            total_reward += reward
            if done:
                break

        # Average AoMA over all alarms generated in this episode.
        if env.alarm_count > 0:
            avg_aoa = env.aoa_total / env.alarm_count
        else:
            avg_aoa = 0

        # Per-episode report.
        print(
            f"Episode {ep+1:4d} | Reward: {total_reward:10.2f}"
            f" | Avg AoMA: {avg_aoa:8.2f} ({env.aoa_total:8} / {env.alarm_count:2d})"
            f" | Early Stop: {env.stop_reason}"
            f" | Stopped at: {env.t}"
        )

        episode_rewards.append(total_reward)
        episode_aoa.append(avg_aoa)
        episode_stats.append(dict([
            ('total_alarms', env.alarm_count),
            ('cleared_alarms', env.alarms_cleared),
            ('anchor_count', env.anchor_count),
            ('max_age_violations', env.max_age_violations),
            ('avg_aoa', avg_aoa),
            ('total_reward', total_reward),
        ]))

        # Rolling summary over the latest 100 episodes.
        if (ep + 1) % 100 != 0:
            continue
        recent_stats = episode_stats[-100:]
        avg_reward = window_mean(recent_stats, 'total_reward')
        avg_anchors = window_mean(recent_stats, 'anchor_count')
        avg_violations = window_mean(recent_stats, 'max_age_violations')
        avg_aoa_recent = window_mean(recent_stats, 'avg_aoa')

        print(f"Episode {ep+1:4d} | "
              f"Avg Reward: {avg_reward:8.2f} | "
              f"Avg AoMA: {avg_aoa_recent:6.2f} | "
              f"Anchors: {avg_anchors:4.1f} | "
              f"Violations: {avg_violations:4.1f}")

    return episode_rewards, episode_aoa, episode_stats

# Run the greedy baseline for 100 episodes.
print("开始运行固定10000步的贪心策略...")
greedy_rewards, greedy_aoa, greedy_stats = train_greedy(num_episodes=100)

# Print the mean of the per-episode average AoMA values. numpy is already
# imported as `np` at the top of the file, so it is not re-imported here.
print(f"平均AoMA: {np.mean(greedy_aoa):.2f}")

# # 绘制贪心策略的固定步数性能
# print("\n生成贪心策略固定10000步性能图表...")
# greedy_performance = plot_fixed_step_performance(greedy_rewards, greedy_aoa, greedy_stats, 
#                                                 "贪心策略 - 固定10000步性能分析")

开始运行固定10000步的贪心策略...
Episode    1 | Reward:    5585.34 | Avg AoMA:   211.11 ( 50033.0 / 237) | Early Stop: ✅ | Stopped at: 10014
Episode    2 | Reward:    3987.90 | Avg AoMA:   253.22 ( 61280.0 / 242) | Early Stop: ✅ | Stopped at: 10001
Episode    3 | Reward:    4320.05 | Avg AoMA:   242.91 ( 55869.0 / 230) | Early Stop: ✅ | Stopped at: 10003
Episode    4 | Reward:    4924.49 | Avg AoMA:   229.78 ( 52390.0 / 228) | Early Stop: ✅ | Stopped at: 10004
Episode    5 | Reward:    4745.53 | Avg AoMA:   234.44 ( 50640.0 / 216) | Early Stop: ✅ | Stopped at: 10000
Episode    6 | Reward:    4039.07 | Avg AoMA:   253.54 ( 56793.0 / 224) | Early Stop: ✅ | Stopped at: 10012
Episode    7 | Reward:    5109.94 | Avg AoMA:   225.64 ( 46031.0 / 204) | Early Stop: ✅ | Stopped at: 10000
Episode    8 | Reward:    3683.73 | Avg AoMA:   266.67 ( 57067.0 / 214) | Early Stop: ✅ | Stopped at: 10003
Episode    9 | Reward:    5192.19 | Avg AoMA:   216.11 ( 50354.0 / 233) | Early Stop: ✅ | Stopped at: 10005
Episode