# 共享层和专用层分别处理

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
import gym
from gym import spaces

# Placeholder inputs: name_count, A and B stand in for data assumed to exist already.
name_count = {
    'var1': 1,
    'var2': 2,
    'var3': 3,  # sample entries only; the real table should list all 73 configurations
    # ...
}

# KPI model coefficients A and B (random stand-ins for the real values).
A = np.random.uniform(-1, 1, size=(3, 73))
B = np.random.uniform(-1, 1, size=(3,))

# Unpack the table into parallel lists: variable names and participation counts.
variables = [*name_count]
participation_count = [*name_count.values()]

# Positions of the variables that take part more than once (the "shared" ones).
shared_indices = [
    position
    for position, uses in enumerate(participation_count)
    if uses > 1
]
shared_size = len(shared_indices)

# Custom environment that also contains the KPI computation
class FactoryEnv(gym.Env):
    """Environment whose reward is a vector of linear KPIs of the state.

    The state is a vector kept inside [-1, 1]. Each step adds the action to
    the state element-wise (so the action must broadcast against the state),
    clips the result, and returns ``A @ state + B`` as the reward vector:
    one KPI per row of ``A``.

    Uses the classic Gym API: ``reset()`` returns the observation and
    ``step()`` returns ``(observation, reward_vector, done, info)``.

    Args:
        state_size: Length of the state vector.
        action_size: Length of the action vector.
        A: KPI coefficient matrix, one row per KPI, ``state_size`` columns.
        B: KPI offsets, one entry per KPI.
        max_steps: Number of steps after which ``step`` reports
            ``done=True``. Pass ``None`` for episodes that never end.
    """

    def __init__(self, state_size, action_size, A, B, max_steps=200):
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.A = np.asarray(A, dtype=np.float64)
        self.B = np.asarray(B, dtype=np.float64)
        self.max_steps = max_steps
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(state_size,), dtype=np.float32)
        self.action_space = spaces.Box(low=-1, high=1, shape=(action_size,), dtype=np.float32)
        self._elapsed_steps = 0
        self.state = self._sample_state()

    def _sample_state(self):
        """Draw a uniform state in [-1, 1) with the dtype of observation_space."""
        return np.random.uniform(-1, 1, size=(self.state_size,)).astype(np.float32)

    def step(self, action):
        """Apply ``action`` and return ``(state, reward_vector, done, info)``."""
        action = np.asarray(action, dtype=np.float32)
        # Keep the state float32 so it matches the declared observation_space.
        self.state = np.clip(self.state + action, -1.0, 1.0).astype(np.float32)
        # One KPI per row of A; for a three-row A this is [kpi1, kpi2, kpi3].
        reward_vector = self.A @ self.state + self.B
        self._elapsed_steps += 1
        # Without a step limit the episode would never report completion.
        done = self.max_steps is not None and self._elapsed_steps >= self.max_steps
        # Return a copy so callers cannot mutate the internal state in place.
        return self.state.copy(), reward_vector, done, {}

    def reset(self):
        """Start a new episode from a random state and return that state."""
        self._elapsed_steps = 0
        self.state = self._sample_state()
        return self.state.copy()

# Actor and critic networks
class SharedActor(nn.Module):
    """Policy network with separate input branches for shared and other variables.

    The state columns listed as shared go through ``shared_fc``; all other
    columns go through ``individual_fc``. Both 128-wide encodings are
    concatenated with the preference weights and mapped by a three-layer
    head to an action squashed into [-1, 1] by ``tanh``.

    Args:
        state_size: Number of state columns.
        action_size: Number of action components produced.
        shared_size: Number of shared state columns.
        weight_size: Length of the preference-weight vector.
        shared_idx: Column indices of the shared variables. When omitted,
            the module-level ``shared_indices`` list is used.

    Raises:
        ValueError: If the number of shared indices differs from
            ``shared_size``.
    """

    def __init__(self, state_size, action_size, shared_size, weight_size, shared_idx=None):
        super().__init__()
        chosen = shared_indices if shared_idx is None else shared_idx
        chosen = [int(i) for i in chosen]
        if len(chosen) != shared_size:
            raise ValueError(
                f"shared_size={shared_size} does not match {len(chosen)} shared indices"
            )
        # Resolve the column split once instead of rebuilding it in every forward pass.
        chosen_set = set(chosen)
        remaining = [i for i in range(state_size) if i not in chosen_set]
        # Non-persistent buffers follow .to(device) without entering the state_dict.
        self.register_buffer("_shared_idx", torch.tensor(chosen, dtype=torch.long), persistent=False)
        self.register_buffer("_individual_idx", torch.tensor(remaining, dtype=torch.long), persistent=False)

        self.shared_fc = nn.Linear(shared_size, 128)
        self.individual_fc = nn.Linear(state_size - shared_size, 128)
        self.fc1 = nn.Linear(256 + weight_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, state, weights):
        """Return actions in [-1, 1] for a ``(batch, state_size)`` state batch.

        ``weights`` must be ``(batch, weight_size)`` so it can be concatenated
        with the encoded state along dim 1.
        """
        shared_features = torch.relu(self.shared_fc(state.index_select(1, self._shared_idx)))
        individual_features = torch.relu(self.individual_fc(state.index_select(1, self._individual_idx)))
        combined = torch.cat([shared_features, individual_features, weights], dim=1)
        x = torch.relu(self.fc1(combined))
        x = torch.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))

class SharedCritic(nn.Module):
    """Q-network with separate input branches for shared and other variables.

    The state columns listed as shared go through ``shared_fc``; all other
    columns go through ``individual_fc``. Both 128-wide encodings are
    concatenated with the action and the preference weights and mapped by a
    three-layer head to one scalar value per sample.

    Args:
        state_size: Number of state columns.
        action_size: Number of action components.
        shared_size: Number of shared state columns.
        weight_size: Length of the preference-weight vector.
        shared_idx: Column indices of the shared variables. When omitted,
            the module-level ``shared_indices`` list is used.

    Raises:
        ValueError: If the number of shared indices differs from
            ``shared_size``.
    """

    def __init__(self, state_size, action_size, shared_size, weight_size, shared_idx=None):
        super().__init__()
        chosen = shared_indices if shared_idx is None else shared_idx
        chosen = [int(i) for i in chosen]
        if len(chosen) != shared_size:
            raise ValueError(
                f"shared_size={shared_size} does not match {len(chosen)} shared indices"
            )
        # Resolve the column split once instead of rebuilding it in every forward pass.
        chosen_set = set(chosen)
        remaining = [i for i in range(state_size) if i not in chosen_set]
        # Non-persistent buffers follow .to(device) without entering the state_dict.
        self.register_buffer("_shared_idx", torch.tensor(chosen, dtype=torch.long), persistent=False)
        self.register_buffer("_individual_idx", torch.tensor(remaining, dtype=torch.long), persistent=False)

        self.shared_fc = nn.Linear(shared_size, 128)
        self.individual_fc = nn.Linear(state_size - shared_size, 128)
        self.fc1 = nn.Linear(256 + action_size + weight_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, state, action, weights):
        """Return a ``(batch, 1)`` value for each state/action/weights row.

        ``action`` and ``weights`` must share the batch dimension of ``state``
        so they can be concatenated with the encoded state along dim 1.
        """
        shared_features = torch.relu(self.shared_fc(state.index_select(1, self._shared_idx)))
        individual_features = torch.relu(self.individual_fc(state.index_select(1, self._individual_idx)))
        combined = torch.cat([shared_features, individual_features, action, weights], dim=1)
        x = torch.relu(self.fc1(combined))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# MORL agent
class MORLAgent:
    """Deterministic actor-critic (DDPG-style) agent for weighted multi-objective rewards.

    The reward vector of every transition is collapsed to a scalar with the
    preference weights active at that moment. The weights are also stored
    with the transition and fed to the actor and critic as an extra input.

    Args:
        state_size: Length of the state vector.
        action_size: Length of the action vector.
        shared_size: Number of shared state variables (passed to the networks).
        weight_size: Length of the preference-weight vector.
    """

    def __init__(self, state_size, action_size, shared_size, weight_size):
        self.state_size = state_size
        self.action_size = action_size
        self.weight_size = weight_size
        self.actor = SharedActor(state_size, action_size, shared_size, weight_size)
        self.critic = SharedCritic(state_size, action_size, shared_size, weight_size)
        self.target_actor = SharedActor(state_size, action_size, shared_size, weight_size)
        self.target_critic = SharedCritic(state_size, action_size, shared_size, weight_size)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=0.001)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=0.002)
        self.memory = deque(maxlen=100000)  # replay buffer
        self.batch_size = 128
        self.gamma = 0.99  # discount factor
        self.tau = 0.005   # soft-update rate of the target networks

        # tau=1.0 overwrites the targets, so they start as exact copies.
        self._update_target(self.target_actor, self.actor, 1.0)
        self._update_target(self.target_critic, self.critic, 1.0)

    @staticmethod
    def _as_float_tensor(values):
        """Stack a sequence of scalars/arrays into one float32 tensor."""
        # One NumPy stack first: building a tensor directly from a sequence of
        # ndarrays goes through a slow element-wise path.
        return torch.as_tensor(np.asarray(values), dtype=torch.float32)

    def _update_target(self, target, source, tau):
        """Blend ``source`` into ``target``: ``target = tau*source + (1-tau)*target``."""
        with torch.no_grad():
            for target_param, param in zip(target.parameters(), source.parameters()):
                target_param.copy_(tau * param + (1.0 - tau) * target_param)

    def act(self, state, weights):
        """Return the actor's action plus N(0, 0.1) noise, clipped to [-1, 1]."""
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        weights = torch.as_tensor(weights, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            action = self.actor(state, weights).squeeze(0).numpy()
        noise = np.random.normal(0, 0.1, size=self.action_size)
        return np.clip(action + noise, -1, 1)

    def remember(self, state, action, reward_vector, next_state, done, weights):
        """Store a transition with its reward scalarised as ``weights . reward_vector``."""
        weighted_reward = np.dot(weights, reward_vector)
        self.memory.append((state, action, weighted_reward, next_state, done, weights))

    def learn(self):
        """Run one critic and one actor update on a random replay batch.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones, weights = zip(*batch)

        states = self._as_float_tensor(states)
        actions = self._as_float_tensor(actions)
        rewards = self._as_float_tensor(rewards).unsqueeze(1)
        next_states = self._as_float_tensor(next_states)
        dones = self._as_float_tensor(dones).unsqueeze(1)
        weights = self._as_float_tensor(weights)

        # Critic update. The TD target is a constant: building it without
        # autograd keeps backward() out of the target networks.
        with torch.no_grad():
            next_actions = self.target_actor(next_states, weights)
            next_q_values = self.target_critic(next_states, next_actions, weights)
            q_targets = rewards + self.gamma * next_q_values * (1 - dones)
        q_values = self.critic(states, actions, weights)
        critic_loss = nn.MSELoss()(q_values, q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor update: maximise the critic's value of the actor's own actions.
        actions_pred = self.actor(states, weights)
        actor_loss = -self.critic(states, actions_pred, weights).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft-update the target networks.
        self._update_target(self.target_actor, self.actor, self.tau)
        self._update_target(self.target_critic, self.critic, self.tau)

# Build the environment and the agent
env = FactoryEnv(state_size=73, action_size=73, A=A, B=B)
agent = MORLAgent(state_size=73, action_size=73, shared_size=len(shared_indices), weight_size=3)

# Training
episodes = 1000
# Bound every episode explicitly so training moves on to the next episode
# even when the environment never signals done.
max_steps_per_episode = 200
for episode in range(episodes):
    state = env.reset()
    episode_reward = 0
    done = False
    # Draw a fresh preference vector (non-negative, sums to 1) for this episode.
    weights = np.random.dirichlet(np.ones(3), size=1)[0]
    for _ in range(max_steps_per_episode):
        action = agent.act(state, weights)
        next_state, reward_vector, done, _ = env.step(action)
        agent.remember(state, action, reward_vector, next_state, done, weights)
        agent.learn()
        state = next_state
        episode_reward += np.dot(weights, reward_vector)
        if done:
            break
    print(f"Episode {episode + 1}, Weighted Reward: {episode_reward}")


# 多头注意力机制

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
import gym
from gym import spaces

# Custom multi-head attention module
class MultiHeadAttention(nn.Module):
    """Self-attention wrapper that takes and returns a 2-D feature tensor.

    NOTE(review): ``nn.MultiheadAttention`` is created with its default
    layout, in which the leading axis is the sequence position. ``forward``
    adds a leading axis of length one, so each input row is processed as a
    one-token sequence that can only attend to itself.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)

    def forward(self, x):
        """Run self-attention on ``x`` and return a tensor shaped like ``x``."""
        # Prepend a length-one axis (the sequence axis in the default layout).
        tokens = x[None]
        # Query, key and value are all the same tensor; drop the attention weights.
        attended = self.attention(tokens, tokens, tokens)[0]
        # Remove the axis that was prepended above.
        return attended.squeeze(0)

# Actor network with multi-head attention
class AttentionActor(nn.Module):
    """Policy network whose shared-variable branch passes through self-attention.

    The shared state columns are first embedded to ``embed_dim`` features,
    because the attention layer only accepts inputs of exactly that width,
    and are then passed through ``shared_attention``. The remaining columns
    are encoded by ``individual_fc``. Both encodings are concatenated with
    the preference weights and mapped to an action squashed into [-1, 1].

    Args:
        state_size: Number of state columns.
        action_size: Number of action components produced.
        shared_size: Number of shared state columns.
        weight_size: Length of the preference-weight vector.
        embed_dim: Feature width of both state branches; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.
        shared_idx: Column indices of the shared variables. When omitted,
            the module-level ``shared_indices`` list is used.

    Raises:
        ValueError: If the number of shared indices differs from
            ``shared_size``.
    """

    def __init__(self, state_size, action_size, shared_size, weight_size, embed_dim=128, num_heads=4, shared_idx=None):
        super().__init__()
        chosen = shared_indices if shared_idx is None else shared_idx
        chosen = [int(i) for i in chosen]
        if len(chosen) != shared_size:
            raise ValueError(
                f"shared_size={shared_size} does not match {len(chosen)} shared indices"
            )
        # Resolve the column split once instead of rebuilding it in every forward pass.
        chosen_set = set(chosen)
        remaining = [i for i in range(state_size) if i not in chosen_set]
        # Non-persistent buffers follow .to(device) without entering the state_dict.
        self.register_buffer("_shared_idx", torch.tensor(chosen, dtype=torch.long), persistent=False)
        self.register_buffer("_individual_idx", torch.tensor(remaining, dtype=torch.long), persistent=False)

        # Lift the raw shared columns to the width the attention layer expects.
        self.shared_embed = nn.Linear(shared_size, embed_dim)
        self.shared_attention = MultiHeadAttention(embed_dim=embed_dim, num_heads=num_heads)
        self.individual_fc = nn.Linear(state_size - shared_size, embed_dim)
        self.fc1 = nn.Linear(2 * embed_dim + weight_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, state, weights):
        """Return actions in [-1, 1] for a ``(batch, state_size)`` state batch.

        ``weights`` must be ``(batch, weight_size)`` so it can be concatenated
        with the encoded state along dim 1.
        """
        shared_features = self.shared_embed(state.index_select(1, self._shared_idx))
        shared_features = self.shared_attention(shared_features)
        individual_features = torch.relu(self.individual_fc(state.index_select(1, self._individual_idx)))
        combined = torch.cat([shared_features, individual_features, weights], dim=1)
        x = torch.relu(self.fc1(combined))
        x = torch.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))

# Critic network with multi-head attention
class AttentionCritic(nn.Module):
    """Q-network whose shared-variable branch passes through self-attention.

    The shared state columns are first embedded to ``embed_dim`` features,
    because the attention layer only accepts inputs of exactly that width,
    and are then passed through ``shared_attention``. The remaining columns
    are encoded by ``individual_fc``. Both encodings are concatenated with
    the action and the preference weights and mapped to one scalar value
    per sample.

    Args:
        state_size: Number of state columns.
        action_size: Number of action components.
        shared_size: Number of shared state columns.
        weight_size: Length of the preference-weight vector.
        embed_dim: Feature width of both state branches; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.
        shared_idx: Column indices of the shared variables. When omitted,
            the module-level ``shared_indices`` list is used.

    Raises:
        ValueError: If the number of shared indices differs from
            ``shared_size``.
    """

    def __init__(self, state_size, action_size, shared_size, weight_size, embed_dim=128, num_heads=4, shared_idx=None):
        super().__init__()
        chosen = shared_indices if shared_idx is None else shared_idx
        chosen = [int(i) for i in chosen]
        if len(chosen) != shared_size:
            raise ValueError(
                f"shared_size={shared_size} does not match {len(chosen)} shared indices"
            )
        # Resolve the column split once instead of rebuilding it in every forward pass.
        chosen_set = set(chosen)
        remaining = [i for i in range(state_size) if i not in chosen_set]
        # Non-persistent buffers follow .to(device) without entering the state_dict.
        self.register_buffer("_shared_idx", torch.tensor(chosen, dtype=torch.long), persistent=False)
        self.register_buffer("_individual_idx", torch.tensor(remaining, dtype=torch.long), persistent=False)

        # Lift the raw shared columns to the width the attention layer expects.
        self.shared_embed = nn.Linear(shared_size, embed_dim)
        self.shared_attention = MultiHeadAttention(embed_dim=embed_dim, num_heads=num_heads)
        self.individual_fc = nn.Linear(state_size - shared_size, embed_dim)
        self.fc1 = nn.Linear(2 * embed_dim + action_size + weight_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, state, action, weights):
        """Return a ``(batch, 1)`` value for each state/action/weights row.

        ``action`` and ``weights`` must share the batch dimension of ``state``
        so they can be concatenated with the encoded state along dim 1.
        """
        shared_features = self.shared_embed(state.index_select(1, self._shared_idx))
        shared_features = self.shared_attention(shared_features)
        individual_features = torch.relu(self.individual_fc(state.index_select(1, self._individual_idx)))
        combined = torch.cat([shared_features, individual_features, action, weights], dim=1)
        x = torch.relu(self.fc1(combined))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# MORL agent
class MORLAgent:
    """Deterministic actor-critic (DDPG-style) agent built on the attention networks.

    The reward vector of every transition is collapsed to a scalar with the
    preference weights active at that moment. The weights are also stored
    with the transition and fed to the actor and critic as an extra input.

    Args:
        state_size: Length of the state vector.
        action_size: Length of the action vector.
        shared_size: Number of shared state variables (passed to the networks).
        weight_size: Length of the preference-weight vector.
    """

    def __init__(self, state_size, action_size, shared_size, weight_size):
        self.state_size = state_size
        self.action_size = action_size
        self.weight_size = weight_size
        self.actor = AttentionActor(state_size, action_size, shared_size, weight_size)
        self.critic = AttentionCritic(state_size, action_size, shared_size, weight_size)
        self.target_actor = AttentionActor(state_size, action_size, shared_size, weight_size)
        self.target_critic = AttentionCritic(state_size, action_size, shared_size, weight_size)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=0.001)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=0.002)
        self.memory = deque(maxlen=100000)  # replay buffer
        self.batch_size = 128
        self.gamma = 0.99  # discount factor
        self.tau = 0.005   # soft-update rate of the target networks

        # tau=1.0 overwrites the targets, so they start as exact copies.
        self._update_target(self.target_actor, self.actor, 1.0)
        self._update_target(self.target_critic, self.critic, 1.0)

    @staticmethod
    def _as_float_tensor(values):
        """Stack a sequence of scalars/arrays into one float32 tensor."""
        # One NumPy stack first: building a tensor directly from a sequence of
        # ndarrays goes through a slow element-wise path.
        return torch.as_tensor(np.asarray(values), dtype=torch.float32)

    def _update_target(self, target, source, tau):
        """Blend ``source`` into ``target``: ``target = tau*source + (1-tau)*target``."""
        with torch.no_grad():
            for target_param, param in zip(target.parameters(), source.parameters()):
                target_param.copy_(tau * param + (1.0 - tau) * target_param)

    def act(self, state, weights):
        """Return the actor's action plus N(0, 0.1) noise, clipped to [-1, 1]."""
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        weights = torch.as_tensor(weights, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            action = self.actor(state, weights).squeeze(0).numpy()
        noise = np.random.normal(0, 0.1, size=self.action_size)
        return np.clip(action + noise, -1, 1)

    def remember(self, state, action, reward_vector, next_state, done, weights):
        """Store a transition with its reward scalarised as ``weights . reward_vector``."""
        weighted_reward = np.dot(weights, reward_vector)
        self.memory.append((state, action, weighted_reward, next_state, done, weights))

    def learn(self):
        """Run one critic and one actor update on a random replay batch.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones, weights = zip(*batch)

        states = self._as_float_tensor(states)
        actions = self._as_float_tensor(actions)
        rewards = self._as_float_tensor(rewards).unsqueeze(1)
        next_states = self._as_float_tensor(next_states)
        dones = self._as_float_tensor(dones).unsqueeze(1)
        weights = self._as_float_tensor(weights)

        # Critic update. The TD target is a constant: building it without
        # autograd keeps backward() out of the target networks.
        with torch.no_grad():
            next_actions = self.target_actor(next_states, weights)
            next_q_values = self.target_critic(next_states, next_actions, weights)
            q_targets = rewards + self.gamma * next_q_values * (1 - dones)
        q_values = self.critic(states, actions, weights)
        critic_loss = nn.MSELoss()(q_values, q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor update: maximise the critic's value of the actor's own actions.
        actions_pred = self.actor(states, weights)
        actor_loss = -self.critic(states, actions_pred, weights).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft-update the target networks.
        self._update_target(self.target_actor, self.actor, self.tau)
        self._update_target(self.target_critic, self.critic, self.tau)

# Build the environment and the agent
env = FactoryEnv(state_size=73, action_size=73, A=A, B=B)
agent = MORLAgent(state_size=73, action_size=73, shared_size=len(shared_indices), weight_size=3)

# Training
episodes = 1000
# Bound every episode explicitly so training moves on to the next episode
# even when the environment never signals done.
max_steps_per_episode = 200
for episode in range(episodes):
    state = env.reset()
    episode_reward = 0
    done = False
    # Draw a fresh preference vector (non-negative, sums to 1) for this episode.
    weights = np.random.dirichlet(np.ones(3), size=1)[0]
    for _ in range(max_steps_per_episode):
        action = agent.act(state, weights)
        next_state, reward_vector, done, _ = env.step(action)
        agent.remember(state, action, reward_vector, next_state, done, weights)
        agent.learn()
        state = next_state
        episode_reward += np.dot(weights, reward_vector)
        if done:
            break
    print(f"Episode {episode + 1}, Weighted Reward: {episode_reward}")
