In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np
import matplotlib.pyplot as plt

class EarthquakeEscapeEnv:
    """Grid-world of stacked floors that an agent must escape from.

    Positions are ``(z, y, x)`` tuples indexing ``self.building``, whose shape
    is ``(depth, height, width)``.  Cells holding ``-1`` are obstacles and can
    never be entered.  An episode starts at ``start_location`` and terminates
    when the agent reaches ``exit_location``.

    Actions:
        0 / 1 -- decrease / increase ``y`` ("up" / "down")
        2 / 3 -- decrease / increase ``x`` ("left" / "right")
        4 / 5 -- decrease / increase ``z`` ("up a floor" / "down a floor")
    Any other action value leaves the candidate position unchanged.
    """

    def __init__(self, width=5, height=5, depth=3):
        self.width = width
        self.height = height
        self.depth = depth

        # Empty building with three fixed obstacle cells.
        grid = np.zeros((depth, height, width), dtype=int)
        for blocked_cell in ((0, 2, 2), (1, 1, 1), (2, 3, 3)):
            grid[blocked_cell] = -1
        self.building = grid

        # Fixed exit and starting cell; the agent begins at the start.
        self.exit_location = (0, 0, 0)
        self.start_location = (2, 4, 4)
        self.agent_position = self.start_location

    def reset(self):
        """Put the agent back on the start cell and return that position."""
        self.agent_position = self.start_location
        return self.agent_position

    def step(self, action):
        """Apply ``action`` and return ``(position, reward, done)``.

        A move is clamped on the side it moves toward and is discarded when
        the resulting cell is out of bounds or an obstacle.  The reward is
        ``100`` on reaching the exit, otherwise the negated Manhattan distance
        from the (possibly unchanged) position to the exit.
        """
        # action code -> (index into the (z, y, x) position, direction)
        moves = (
            (0, (1, -1)),
            (1, (1, +1)),
            (2, (2, -1)),
            (3, (2, +1)),
            (4, (0, -1)),
            (5, (0, +1)),
        )
        limits = (self.depth, self.height, self.width)

        candidate = list(self.agent_position)
        for code, (axis, direction) in moves:
            if action == code:
                if direction < 0:
                    candidate[axis] = max(0, candidate[axis] - 1)
                else:
                    candidate[axis] = min(limits[axis] - 1, candidate[axis] + 1)
                break
        target = tuple(candidate)

        # Commit the move only for an in-bounds, obstacle-free target cell.
        in_bounds = all(0 <= coord < limit for coord, limit in zip(target, limits))
        if in_bounds and self.building[target] != -1:
            self.agent_position = target

        if self.agent_position == self.exit_location:
            # Large bonus for reaching the exit.
            return self.agent_position, 100, True

        # The closer the agent is to the exit, the higher (less negative)
        # the reward.
        distance_to_exit = sum(
            abs(here - goal)
            for here, goal in zip(self.agent_position, self.exit_location)
        )
        return self.agent_position, -distance_to_exit, False




class PolicyNetwork(nn.Module):
    """Three-layer MLP mapping a state vector to action probabilities.

    Two hidden layers of 128 ReLU units feed a linear output layer whose
    activations are normalised with a softmax over the last dimension.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        """Return a probability distribution over actions for input ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = layer(hidden).relu()
        logits = self.fc3(hidden)
        return logits.softmax(dim=-1)

class ValueNetwork(nn.Module):
    """Three-layer MLP mapping a state vector to a single scalar value.

    Two hidden layers of 128 ReLU units feed a linear layer with one output;
    no activation is applied to that output.
    """

    def __init__(self, input_size):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Return the value estimate for input ``x`` (last dimension of size 1)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = layer(hidden).relu()
        return self.fc3(hidden)

class PPOAgent:
    """Proximal Policy Optimization agent with separate policy/value networks.

    Args:
        env: Environment exposing ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``; states are
            length-3 sequences of numbers.
        gamma: Discount factor used for the return computation.
        lr: Learning rate of both Adam optimizers.
        clip_eps: Clipping range of the PPO probability ratio.
        update_steps: Number of optimisation passes over each collected
            episode.
        device: Torch device (or device string) for the networks.  Defaults
            to CUDA when available and falls back to the CPU otherwise.
    """

    def __init__(self, env, gamma=0.99, lr=3e-4, clip_eps=0.2, update_steps=10,
                 device=None):
        self.env = env
        self.gamma = gamma
        self.clip_eps = clip_eps
        self.update_steps = update_steps

        # Do not hard-code CUDA: fall back to the CPU on GPU-less hosts.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        self.policy_net = PolicyNetwork(3, 6).to(self.device)
        self.value_net = ValueNetwork(3).to(self.device)

        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=lr)

    def _float_tensor(self, values):
        """Build a detached 1-D float32 tensor on ``self.device``.

        Accepts Python numbers as well as single-element tensors, so callers
        may pass the ``(1,)``-shaped log-probabilities from ``select_action``.
        """
        return torch.tensor([float(v) for v in values], dtype=torch.float32,
                            device=self.device)

    def _state_batch(self, states):
        """Stack a sequence of states into an ``(N, state_dim)`` float tensor."""
        return torch.as_tensor(np.asarray(states, dtype=np.float32),
                               device=self.device)

    def select_action(self, state):
        """Sample an action for ``state`` from the current policy.

        Returns:
            ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is a gradient-free tensor of shape ``(1,)``.
        """
        state_tensor = torch.tensor(state, dtype=torch.float32,
                                    device=self.device).unsqueeze(0)
        with torch.no_grad():  # sampling must not build an autograd graph
            probs = self.policy_net(state_tensor)
            dist = Categorical(probs)
            action = dist.sample()
            log_prob = dist.log_prob(action)
        return action.item(), log_prob

    def update(self, states, actions, log_probs, returns, advantages):
        """Run ``update_steps`` PPO optimisation passes on one batch.

        Args:
            states: Sequence of states visited.
            actions: Sequence of integer actions taken in those states.
            log_probs: Log-probabilities of ``actions`` under the policy that
                collected the data (numbers or single-element tensors).
            returns: Discounted returns, the regression targets of the value
                network.
            advantages: Advantage estimates weighting the policy objective.

        The whole batch is processed with tensor operations in each pass
        instead of one optimizer step per sample.  An empty batch is a no-op.
        """
        if len(states) == 0:
            return

        state_batch = self._state_batch(states)
        action_batch = torch.tensor([int(a) for a in actions], dtype=torch.long,
                                    device=self.device)
        old_log_probs = self._float_tensor(log_probs)
        return_batch = self._float_tensor(returns)
        advantage_batch = self._float_tensor(advantages)

        for _ in range(self.update_steps):
            dist = Categorical(self.policy_net(state_batch))
            new_log_probs = dist.log_prob(action_batch)

            # Clipped surrogate objective.
            ratio = torch.exp(new_log_probs - old_log_probs)
            surr1 = ratio * advantage_batch
            surr2 = torch.clamp(ratio, 1 - self.clip_eps,
                                1 + self.clip_eps) * advantage_batch
            policy_loss = -torch.min(surr1, surr2).mean()

            # squeeze(-1): (N, 1) -> (N,) so the subtraction does not
            # broadcast to an (N, N) matrix.
            values = self.value_net(state_batch).squeeze(-1)
            value_loss = (values - return_batch).pow(2).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            self.value_optimizer.zero_grad()
            value_loss.backward()
            self.value_optimizer.step()

    def compute_returns(self, rewards, dones):
        """Return the discounted return for every step of a trajectory.

        ``dones[t]`` marks step ``t`` as the last step of an episode: the
        running return is reset there, so rewards never leak across episode
        boundaries.  The result has the same length and order as ``rewards``.
        """
        returns = []
        R = 0
        for r, done in zip(reversed(rewards), reversed(dones)):
            if done:
                R = 0
            R = r + self.gamma * R
            returns.append(R)  # O(1); the list is reversed once at the end
        returns.reverse()
        return returns

    def train(self, num_episodes, max_steps=1000):
        """Collect ``num_episodes`` episodes and run a PPO update after each.

        Args:
            num_episodes: Number of episodes to run.
            max_steps: Upper bound on the steps of a single episode so that a
                policy that never reaches a terminal state cannot stall
                training; ``None`` disables the bound.  A truncated episode is
                treated as finished when computing returns.

        Returns:
            ``(episode_rewards, episode_lengths)``: total reward and number of
            steps of every episode, in order.
        """
        episode_rewards, episode_lengths = [], []

        for episode in range(num_episodes):
            state = self.env.reset()
            states, actions, log_probs, rewards, dones = [], [], [], [], []

            # Logging: announce the start of the episode.
            print(f"Episode {episode + 1} started.")

            done = False
            while not done and (max_steps is None or len(rewards) < max_steps):
                action, log_prob = self.select_action(state)
                next_state, reward, done = self.env.step(action)

                # Store the transition for the update below.
                states.append(state)
                actions.append(action)
                log_probs.append(log_prob)
                rewards.append(reward)
                dones.append(done)

                state = next_state

            episode_reward = sum(rewards)
            episode_length = len(rewards)

            if rewards:
                returns = self.compute_returns(rewards, dones)
                # Advantage = return minus the value baseline (no gradient).
                with torch.no_grad():
                    values = self.value_net(self._state_batch(states)).squeeze(-1)
                advantages = [ret - val for ret, val in zip(returns, values.tolist())]
                self.update(states, actions, log_probs, returns, advantages)

            # Logging: report the outcome of the episode.
            print(f"Episode {episode + 1} finished. Total reward: {episode_reward}, Length: {episode_length}")

            episode_rewards.append(episode_reward)
            episode_lengths.append(episode_length)

        return episode_rewards, episode_lengths

In [6]:
import os

# Training and performance evaluation
env = EarthquakeEscapeEnv()
agent = PPOAgent(env)

# Enable autograd anomaly detection (debugging aid; it slows training down).
torch.autograd.set_detect_anomaly(True)
episode_rewards, episode_lengths = agent.train(num_episodes=1000)


# Save the trained networks
torch.save(agent.policy_net.state_dict(), 'policy_net.pth')
torch.save(agent.value_net.state_dict(), 'value_net.pth')

# Training progress logging
print("Training completed. Saving models and plotting training results...")


def _plot_metrics(reward_series, length_series):
    """Plot per-episode rewards and lengths side by side.

    Both arguments map a legend label to the sequence plotted under it.
    """
    panels = (
        ('Episode Rewards', 'Total Reward', reward_series),
        ('Episode Lengths', 'Length', length_series),
    )
    plt.figure(figsize=(10, 5))
    for index, (title, ylabel, series) in enumerate(panels, start=1):
        plt.subplot(1, 2, index)
        for label, values in series.items():
            plt.plot(values, label=label)
        plt.title(title)
        plt.xlabel('Episode')
        plt.ylabel(ylabel)
        plt.legend()
    plt.tight_layout()
    plt.show()


# Training curves
_plot_metrics({'Training': episode_rewards}, {'Training': episode_lengths})

# Agent used for evaluation.  The saved checkpoints are loaded INTO this
# agent's networks; otherwise the evaluation below would run a freshly
# initialised (untrained) policy.
eval_agent = PPOAgent(env)
eval_device = next(eval_agent.policy_net.parameters()).device
eval_agent.policy_net.load_state_dict(
    torch.load('policy_net.pth', map_location=eval_device))
eval_agent.value_net.load_state_dict(
    torch.load('value_net.pth', map_location=eval_device))

# Keep the module-level names of the restored networks available.
policy_net = eval_agent.policy_net
value_net = eval_agent.value_net

# Per-episode evaluation results
eval_episode_rewards = []
eval_episode_lengths = []

# Upper bound on the steps of one evaluation episode so that a policy that
# never reaches the exit cannot hang the script.
MAX_EVAL_STEPS = 1000

# Evaluation
print("Evaluating the trained model...")
for _ in range(100):  # evaluate for 100 episodes
    state = env.reset()
    total_reward = 0
    episode_length = 0
    done = False

    while not done and episode_length < MAX_EVAL_STEPS:
        action, _ = eval_agent.select_action(state)
        next_state, reward, done = env.step(action)

        total_reward += reward
        episode_length += 1
        state = next_state

    eval_episode_rewards.append(total_reward)
    eval_episode_lengths.append(episode_length)

# Training vs. evaluation curves
_plot_metrics(
    {'Training': episode_rewards, 'Evaluation': eval_episode_rewards},
    {'Training': episode_lengths, 'Evaluation': eval_episode_lengths},
)


Episode 1 started.
Step: 1, State: (2, 4, 4), Action: 2, Reward: -9
Step: 2, State: (2, 4, 3), Action: 2, Reward: -8
Step: 3, State: (2, 4, 2), Action: 0, Reward: -7
Step: 4, State: (2, 3, 2), Action: 1, Reward: -8
Step: 5, State: (2, 4, 2), Action: 1, Reward: -8
Step: 6, State: (2, 4, 2), Action: 4, Reward: -7
Step: 7, State: (1, 4, 2), Action: 2, Reward: -6
Step: 8, State: (1, 4, 1), Action: 1, Reward: -6
Step: 9, State: (1, 4, 1), Action: 0, Reward: -5
Step: 10, State: (1, 3, 1), Action: 3, Reward: -6
Step: 11, State: (1, 3, 2), Action: 4, Reward: -5
Step: 12, State: (0, 3, 2), Action: 0, Reward: -5
Step: 13, State: (0, 3, 2), Action: 1, Reward: -6
Step: 14, State: (0, 4, 2), Action: 0, Reward: -5
Step: 15, State: (0, 3, 2), Action: 1, Reward: -6
Step: 16, State: (0, 4, 2), Action: 2, Reward: -5
Step: 17, State: (0, 4, 1), Action: 2, Reward: -4
Step: 18, State: (0, 4, 0), Action: 4, Reward: -4
Step: 19, State: (0, 4, 0), Action: 0, Reward: -3
Step: 20, State: (0, 3, 0), Action: 2, R

KeyboardInterrupt: 