In [14]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch import amp
import random
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import os
import math
import json
from datetime import datetime, timedelta

# CUDA 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"사용 중인 디바이스: {device}")

# 하이퍼파라미터 설정 (Rainbow DQN)
GAMMA = 0.99
LR = 1e-4
BATCH_SIZE = 256
MEMORY_SIZE = 10000
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 500
TARGET_UPDATE = 10
ALPHA = 0.6  # Prioritized Replay의 alpha
BETA_START = 0.4
BETA_FRAMES = 1000
tidal_data_cache = {}

# 1. 그리드 맵 로드
grid = np.load(r'C:/baramproject/sibal/land_sea_grid_cartopy_downsized.npy')
n_rows, n_cols = grid.shape  # 270x236

# 2. 좌표 변환 함수
def latlon_to_grid(lat, lon, lat_min=30, lat_max=38, lon_min=120, lon_max=127):
    row = int((lat_max - lat) / (lat_max - lat_min) * n_rows)
    col = int((lon - lon_min) / (lon_max - lon_min) * n_cols)
    return min(max(row, 0), n_rows-1), min(max(col, 0), n_cols-1)

# 시작점과 도착점 설정
start_lat, start_lon = 37.46036, 126.52360  # 인천항
end_lat, end_lon = 30.62828, 122.06400     # 상하이항
start_pos = latlon_to_grid(start_lat, start_lon)
end_pos = latlon_to_grid(end_lat, end_lon)

# 3. 유클리드 거리 계산 함수
def euclidean_distance(pos1, pos2):
    return np.sqrt((pos1[0] - pos2[0])**2 + (pos1[1] - pos2[1])**2)

# 4. 조류 데이터 로드 함수
def load_tidal_data(time_str):
    if time_str in tidal_data_cache:
        return tidal_data_cache[time_str]
    file_path = f"C:/baramproject/tidal_database/tidal_{time_str}.json"
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"파일 {file_path}가 존재하지 않습니다.")
    with open(file_path, 'r') as f:
        data = json.load(f)
    tidal_data_cache[time_str] = data['result']['data']
    return tidal_data_cache[time_str]

# 5. 조류 데이터를 그리드에 매핑
def map_tidal_to_grid(tidal_data, n_rows, n_cols):
    current_grid = np.zeros((n_rows, n_cols, 2))  # [방향, 속도]
    for entry in tidal_data:
        lat = float(entry['pre_lat'])
        lon = float(entry['pre_lon'])
        row, col = latlon_to_grid(lat, lon)
        if 0 <= row < n_rows and 0 <= col < n_cols and grid[row, col] == 0:
            direction = float(entry['current_dir'])
            speed = float(entry['current_speed'])
            current_grid[row, col] = [direction, speed]
    return current_grid

# 6. 환경 클래스 정의
class NavigationEnv:
    def __init__(self, grid, start_pos, end_pos, tidal_database_path, max_steps=300, step_time_minutes=12):
        self.grid = grid
        self.start_pos = start_pos
        self.end_pos = end_pos
        self.current_pos = start_pos
        self.visit_count = {}
        self.prev_action = None
        self.tidal_database_path = tidal_database_path
        self.max_steps = max_steps
        self.step_time_minutes = step_time_minutes
        self.current_time = None
        self.current_tidal_data = None
        self.cumulative_time = 0

    def reset(self, start_time=None):
        if start_time is None:
            start_time = self._random_start_time()
        self.current_pos = self.start_pos
        self.visit_count = {}
        self.prev_action = None
        self.current_time = start_time
        self.cumulative_time = 0
        tidal_data = load_tidal_data(self.current_time.strftime("%Y%m%d_%H%M"))
        self.current_tidal_data = map_tidal_to_grid(tidal_data, n_rows, n_cols)
        return self.get_state()

    def _random_start_time(self):
        start_datetime = datetime(2018, 1, 1, 0, 0)
        end_datetime = datetime(2018, 12, 28, 0, 0)
        total_minutes = int((end_datetime - start_datetime).total_seconds() / 60)
        random_minutes = random.randint(0, total_minutes // 30) * 30
        return start_datetime + timedelta(minutes=random_minutes)

    def get_state(self):
        rel_row = (self.current_pos[0] - self.start_pos[0]) / n_rows
        rel_col = (self.current_pos[1] - self.start_pos[1]) / n_cols
        dist_to_end = euclidean_distance(self.current_pos, self.end_pos) / max(n_rows, n_cols)
        dx = self.end_pos[0] - self.current_pos[0]
        dy = self.end_pos[1] - self.current_pos[1]
        angle_to_goal = math.atan2(dy, dx) / math.pi

        directions = [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]
        tidal_dirs = []
        tidal_speeds = []
        for d in directions:
            r = self.current_pos[0] + d[0]
            c = self.current_pos[1] + d[1]
            if 0 <= r < n_rows and 0 <= c < n_cols:
                tidal_dirs.append(self.current_tidal_data[r, c, 0] / 360.0)
                tidal_speeds.append(self.current_tidal_data[r, c, 1] / 10.0)
            else:
                tidal_dirs.append(0.0)
                tidal_speeds.append(0.0)

        return np.concatenate(([rel_row, rel_col, dist_to_end, angle_to_goal], tidal_dirs, tidal_speeds))

    def get_action_deltas(self):
        dx = self.end_pos[0] - self.current_pos[0]
        dy = self.end_pos[1] - self.current_pos[1]
        angle_to_goal = math.atan2(dy, dx)
        angles = [0, math.pi/4, -math.pi/4, math.pi/2, -math.pi/2, 3*math.pi/4, -3*math.pi/4, math.pi]
        action_deltas = []
        for angle_offset in angles:
            target_angle = angle_to_goal + angle_offset
            delta_row = round(-math.cos(target_angle))
            delta_col = round(math.sin(target_angle))
            action_deltas.append((delta_row, delta_col))
        return action_deltas

    def step(self, action):
        action_deltas = self.get_action_deltas()
        move = action_deltas[action]
        new_pos = (self.current_pos[0] + move[0], self.current_pos[1] + move[1])

        reward = 0
        done = False
        if (0 <= new_pos[0] < n_rows and 0 <= new_pos[1] < n_cols and self.grid[new_pos] == 0):
            prev_dist = euclidean_distance(self.current_pos, self.end_pos)
            self.current_pos = new_pos
            new_dist = euclidean_distance(self.current_pos, self.end_pos)

            dist_change = prev_dist - new_dist
            reward += dist_change * 50
            reward -= 1

            row, col = self.current_pos
            current_dir = self.current_tidal_data[row, col, 0] * math.pi / 180
            current_speed = self.current_tidal_data[row, col, 1]
            move_angle = math.atan2(move[1], move[0]) if move != (0, 0) else 0
            angle_diff = abs((current_dir - move_angle + math.pi) % (2 * math.pi) - math.pi)
            if angle_diff < math.pi / 4:
                reward += current_speed * 2
            elif angle_diff > 3 * math.pi / 4:
                reward -= current_speed * 1.5

            if new_dist < 5:
                reward += 200 * (1 - new_dist / 5)
            if new_dist < 1:
                reward += 1000
                done = True

            pos_tuple = tuple(self.current_pos)
            self.visit_count[pos_tuple] = self.visit_count.get(pos_tuple, 0) + 1
            if self.visit_count[pos_tuple] > 1:
                reward -= 10 * self.visit_count[pos_tuple]

            self.prev_action = move
            self.cumulative_time += self.step_time_minutes
            if self.cumulative_time >= 30:
                self.current_time += timedelta(minutes=30)
                tidal_data = load_tidal_data(self.current_time.strftime("%Y%m%d_%H%M"))
                self.current_tidal_data = map_tidal_to_grid(tidal_data, n_rows, n_cols)
                self.cumulative_time -= 30
        else:
            reward = -50
            done = False

        return self.get_state(), reward, done

# 7. Rainbow DQN 모델 정의 (Dueling 구조 사용)
class DuelingDQN(nn.Module):
    def __init__(self, input_dim=20, output_dim=8):
        super(DuelingDQN, self).__init__()
        self.feature = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.value = nn.Linear(64, 1)
        self.advantage = nn.Linear(64, output_dim)

    def forward(self, x):
        x = self.feature(x)
        value = self.value(x)
        advantage = self.advantage(x)
        return value + (advantage - advantage.mean(dim=1, keepdim=True))

# 8. Prioritized Replay Memory (Rainbow DQN의 PER)
class PrioritizedReplay:
    def __init__(self, capacity, alpha=0.6):
        self.capacity = capacity
        self.alpha = alpha
        self.memory = []
        self.priorities = []
        self.pos = 0

    def push(self, error, sample):
        if len(self.memory) < self.capacity:
            self.memory.append(sample)
            self.priorities.append(error)
        else:
            self.memory[self.pos] = sample
            self.priorities[self.pos] = error
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        priorities = np.array(self.priorities)
        probs = priorities ** self.alpha
        probs /= probs.sum()
        indices = np.random.choice(len(self.memory), BATCH_SIZE, p=probs)
        samples = [self.memory[idx] for idx in indices]
        weights = (len(self.memory) * probs[indices]) ** (-beta)
        weights /= weights.max()
        return samples, indices, weights

    def update_priorities(self, indices, errors):
        for idx, error in zip(indices, errors):
            self.priorities[idx] = error

    def __len__(self):
        return len(self.memory)

# 9. Rainbow DQN 에이전트
class RainbowAgent:
    def __init__(self, state_dim=20, action_dim=8):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.memory = PrioritizedReplay(MEMORY_SIZE, ALPHA)
        self.policy_net = DuelingDQN(state_dim, action_dim).to(device)
        self.target_net = DuelingDQN(state_dim, action_dim).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LR)
        self.steps = 0
        self.scaler = amp.GradScaler('cuda')

    def select_action(self, state, epsilon):
        if random.random() < epsilon:
            return random.randint(0, self.action_dim - 1)
        else:
            with torch.no_grad():
                return self.policy_net(state).squeeze(0).argmax().item()

    def optimize_model(self, beta):
        if len(self.memory) < BATCH_SIZE:
            return
        samples, indices, weights = self.memory.sample(BATCH_SIZE, beta)
        states, actions, rewards, next_states, dones = zip(*samples)
        states = torch.stack(states).to(device).squeeze(1)
        actions = torch.LongTensor(actions).to(device).unsqueeze(1)
        rewards = torch.FloatTensor(rewards).to(device)
        next_states = torch.stack(next_states).to(device).squeeze(1)
        dones = torch.FloatTensor(dones).to(device)
        weights = torch.FloatTensor(weights).to(device)

        self.optimizer.zero_grad()
        with amp.autocast('cuda'):  # Mixed Precision 연산 시작
            q_values = self.policy_net(states).gather(1, actions).squeeze(1)
            next_q_values = self.target_net(next_states).max(1)[0]
            targets = rewards + GAMMA * next_q_values * (1 - dones)
            loss = (weights * (q_values - targets) ** 2).mean()

        self.scaler.scale(loss).backward()  # 손실 스케일링 후 역전파
        self.scaler.step(self.optimizer)    # 옵티마이저 업데이트
        self.scaler.update()                # 스케일러 업데이트

        errors = (q_values - targets).abs().detach().cpu().numpy()
        self.memory.update_priorities(indices, errors)

    def update_target(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

# 10. 학습 설정
tidal_database_path = r"C:\baramproject\tidal_database"
env = NavigationEnv(grid, start_pos, end_pos, tidal_database_path, max_steps=300)
agent = RainbowAgent(state_dim=20, action_dim=8)

# 모델 로드 또는 새로 초기화
model_path = r'C:\baramproject\trained_model\sibal16\navigation_model.pth'
if os.path.exists(model_path):
    try:
        agent.policy_net.load_state_dict(torch.load(model_path))
        agent.target_net.load_state_dict(agent.policy_net.state_dict())
        print(f"기존 모델 '{model_path}'를 불러왔습니다. 재학습을 시작합니다.")
        epsilon = 0.3
        num_episodes = 300
    except Exception as e:
        print(f"모델 로드 실패: {e}. 새로 학습을 시작합니다.")
        agent.target_net.load_state_dict(agent.policy_net.state_dict())
        epsilon = 1.0
        num_episodes = 2000
else:
    print(f"모델 파일 '{model_path}'가 없습니다. 새로 학습을 시작합니다.")
    agent.target_net.load_state_dict(agent.policy_net.state_dict())
    epsilon = EPSILON_START
    num_episodes = 20000

# 이미지 저장 디렉토리 설정
save_dir = r'C:\baramproject\trained_model\sibal16\episode_debug'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    print(f"디렉토리 '{save_dir}'를 생성했습니다.")

# 11. 학습 루프
rewards = []
path_lengths = []
progress_bar = tqdm(range(num_episodes), desc="학습 진행률")
beta = BETA_START

for episode in progress_bar:
    state = env.reset()
    state = torch.FloatTensor(state).to(device).unsqueeze(0)
    total_reward = 0
    done = False
    path = [env.current_pos]
    steps = 0

    while not done and steps < env.max_steps:
        action = agent.select_action(state, epsilon)
        next_state, reward, done = env.step(action)
        next_state = torch.FloatTensor(next_state).to(device).unsqueeze(0)
        total_reward += reward
        path.append(env.current_pos)

        with torch.no_grad():
            q_value = agent.policy_net(state)[0, action]
            next_q_value = agent.target_net(next_state).max(dim=1)[0]
            target = reward + GAMMA * next_q_value * (1 - done)
            error = abs(q_value - target).item()
        agent.memory.push(error, (state, action, reward, next_state, done))
        state = next_state

        beta = min(1.0, BETA_START + steps * (1.0 - BETA_START) / BETA_FRAMES)
        agent.optimize_model(beta)

        if steps % TARGET_UPDATE == 0:
            agent.update_target()

        steps += 1

    epsilon = max(EPSILON_END, epsilon - (EPSILON_START - EPSILON_END) / EPSILON_DECAY)
    rewards.append(total_reward)
    path_lengths.append(len(path))
    progress_bar.set_postfix({'Reward': total_reward, 'Path Length': len(path)})

    if episode % 100 == 0 and episode > 0:
        print(f"Episode {episode} - 중간 경로 시각화 (Start: {start_pos}, End: {end_pos})")
        plt.figure(figsize=(10, 8))
        plt.imshow(grid, cmap='gray', origin='upper')
        path_rows = [pos[0] for pos in path]
        path_cols = [pos[1] for pos in path]
        plt.plot(path_cols, path_rows, 'r-', linewidth=2)
        plt.plot(start_pos[1], start_pos[0], 'go')
        plt.plot(end_pos[1], end_pos[0], 'bo')
        plt.title(f"Path Visualization at Episode {episode}")
        save_path = os.path.join(save_dir, f"episode2_{episode}.png")
        plt.savefig(save_path)
        print(f"이미지 저장: {save_path}")
        plt.close()

# 12. 모델 저장
torch.save(agent.policy_net.state_dict(), r'C:\baramproject\trained_model\sibal16\navigation_model.pth')
print("모델이 'navigation_model.pth' 파일로 저장되었습니다.")

# 13. 학습 결과 시각화
plt.figure(figsize=(10, 5))
plt.plot(rewards, label='Total Reward')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Learning Reward Graph')
plt.legend()
plt.show()

plt.figure(figsize=(10, 5))
plt.plot(path_lengths, label='Path Length')
plt.xlabel('Episode')
plt.ylabel('Path Length')
plt.title('Learning Path Length Graph')
plt.legend()
plt.show()

# 14. 최적 경로 시각화 함수
def plot_path(path, start_pos, end_pos, grid):
    path_rows = [pos[0] for pos in path]
    path_cols = [pos[1] for pos in path]
    plt.figure(figsize=(10, 8))
    plt.imshow(grid, cmap='gray', origin='upper')
    plt.plot(path_cols, path_rows, 'r-', linewidth=2, label='Optimal Path')
    plt.plot(start_pos[1], start_pos[0], 'go', label=f'Start {start_pos}')
    plt.plot(end_pos[1], end_pos[0], 'bo', label=f'End {end_pos}')
    lon_ticks = np.linspace(0, n_cols, 8)
    lon_labels = np.linspace(120, 127, 8)
    plt.xticks(lon_ticks, [f"{lon:.1f}°E" for lon in lon_labels])
    lat_ticks = np.linspace(0, n_rows, 9)
    lat_labels = np.linspace(38, 30, 9)
    plt.yticks(lat_ticks, [f"{lat:.1f}°N" for lat in lat_labels])
    plt.xlabel('Longitude Grid')
    plt.ylabel('Latitude Grid')
    plt.title('Optimal Navigation Path')
    plt.legend()
    plt.grid(True)
    plt.show()

# 최적 경로 시각화
plot_path(path, start_pos, end_pos, grid)

사용 중인 디바이스: cuda
모델 파일 'C:\baramproject\trained_model\sibal16\navigation_model.pth'가 없습니다. 새로 학습을 시작합니다.


학습 진행률:   0%|          | 0/20000 [00:00<?, ?it/s]

Episode 100 - 중간 경로 시각화 (Start: (18, 219), End: (248, 69))
이미지 저장: C:\baramproject\trained_model\sibal16\episode_debug\episode2_100.png
Episode 200 - 중간 경로 시각화 (Start: (18, 219), End: (248, 69))
이미지 저장: C:\baramproject\trained_model\sibal16\episode_debug\episode2_200.png
Episode 300 - 중간 경로 시각화 (Start: (18, 219), End: (248, 69))
이미지 저장: C:\baramproject\trained_model\sibal16\episode_debug\episode2_300.png
Episode 400 - 중간 경로 시각화 (Start: (18, 219), End: (248, 69))
이미지 저장: C:\baramproject\trained_model\sibal16\episode_debug\episode2_400.png
Episode 500 - 중간 경로 시각화 (Start: (18, 219), End: (248, 69))
이미지 저장: C:\baramproject\trained_model\sibal16\episode_debug\episode2_500.png
Episode 600 - 중간 경로 시각화 (Start: (18, 219), End: (248, 69))
이미지 저장: C:\baramproject\trained_model\sibal16\episode_debug\episode2_600.png
Episode 700 - 중간 경로 시각화 (Start: (18, 219), End: (248, 69))
이미지 저장: C:\baramproject\trained_model\sibal16\episode_debug\episode2_700.png
Episode 800 - 중간 경로 시각화 (Start: (18, 219), End: 

  probs /= probs.sum()


ValueError: probabilities contain NaN