In [2]:
import numpy as np
import json
import random
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import os
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from torch.amp import autocast, GradScaler

# Hyperparameter definitions
GRID_SIZE = 100  # side length of the square grid map, in cells
NUM_OBSTACLES = int(GRID_SIZE * GRID_SIZE * 0.00555)  # obstacle placements per map (~0.555% of the cells)
LAT_RANGE = (33.0, 38.0)  # (min, max) latitude mapped onto the grid
LON_RANGE = (120.0, 127.0)  # (min, max) longitude mapped onto the grid
TRAINING_EPISODES = 5000
BATCH_SIZE = 256
GAMMA = 0.995  # discount factor used in the TD target
EPSILON_START = 1.0
EPSILON_END = 0.05
EPSILON_DECAY = 0.9995  # multiplicative epsilon decay, applied once per episode
LEARNING_RATE = 0.0001
SAVE_INTERVAL = 1000  # number of episodes between intermediate model saves
MAX_STEPS = GRID_SIZE * 2  # step limit per episode

# File paths
TIDAL_BASE_PATH = "C:/baramproject/tidal_database"
CHECKPOINT_PATH = "C:/baramproject/trained_model/sibal/checkpoint.pth"
LOAD_MODEL_PATH = None  # set to a checkpoint/weights file to resume; None trains from scratch

# Start and goal coordinates as ((lat, lon), (lat, lon)) pairs;
# the training loop alternates between the two entries by episode parity.
START_GOAL_COORDS = [
    ((37.460359, 126.623605), (33.56036, 120.57860)),
    ((33.56036, 120.57860), (37.460359, 126.623605))
]

# CUDA 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("using Cuda")
else:
    print("Cuda not available, using CPU")
torch.backends.cudnn.benchmark = True
scaler = GradScaler('cuda')

# 격자 맵 생성 및 유틸리티 함수 (변경 없음)
def create_grid_map(size=GRID_SIZE, num_obstacles=NUM_OBSTACLES):
    """Return a ``size`` x ``size`` float32 grid with random obstacle cells.

    Free cells hold 0 and obstacle cells hold 1. ``num_obstacles`` cells are
    drawn independently, so the same cell can be drawn more than once and the
    grid may end up with fewer distinct obstacles than requested.
    """
    occupancy = np.zeros((size, size), dtype=np.float32)
    last_index = size - 1
    # Draw every (first, second) index pair up front, in the same RNG order
    # as drawing them one placement at a time.
    cells = [
        (random.randint(0, last_index), random.randint(0, last_index))
        for _ in range(num_obstacles)
    ]
    for first, second in cells:
        occupancy[first, second] = 1
    return occupancy

def latlon_to_grid(lat, lon, lat_range=LAT_RANGE, lon_range=LON_RANGE, grid_size=GRID_SIZE):
    """Convert a latitude/longitude pair to integer grid coordinates ``(x, y)``.

    ``x`` grows with longitude from ``lon_range[0]``; ``y`` grows as latitude
    decreases from ``lat_range[1]``, so ``y == 0`` is the northern edge.
    Fractions are truncated with ``int()``; inputs outside the ranges are not
    clamped and yield indices outside the grid.
    """
    south, north = lat_range
    west, east = lon_range
    last_index = grid_size - 1
    col = int((lon - west) / (east - west) * last_index)
    row = int((north - lat) / (north - south) * last_index)
    return col, row

# Compass labels indexed by action id (0 = N, continuing clockwise).
directions = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW']
action_to_direction = dict(enumerate(directions))

def move(position, action):
    """Return the grid cell reached from ``position`` by taking ``action``.

    Actions 0..7 step N, NE, E, SE, S, SW, W, NW; north is decreasing ``y``
    and east is increasing ``x``. No bounds checking is done here. An unknown
    action raises ``KeyError``.
    """
    col, row = position
    offsets = dict(enumerate([
        (0, -1), (1, -1), (1, 0), (1, 1),
        (0, 1), (-1, 1), (-1, 0), (-1, -1),
    ]))
    step_x, step_y = offsets[action]
    return col + step_x, row + step_y

def load_tidal_data(file_path):
    """Load a tidal JSON file and return its ``result.data`` payload.

    The file is decoded explicitly as UTF-8 rather than with the platform
    default encoding, so the result does not depend on the machine's locale
    (e.g. cp949 on a Korean Windows installation).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object stored under ``data['result']['data']``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the ``result`` / ``data`` keys are missing.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['result']['data']

def get_random_tidal_file(base_path=TIDAL_BASE_PATH):
    """Return the path of a randomly chosen half-hourly tidal file from 2018.

    A 30-minute slot between 2018-01-01 00:00 and 2018-12-31 23:30 (both
    inclusive) is picked uniformly and formatted as
    ``tidal_YYYYMMDD_HHMM.json`` under ``base_path``. The file's existence is
    not checked.
    """
    first_slot = datetime(2018, 1, 1, 0, 0)
    last_slot = datetime(2018, 12, 31, 23, 30)
    # 1800 seconds = one 30-minute slot; +1 makes the last slot selectable.
    slot_count = int((last_slot - first_slot).total_seconds() / 1800) + 1
    offset = timedelta(minutes=random.randint(0, slot_count - 1) * 30)
    stamp = (first_slot + offset).strftime('%Y%m%d_%H%M')
    return os.path.join(base_path, f"tidal_{stamp}.json")

def get_tidal_effect(position, tidal_data, lat_range=LAT_RANGE, lon_range=LON_RANGE, grid_size=GRID_SIZE):
    """Return the normalised tidal direction and speed nearest to a grid cell.

    The cell is converted back to latitude/longitude, then ``tidal_data`` is
    scanned linearly for the entry with the smallest squared lat/lon distance
    (the first one wins on ties).

    Returns:
        ``(current_dir / 360.0, current_speed / 100.0)`` of the nearest entry,
        or ``(0.0, 0.0)`` when ``tidal_data`` is empty.
    """
    col, row = position
    cell_lat = lat_range[1] - (row / (grid_size - 1)) * (lat_range[1] - lat_range[0])
    cell_lon = lon_range[0] + (col / (grid_size - 1)) * (lon_range[1] - lon_range[0])

    best_sq_dist = float('inf')
    nearest_dir = 0
    nearest_speed = 0
    for record in tidal_data:
        record_lat = float(record['pre_lat'])
        record_lon = float(record['pre_lon'])
        sq_dist = (cell_lat - record_lat) ** 2 + (cell_lon - record_lon) ** 2
        if not (sq_dist < best_sq_dist):
            continue
        best_sq_dist = sq_dist
        nearest_dir = float(record['current_dir'])
        nearest_speed = float(record['current_speed'])
    return nearest_dir / 360.0, nearest_speed / 100.0

def get_adjacent_tidal_info(position, tidal_data, grid_size=GRID_SIZE):
    """Collect normalised tidal direction/speed for the 8 neighbouring cells.

    Neighbours are visited in the fixed offset order W, SW, S, SE, E, NE, N,
    NW (with north being decreasing ``y``). Neighbours outside the grid
    contribute ``0.0`` for both values.

    ``grid_size`` is forwarded to ``get_tidal_effect`` so the bounds check and
    the cell-to-lat/lon conversion always agree on the grid dimensions.

    Args:
        position: ``(x, y)`` grid cell.
        tidal_data: Sequence of tidal entries accepted by ``get_tidal_effect``.
        grid_size: Side length of the square grid.

    Returns:
        ``(tidal_dirs, tidal_speeds)``: two lists of 8 floats each.
    """
    x, y = position
    # Named so it does not shadow the module-level ``directions`` label list.
    neighbour_offsets = [(-1, 0), (-1, 1), (0, 1), (1, 1), (1, 0), (1, -1), (0, -1), (-1, -1)]
    tidal_dirs = []
    tidal_speeds = []
    for dx, dy in neighbour_offsets:
        adj_pos = (x + dx, y + dy)
        if 0 <= adj_pos[0] < grid_size and 0 <= adj_pos[1] < grid_size:
            dir_, speed_ = get_tidal_effect(adj_pos, tidal_data, grid_size=grid_size)
        else:
            dir_, speed_ = 0.0, 0.0
        tidal_dirs.append(dir_)
        tidal_speeds.append(speed_)
    return tidal_dirs, tidal_speeds

def get_obstacle_info(position, grid, window_size=5):
    """Return obstacle flags for the square window centred on ``position``.

    The window spans ``window_size // 2`` cells in every direction (the
    default of 5 gives the 5x5 neighbourhood, i.e. 24 values), scanned with
    the x offset in the outer loop and the y offset in the inner loop; the
    centre cell is skipped. Cells outside the grid are reported as obstacles.
    Bounds come from ``grid.shape`` so grids of any size are handled.

    Args:
        position: ``(x, y)`` centre cell.
        grid: 2-D array indexed as ``grid[x, y]``; non-zero entries are
            truncated to int and reported as-is.
        window_size: Side length of the window. An even value behaves like
            the next odd value (4 behaves like 5).

    Returns:
        List of ints (1 = obstacle or off-grid, 0 = free).
    """
    x, y = position
    size_x, size_y = grid.shape[0], grid.shape[1]
    half = window_size // 2
    obstacles = []
    for dx in range(-half, half + 1):
        for dy in range(-half, half + 1):
            if dx == 0 and dy == 0:
                continue
            adj_pos = (x + dx, y + dy)
            if 0 <= adj_pos[0] < size_x and 0 <= adj_pos[1] < size_y:
                obstacles.append(int(grid[adj_pos]))
            else:
                obstacles.append(1)
    return obstacles

def calculate_bearing_diff(current_dir, goal_pos, current_pos):
    """Return the normalised angle between a heading and the bearing to the goal.

    ``current_dir`` is an action index whose heading is ``current_dir * 45``
    degrees, measured clockwise from north (0 = N, 2 = E, 4 = S, 6 = W), the
    same convention ``move`` uses. The bearing to the goal is computed in that
    same compass convention: grid ``y`` decreases towards north, so the
    bearing is ``atan2(dx, -dy)``. Using ``atan2(dy, dx)`` would measure from
    east instead and be 90 degrees out of phase with the heading.

    Args:
        current_dir: Action index (0..7).
        goal_pos: ``(x, y)`` goal cell.
        current_pos: ``(x, y)`` cell the bearing is measured from.

    Returns:
        Value in ``[0, 1]``: 0 when heading straight at the goal, 1 when
        heading directly away from it.
    """
    dx_goal = goal_pos[0] - current_pos[0]
    dy_goal = goal_pos[1] - current_pos[1]
    # Compass bearing: 0 deg = north (-y), 90 deg = east (+x), clockwise.
    bearing_to_goal = np.arctan2(dx_goal, -dy_goal) * 180 / np.pi
    if bearing_to_goal < 0:
        bearing_to_goal += 360
    angle_diff = abs(current_dir * 45 - bearing_to_goal)
    # Fold onto the shorter way around the circle.
    angle_diff = min(angle_diff, 360 - angle_diff)
    return angle_diff / 180.0

# 보상 함수
def calculate_reward(grid, current_pos, action, next_pos, prev_action, tidal_data, goal, step_count, max_steps=MAX_STEPS):
    """Compute the reward for moving from ``current_pos`` to ``next_pos``.

    Terminal cases return immediately: -50 for leaving the map or entering an
    obstacle cell, 100 for reaching the goal. Otherwise the reward is the sum
    of a tidal alignment term, a goal-bearing term, a distance-progress term,
    a per-step penalty, a near-obstacle penalty, a sharp-turn penalty, a
    timeout penalty and a small random exploration bonus.

    ``get_tidal_effect`` returns the tidal direction divided by 360, so it is
    scaled back to degrees before being compared with the ship heading
    (``action * 45`` degrees); comparing the normalised value directly would
    make the alignment term depend almost only on the action index.

    Args:
        grid: Square 2-D array indexed ``grid[x, y]``; 1 marks an obstacle.
        current_pos: ``(x, y)`` cell before the move.
        action: Action index (0..7) that was taken.
        next_pos: ``(x, y)`` cell after the move.
        prev_action: Previous action index, or ``None`` on the first step.
        tidal_data: Tidal entries accepted by ``get_tidal_effect``.
        goal: ``(x, y)`` goal cell.
        step_count: Number of steps taken so far in the episode.
        max_steps: Step count at or above which the timeout penalty applies.

    Returns:
        The scalar reward.
    """
    size = grid.shape[0]
    # Collision or leaving the map.
    if (next_pos[0] < 0 or next_pos[0] >= size or
            next_pos[1] < 0 or next_pos[1] >= size or
            grid[next_pos] == 1):
        return -50
    # Goal reached.
    if next_pos == goal:
        return 100

    # Penalty when any of the 8 neighbours of the new cell is an obstacle.
    near_obstacle = any(
        grid[next_pos[0] + dx, next_pos[1] + dy] == 1
        for dx in (-1, 0, 1)
        for dy in (-1, 0, 1)
        if (dx != 0 or dy != 0)
        and 0 <= next_pos[0] + dx < size
        and 0 <= next_pos[1] + dy < size
    )
    obstacle_penalty = -10 if near_obstacle else 0

    # Penalty for turning by more than 45 degrees between consecutive steps.
    direction_penalty = 0
    if prev_action is not None:
        angle_diff = abs(action - prev_action) * 45
        angle_diff = min(angle_diff, 360 - angle_diff)
        if angle_diff > 45:
            direction_penalty = -2

    # Reward for heading towards the goal.
    bearing_diff = calculate_bearing_diff(action, goal, next_pos)
    bearing_reward = (1 - bearing_diff) * 10

    # Tidal reward/penalty: positive when moving with the current.
    # NOTE(review): assumes ``current_dir`` in the data is a compass angle in
    # degrees, matching the ship heading convention -- confirm against the
    # data source.
    tidal_dir, tidal_speed = get_tidal_effect(next_pos, tidal_data)
    tidal_dir_deg = tidal_dir * 360.0  # undo the /360 normalisation
    ship_dir = action * 45
    tidal_angle_diff = abs(tidal_dir_deg - ship_dir)
    if tidal_angle_diff > 180:
        tidal_angle_diff = 360 - tidal_angle_diff
    tidal_reward = tidal_speed * np.cos(np.radians(tidal_angle_diff))

    # Reward for reducing the Euclidean distance to the goal.
    current_distance = np.sqrt((current_pos[0] - goal[0])**2 + (current_pos[1] - goal[1])**2)
    next_distance = np.sqrt((next_pos[0] - goal[0])**2 + (next_pos[1] - goal[1])**2)
    distance_reward = (current_distance - next_distance) * 2

    # Constant per-step penalty.
    step_penalty = -0.1
    # Timeout penalty.
    timeout_penalty = -100 if step_count >= max_steps else 0
    # Random bonus (10% of the time) to encourage exploration.
    exploration_bonus = 0.1 if random.random() < 0.1 else 0

    # Combined reward.
    total_reward = (tidal_reward + bearing_reward + distance_reward +
                    step_penalty + obstacle_penalty + direction_penalty +
                    timeout_penalty + exploration_bonus)
    return total_reward

# Dueling DQN 모델 정의
class DuelingDQN(nn.Module):
    """Dueling Q-network: a shared MLP trunk feeding value and advantage heads.

    The trunk takes a 45-feature state through three ReLU layers
    (512, 512, 256 units), each followed by dropout with p = 0.05. The Q-values
    are ``V(s) + (A(s, a) - mean_a A(s, a))``.
    """

    def __init__(self, num_actions=8):
        super().__init__()
        # Attribute names and registration order define the state_dict layout.
        self.fc1 = nn.Linear(45, 512)
        self.dropout1 = nn.Dropout(0.05)
        self.fc2 = nn.Linear(512, 512)
        self.dropout2 = nn.Dropout(0.05)
        self.fc3 = nn.Linear(512, 256)
        self.dropout3 = nn.Dropout(0.05)
        self.value_stream = nn.Linear(256, 1)
        self.advantage_stream = nn.Linear(256, num_actions)

    def forward(self, x):
        """Return Q-values of shape ``(batch, num_actions)`` for a 2-D input batch."""
        trunk = (
            (self.fc1, self.dropout1),
            (self.fc2, self.dropout2),
            (self.fc3, self.dropout3),
        )
        hidden = x
        for linear, drop in trunk:
            hidden = drop(torch.relu(linear(hidden)))
        state_value = self.value_stream(hidden)
        action_adv = self.advantage_stream(hidden)
        centred_adv = action_adv - action_adv.mean(dim=1, keepdim=True)
        return state_value + centred_adv

# Prioritized Replay Buffer 정의
class PrioritizedReplayBuffer:
    """Fixed-capacity ring buffer with priority-proportional sampling.

    Experiences are ``(state, action, reward, next_state, done)`` tuples. New
    entries get the current maximum priority; sampling probability is
    proportional to ``priority ** alpha``.
    """

    def __init__(self, capacity=50000, alpha=0.6):
        self.capacity = capacity
        self.alpha = alpha
        self.buffer = []
        self.priorities = []
        self.pos = 0  # next slot to overwrite once the buffer is full
        self.max_priority = 1.0

    def add(self, experience):
        """Store ``experience``, overwriting the oldest slot when full."""
        if len(self.buffer) >= self.capacity:
            self.buffer[self.pos] = experience
            self.priorities[self.pos] = self.max_priority
        else:
            self.buffer.append(experience)
            self.priorities.append(self.max_priority)
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        """Draw ``batch_size`` experiences (with replacement) by priority.

        Returns tensors on the module-level ``device`` plus the sampled
        indices and the importance-sampling weights, normalised so the largest
        weight is 1:
        ``(states, actions, rewards, next_states, dones, indices, weights)``.
        """
        scaled = np.array(self.priorities, dtype=np.float32) ** self.alpha
        scaled /= scaled.sum()
        probs = scaled
        stored = len(self.buffer)
        indices = np.random.choice(stored, batch_size, p=probs)

        weights = (stored * probs[indices]) ** (-beta)
        weights /= weights.max()
        weights = torch.FloatTensor(weights).to(device)

        batch = [self.buffer[i] for i in indices]
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.FloatTensor(np.array(states)).to(device)
        actions = torch.LongTensor(actions).to(device)
        rewards = torch.FloatTensor(rewards).to(device)
        next_states = torch.FloatTensor(np.array(next_states)).to(device)
        # Convert the done flags to floats explicitly.
        dones = torch.FloatTensor([float(flag) for flag in dones]).to(device)

        return states, actions, rewards, next_states, dones, indices, weights

    def update_priorities(self, indices, td_errors):
        """Set each sampled slot's priority to ``|td_error| + 1e-5``, capped at ``max_priority``."""
        for slot, err in zip(indices, td_errors):
            capped = min(abs(err) + 1e-5, self.max_priority)
            self.priorities[slot] = capped
            self.max_priority = max(self.max_priority, capped)

    def __len__(self):
        """Return the number of stored experiences."""
        return len(self.buffer)

# 학습 함수
def _build_state(position, goal, heading, tidal_data, grid):
    """Assemble the 45-feature observation for the agent at ``position``.

    Layout: normalised position (2), normalised goal (2), bearing difference
    for ``heading`` (1), neighbouring tidal directions (8), neighbouring tidal
    speeds (8) and the 5x5 obstacle window without its centre (24).
    """
    tidal_dirs, tidal_speeds = get_adjacent_tidal_info(position, tidal_data)
    obstacle_info = get_obstacle_info(position, grid)
    bearing_diff = calculate_bearing_diff(heading, goal, position)
    return [
        position[0] / GRID_SIZE, position[1] / GRID_SIZE,
        goal[0] / GRID_SIZE, goal[1] / GRID_SIZE,
        bearing_diff,
    ] + tidal_dirs + tidal_speeds + obstacle_info


# Training function
def train_dqn(episodes=TRAINING_EPISODES, batch_size=BATCH_SIZE, gamma=GAMMA, 
              epsilon_start=EPSILON_START, epsilon_end=EPSILON_END, 
              epsilon_decay=EPSILON_DECAY, save_interval=SAVE_INTERVAL, 
              tidal_base_path=TIDAL_BASE_PATH, load_model_path=LOAD_MODEL_PATH, 
              checkpoint_path=CHECKPOINT_PATH):
    """Train the Dueling Double-DQN agent and return the online network.

    Every episode loads a random tidal file, generates a random obstacle map
    and alternates between the two start/goal pairs in ``START_GOAL_COORDS``.
    Actions are epsilon-greedy; transitions go into a prioritized replay
    buffer and one mixed-precision optimisation step is run per environment
    step once the buffer holds ``batch_size`` transitions. The target network
    is synced every 50 episodes and the model is saved every
    ``save_interval`` episodes and at the end.

    An episode ends when the goal is reached, when the agent collides with an
    obstacle or leaves the map, or after ``MAX_STEPS`` steps. The agent's
    position is tracked as an integer cell rather than being decoded from the
    normalised float state (``int(0.29 * 100)`` is 28, not 29).

    Args:
        episodes: Episode index at which training stops.
        batch_size: Replay batch size.
        gamma: Discount factor.
        epsilon_start: Initial exploration rate.
        epsilon_end: Lower bound of the exploration rate.
        epsilon_decay: Multiplicative epsilon decay applied per episode.
        save_interval: Episodes between intermediate model saves.
        tidal_base_path: Directory containing the tidal JSON files.
        load_model_path: Optional checkpoint or raw weights file to resume from.
        checkpoint_path: Where the final training checkpoint is written.

    Returns:
        The trained online ``DuelingDQN`` model.
    """
    model_dir = "C:/baramproject/trained_model/sibal/models"

    model = DuelingDQN().to(device)
    target_model = DuelingDQN().to(device)
    # Start the target network as an exact copy of the online network.
    target_model.load_state_dict(model.state_dict())
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    replay_buffer = PrioritizedReplayBuffer()
    start_episode = 0
    epsilon = epsilon_start
    total_rewards = []
    success_rates = []

    # Load a previous model / checkpoint if requested.
    if load_model_path and os.path.exists(load_model_path):
        # NOTE(security): weights_only=False unpickles arbitrary Python
        # objects (needed for the stored replay buffer). Only load files
        # produced by this script or another trusted source.
        checkpoint = torch.load(load_model_path, map_location=device, weights_only=False)
        if 'model_state_dict' in checkpoint:
            model.load_state_dict(checkpoint['model_state_dict'])
            target_model.load_state_dict(checkpoint['target_model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            start_episode = checkpoint['episode']
            epsilon = checkpoint['epsilon']
            # Restore the buffer together with matching priorities and write
            # position; otherwise sampling fails on the length mismatch.
            restored = list(checkpoint['replay_buffer'])[-replay_buffer.capacity:]
            replay_buffer.buffer = restored
            replay_buffer.priorities = [replay_buffer.max_priority] * len(restored)
            replay_buffer.pos = len(restored) % replay_buffer.capacity
            print(f"{load_model_path}에서 체크포인트 로드. {start_episode} 에피소드부터 시작.")
        else:
            model.load_state_dict(checkpoint)
            target_model.load_state_dict(checkpoint)
            # NOTE(review): raw weights are treated as the result of 10000
            # episodes; with episodes <= 10000 no further training happens.
            start_episode = 10000
            epsilon = EPSILON_END
            print(f"{load_model_path}에서 가중치 로드. {start_episode} 에피소드부터 시작.")

    pbar = tqdm(range(start_episode, episodes), desc="Training Progress")

    for episode in pbar:
        tidal_file = get_random_tidal_file(tidal_base_path)
        tidal_data = load_tidal_data(tidal_file)
        grid = create_grid_map()
        start_latlon, goal_latlon = START_GOAL_COORDS[episode % 2]
        position = latlon_to_grid(start_latlon[0], start_latlon[1])
        goal = latlon_to_grid(goal_latlon[0], goal_latlon[1])
        # Start and goal cells must never be obstacles.
        grid[position] = 0
        grid[goal] = 0

        # Initial state (45 features); heading 0 is used before any action.
        state = _build_state(position, goal, 0, tidal_data, grid)

        prev_action = None
        done = False
        total_reward = 0
        success = 0
        step_count = 0

        while not done and step_count < MAX_STEPS:
            if random.random() < epsilon:
                action = random.randint(0, 7)
            else:
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
                # No gradients are needed for action selection.
                with torch.no_grad(), autocast('cuda'):
                    q_values = model(state_tensor)
                action = q_values.argmax().item()

            next_pos = move(position, action)
            reward = calculate_reward(grid, position, action, next_pos, prev_action, tidal_data, goal, step_count)
            total_reward += reward

            reached_goal = next_pos == goal
            if reached_goal:
                success = 1
            in_bounds = (0 <= next_pos[0] < grid.shape[0]
                         and 0 <= next_pos[1] < grid.shape[1])
            collided = (not in_bounds) or grid[next_pos] == 1

            # Next state (45 features).
            next_state = _build_state(next_pos, goal, action, tidal_data, grid)

            # Goal and collision are terminal; the step limit is enforced by
            # the loop condition.
            done = bool(reached_goal or collided)

            replay_buffer.add((state, action, reward, next_state, done))
            state = next_state
            position = next_pos
            prev_action = action
            step_count += 1

            if len(replay_buffer) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = replay_buffer.sample(batch_size)
                optimizer.zero_grad()
                with autocast('cuda'):
                    q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
                    with torch.no_grad():
                        # Double DQN: the online network picks the action,
                        # the target network evaluates its Q-value.
                        next_actions = model(next_states).argmax(1, keepdim=True)
                        next_q_values = target_model(next_states).gather(1, next_actions).squeeze(1)
                        targets = rewards + gamma * next_q_values * (1 - dones)
                    td_errors = (q_values - targets).abs().cpu().detach().numpy()
                    loss = (weights * (q_values - targets) ** 2).mean()
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                replay_buffer.update_priorities(indices, td_errors)

        epsilon = max(epsilon_end, epsilon * epsilon_decay)
        if episode % 50 == 0:
            target_model.load_state_dict(model.state_dict())
        if episode % save_interval == 0 and episode > 0:
            os.makedirs(model_dir, exist_ok=True)
            torch.save(model.state_dict(), f"{model_dir}/dqn_model_{episode}.pth")

        total_rewards.append(total_reward)
        success_rates.append(success)
        pbar.set_postfix({"Reward": f"{total_reward:.2f}", "Success": success})

    # Save the final model and the training checkpoint.
    os.makedirs(model_dir, exist_ok=True)
    final_model_path = f"{model_dir}/dqn_model_final.pth"
    torch.save(model.state_dict(), final_model_path)
    print(f"최종 모델 저장: {final_model_path}")
    checkpoint = {
        'episode': episodes,
        'model_state_dict': model.state_dict(),
        'target_model_state_dict': target_model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epsilon': epsilon,
        'replay_buffer': replay_buffer.buffer
    }
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)
    torch.save(checkpoint, checkpoint_path)
    print(f"체크포인트 저장: {checkpoint_path}")

    # Training progress plots.
    plt.figure(figsize=(12, 8))
    plt.subplot(2, 1, 1)
    plt.plot(total_rewards, label="Total Reward")
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Training Progress - Total Reward per Episode')
    plt.grid(True)
    plt.legend()
    plt.subplot(2, 1, 2)
    plt.plot(success_rates, label="Success Rate", color='green')
    plt.xlabel('Episode')
    plt.ylabel('Success (1=Yes, 0=No)')
    plt.title('Training Progress - Goal Success Rate')
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return model

# Script entry point: run training with the module-level defaults and keep
# the returned model in ``trained_model``.
if __name__ == "__main__":
    trained_model = train_dqn(load_model_path=LOAD_MODEL_PATH)

using Cuda


Training Progress:   2%|▍                                | 75/5000 [03:04<3:22:12,  2.46s/it, Reward=592.18, Success=0]


KeyboardInterrupt: 