In [3]:
import numpy as np
import torch
import torch.nn as nn
import random

In [15]:
# Import required packages and modules
import torch
import torch.nn as nn
import random

# SmallUAVEnv class definition
class SmallUAVEnv:
    """Minimal 2-D grid environment in which an agent walks from [0, 0] to a goal.

    Attributes:
        position: Current ``[a, b]`` coordinates of the agent (mutated by ``step``).
        goal: Target coordinates; reaching them ends the episode.
        actions: Coordinate deltas indexed by action id.

    Note:
        Coordinates are not clamped, so the agent can move to negative or
        arbitrarily large positions.
    """

    def __init__(self):
        self.position = [0, 0]
        self.goal = [3, 3]  # Smaller grid
        self.actions = [(0, 1), (1, 0), (0, -1), (-1, 0)]  # Up, Right, Down, Left

    def reset(self):
        """Move the agent back to ``[0, 0]`` and return the start state.

        Returns:
            A fresh list holding the start coordinates. It is a copy, so later
            calls to ``step`` cannot change a state the caller has stored.
        """
        self.position = [0, 0]
        return list(self.position)

    def step(self, action):
        """Apply one action and report the outcome.

        Args:
            action: Index into ``self.actions``.

        Returns:
            ``(state, reward, done)`` where ``state`` is a copy of the new
            position, ``reward`` is ``1`` on reaching the goal and ``-1``
            otherwise, and ``done`` is ``True`` only when the goal is reached.
        """
        # Update position
        dx, dy = self.actions[action]
        self.position[0] += dx
        self.position[1] += dy

        # Check if goal is reached
        done = self.position == self.goal
        reward = 1 if done else -1

        # Return a copy: handing out the internal list would let the next
        # in-place update overwrite the caller's previously observed state.
        return list(self.position), reward, done

# Create a SmallUAVEnv instance
small_env = SmallUAVEnv()

# SimpleDQN class definition
class SimpleDQN(nn.Module):
    """Two-layer fully connected network: ``input_dim`` -> 64 -> ``num_actions``.

    The output holds one value per action for the given input state.
    """

    def __init__(self, input_dim, num_actions):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, num_actions)

    def forward(self, x):
        """Return the per-action values for state tensor ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

# Training function train_dqn_quick_v2, modified to report per-episode rewards
def train_dqn_quick_v2_with_rewards(env, model, episodes=1000, gamma=0.99, epsilon=0.1, lr=0.0001, max_steps=50):
    """Train ``model`` on ``env`` with online one-step Q-learning.

    Each environment step triggers one Adam update on the squared TD error
    of the action taken. No replay buffer or target network is used.

    Args:
        env: Object with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``. States must be
            convertible by ``torch.tensor(..., dtype=torch.float32)``.
        model: Module mapping a state tensor to a 1-D tensor of action values.
        episodes: Number of episodes to run.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability of choosing a uniformly random action.
        lr: Learning rate for the Adam optimizer.
        max_steps: Maximum number of steps per episode.

    Returns:
        List with the summed reward of every episode, in order.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    # To store rewards for each episode
    episode_rewards = []

    for episode in range(episodes):
        state = env.reset()
        done = False
        step_count = 0
        total_reward = 0  # Initialize total reward for this episode
        while not done and step_count < max_steps:
            # Convert the state BEFORE stepping: an environment may mutate and
            # return the same list object, which would otherwise make the
            # "current" Q-value be evaluated on the post-step state.
            state_tensor = torch.tensor(state, dtype=torch.float32)
            q_values = model(state_tensor)

            if random.uniform(0, 1) < epsilon:
                # Action count comes from the model output, not a hard-coded 4.
                action = random.randrange(q_values.shape[-1])
            else:
                action = torch.argmax(q_values).item()

            next_state, reward, done = env.step(action)
            total_reward += reward  # Accumulate reward

            # The TD target is a constant for the update, so build it without
            # tracking gradients.
            with torch.no_grad():
                target = torch.tensor(float(reward), dtype=torch.float32)
                if not done:
                    # Bootstrap only from non-terminal states; a terminal
                    # state has no future return.
                    next_tensor = torch.tensor(next_state, dtype=torch.float32)
                    target = target + gamma * torch.max(model(next_tensor))

            loss = criterion(q_values[action], target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            state = next_state
            step_count += 1

        # Append total reward for this episode to the list
        episode_rewards.append(total_reward)
        print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")

    return episode_rewards

# Create the model and train it (with rewards printing)
simple_model_with_rewards = SimpleDQN(input_dim=2, num_actions=4)
rewards = train_dqn_quick_v2_with_rewards(small_env, simple_model_with_rewards, episodes=100, lr=0.05, max_steps=20)

# Bare expression: presumably left as the last line so the notebook cell displays the reward list
rewards



Episode 1/100, Total Reward: -20
Episode 2/100, Total Reward: -20
Episode 3/100, Total Reward: -20
Episode 4/100, Total Reward: -20
Episode 5/100, Total Reward: -20
Episode 6/100, Total Reward: -20
Episode 7/100, Total Reward: -20
Episode 8/100, Total Reward: -20
Episode 9/100, Total Reward: -20
Episode 10/100, Total Reward: -20
Episode 11/100, Total Reward: -20
Episode 12/100, Total Reward: -20
Episode 13/100, Total Reward: -14
Episode 14/100, Total Reward: -20
Episode 15/100, Total Reward: -20
Episode 16/100, Total Reward: -20
Episode 17/100, Total Reward: -20
Episode 18/100, Total Reward: -20
Episode 19/100, Total Reward: -20
Episode 20/100, Total Reward: -20
Episode 21/100, Total Reward: -20
Episode 22/100, Total Reward: -20
Episode 23/100, Total Reward: -20
Episode 24/100, Total Reward: -20
Episode 25/100, Total Reward: -20
Episode 26/100, Total Reward: -20
Episode 27/100, Total Reward: -20
Episode 28/100, Total Reward: -20
Episode 29/100, Total Reward: -20
Episode 30/100, Total R

[-20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -14,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -4,
 -20,
 -20,
 -20,
 -20,
 -20,
 -20,
 -6,
 -20,
 -20,
 -20,
 -20]