In [1]:
import torch
import torch.optim as optim
from src.Environment import Environment
from src.DQN import DQN
import torch.distributions as dist
from torch.distributions import Categorical
import torch as th
import torch.nn as nn
import numpy as np
import random
from collections import deque

In [2]:
# Function to train the DQN
def train_dqn(num_episodes=2000, batch_size=64, gamma=0.99, epsilon_start=1.0, epsilon_end=0.0001, epsilon_decay=0.995, lr=1e-4, memory_size=10000):
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    env = Environment()
    policy_net = DQN().to(device)
    target_net = DQN().to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    memory = deque(maxlen=memory_size)
    epsilon = epsilon_start

    def select_action(state, epsilon):
        if random.random() < epsilon:
            return th.randint(0,4,(1,))[0]
        else:
            with th.no_grad():
                state = th.tensor(state, dtype=th.float32).unsqueeze(0).to(device)
                return policy_net(state).argmax(dim=1).item()

    def optimize_model():
        if len(memory) < batch_size:
            return
        
        batch = random.sample(memory, batch_size)
        state_batch = th.tensor(np.array([b[0] for b in batch]), dtype=th.float32).to(device)
        action_batch = th.tensor(np.array([b[1] for b in batch]), dtype=th.long).unsqueeze(1).to(device)
        reward_batch = th.tensor(np.array([b[2] for b in batch]), dtype=th.float32).to(device)
        next_state_batch = th.tensor(np.array([b[3] for b in batch]), dtype=th.float32).to(device)
        done_batch = th.tensor(np.array([b[4] for b in batch]), dtype=th.float32).to(device)

        current_q_values = policy_net(state_batch).gather(1, action_batch)
        next_q_values = target_net(next_state_batch).max(1)[0].detach()
        expected_q_values = reward_batch + (gamma * next_q_values * (1 - done_batch))

        loss = nn.MSELoss()(current_q_values, expected_q_values.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    for episode in range(num_episodes):
        state = env.reset(45,135)
        total_reward = 0

        for t in range(1000):  # Limit the number of steps per episode
            action = select_action(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            memory.append((state, action, reward, next_state, done))
            state = next_state
            total_reward += reward

            optimize_model()
            
            if done:
                break

        epsilon = max(epsilon_end, epsilon_decay * epsilon)
        target_net.load_state_dict(policy_net.state_dict())

        print(f"Episode {episode + 1}, Reward: {total_reward}")

    return policy_net

In [3]:
policy_net = train_dqn()

Episode 1, Reward: 962.5
Episode 2, Reward: 1467.5
Episode 3, Reward: 1222.5
Episode 4, Reward: 1362.5
Episode 5, Reward: 1770.0
Episode 6, Reward: 922.5
Episode 7, Reward: 1532.5
Episode 8, Reward: 1685.0
Episode 9, Reward: 1827.5
Episode 10, Reward: 1662.5
Episode 11, Reward: 1545.0
Episode 12, Reward: 1425.0
Episode 13, Reward: 2107.5
Episode 14, Reward: 2255.0
Episode 15, Reward: 1917.5
Episode 16, Reward: 1900.0
Episode 17, Reward: 2595.0
Episode 18, Reward: 1295.0
Episode 19, Reward: 852.5
Episode 20, Reward: 1710.0
Episode 21, Reward: 2292.5
Episode 22, Reward: 1920.0
Episode 23, Reward: 1782.5
Episode 24, Reward: 2152.5
Episode 25, Reward: 1387.5
Episode 26, Reward: 1422.5
Episode 27, Reward: 2392.5
Episode 28, Reward: 2022.5
Episode 29, Reward: 1370.0
Episode 30, Reward: 1482.5
Episode 31, Reward: 3002.5
Episode 32, Reward: 197.5
Episode 33, Reward: 1412.5
Episode 34, Reward: 1930.0
Episode 35, Reward: 2027.5
Episode 36, Reward: 1165.0
Episode 37, Reward: 1492.5
Episode 38, Re

In [4]:
env = Environment()
frames = []
device = th.device("cuda" if th.cuda.is_available() else "cpu")
seq_actions = []
state = env.reset(45, 135)
state = th.tensor(state).float().to(device)
is_done = False
total_reward = 0
i = 0

while not is_done and i < 900:
    actions = policy_net(state.unsqueeze(0))
    minimum = abs(actions.min())
    actions += minimum
    action = actions.argmax()
    
    new_state, reward, is_done, is_landed = env.step(action)
    # print(reward)
    total_reward += reward * 0.99 ** i

    i += 1
    state = th.tensor(new_state).float().to(device)
    # print(state[3:5])

    frames.append(env.render(array=True))
    seq_actions.append(action)

print(total_reward, i)

4953.404789032786 302


In [5]:
import os
import imageio
# os.remove("prova.gif")
imageio.mimsave("prova.gif", frames, fps=30)

In [6]:
seq_actions

[tensor(1),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(3),
 tensor(3),
 tensor(3),
 tensor(3),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 ten