In [1]:
import gymnasium as gym              # cria ambiente de RL
import torch                         # base do PyTorch
import torch.nn as nn                # para definir camadas de rede neural
import torch.optim as optim          # otimizadores (Adam, SGD, etc.)

In [None]:
# Build the Gymnasium environment registered under the id "ALE/SpaceInvaders-v5".
# NOTE(review): presumably this requires the Atari (ale-py) extras to be installed — confirm.
env = gym.make("ALE/SpaceInvaders-v5")

# Neural-network architecture: a single fully connected layer.
class Network(nn.Module):
    """Minimal model that applies one linear transformation to its input."""

    def __init__(self, dim_inputs, dim_outputs):
        """Create the layer mapping ``dim_inputs`` features to ``dim_outputs`` features."""
        super().__init__()
        self.linear = nn.Linear(in_features=dim_inputs, out_features=dim_outputs)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        output = self.linear(x)
        return output
    
# Instantiate the network.
# The layer sizes are derived from the environment so this cell does not rely
# on `dim_inputs` / `dim_outputs` being defined somewhere outside the notebook.
# NOTE(review): observations must be flattened to `dim_inputs` values before
# they are passed to the network — confirm against the action-selection code.
dim_inputs = gym.spaces.flatdim(env.observation_space)  # length of a flattened observation
dim_outputs = int(env.action_space.n)  # assumes a Discrete action space — TODO confirm

network = Network(dim_inputs, dim_outputs)

# Instantiate the optimizer over the network's parameters.
optimizer = optim.Adam(network.parameters(), lr=0.0001)

In [None]:
# Online training loop: one gradient step per environment transition.
# NOTE(review): `select_action` and `calculate_loss` must already be defined
# when this cell runs; `select_action` is not defined in the visible notebook
# and `calculate_loss` is defined in a later cell.
for episode in range(1000):
    state, info = env.reset()
    done = False
    while not done:
        # Select the action from the policy network. Passing `action` here
        # would read the variable before its first assignment (NameError).
        action = select_action(network, state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode is over when the environment reports termination or truncation.
        done = terminated or truncated
        loss = calculate_loss(network, state, action, next_state, reward, done)
        # Standard PyTorch update: clear old gradients, backpropagate, step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        state = next_state

In [None]:
# Q-network implementation.
class QNetwork(nn.Module):
    """Fully connected network mapping a state to one output value per action.

    Two hidden layers of 64 units with ReLU activations are followed by a
    linear output layer of size ``action_size``.
    """

    def __init__(self, state_size, action_size):
        """Build the three linear layers.

        Args:
            state_size: Number of features in a state vector.
            action_size: Number of output values (one per action).
        """
        super().__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, state):
        """Return the network output for ``state``.

        Args:
            state: A tensor, NumPy array or (nested) sequence of numbers whose
                last dimension has ``state_size`` entries.

        Returns:
            Tensor with ``action_size`` entries in its last dimension.
        """
        # as_tensor (unlike torch.tensor) does not copy/detach an existing
        # tensor, so gradients still flow to tensor inputs and no copy warning
        # is raised. Casting to the weight dtype lets integer or float64
        # inputs pass through the linear layers.
        state = torch.as_tensor(state, dtype=self.fc1.weight.dtype)
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Bind the instance to its own name: assigning it to `QNetwork` would shadow
# the class, so re-running this cell or building a second network would fail.
# NOTE(review): 8 state features / 4 actions are hard-coded; confirm they match
# the environment actually being trained on.
q_network = QNetwork(8, 4)
# Rebinds `optimizer` so that it updates the Q-network's parameters.
optimizer = optim.Adam(q_network.parameters(), lr=0.0001)

In [None]:
# The barebone DQN loss function.
def calculate_loss(q_network, state, action, next_state, reward, done, gamma=0.99):
    """Return the squared temporal-difference error for a single transition.

    Args:
        q_network: Callable mapping a state to a 1-D tensor with one value per action.
        state: State in which ``action`` was taken.
        action: Index of the action taken; used to index the network output.
        next_state: State observed after taking ``action``.
        reward: Scalar reward received for the transition.
        done: Truthy when the episode ended at ``next_state``; the bootstrap
            term is then multiplied by zero.
        gamma: Discount factor applied to the next state's best value.

    Returns:
        Scalar tensor: MSE between ``Q(state)[action]`` and
        ``reward + gamma * max Q(next_state) * (1 - done)``.
    """
    q_values = q_network(state)
    current_state_q_value = q_values[action]
    # The bootstrapped target is treated as a constant: without no_grad the
    # optimizer would also push the next-state estimate toward the current one.
    with torch.no_grad():
        next_state_q_value = q_network(next_state).max()
        target_q_value = reward + gamma * next_state_q_value * (1 - done)
    # nn.MSELoss is a module class; calling it with the two tensors builds a
    # loss object instead of computing a loss. Use the functional form.
    return nn.functional.mse_loss(current_state_q_value, target_q_value)

In [None]:
from collections import deque

# Create a bounded deque holding 1..4; it keeps at most 7 items.
buffer = deque(range(1, 5), maxlen=7)

# Extend on the right with 5..8; once full, the oldest items drop off the left.
buffer.extend(range(5, 9))

import random

class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random sampling.

    Transitions are kept in a ``deque`` with ``maxlen=capacity``, so pushing
    onto a full buffer discards the oldest transition.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.memory = deque([], maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.memory.append((state, action, reward, next_state, done))

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

    @staticmethod
    def _stack_observations(observations):
        """Stack per-transition observations into one float32 tensor."""
        # Converting each observation first avoids torch.tensor() being handed
        # a sequence of arrays/tensors, which is slow for ndarrays and fails
        # outright for multi-element tensors.
        return torch.stack(
            [torch.as_tensor(obs, dtype=torch.float32) for obs in observations]
        )

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(states, rewards, next_states, dones, actions)`` of tensors.
            All are float32 except ``actions``, which is int64 with a trailing
            dimension of size 1.

        Raises:
            ValueError: Raised by ``random.sample`` when ``batch_size`` exceeds
                the number of stored transitions.
        """
        batch = random.sample(self.memory, batch_size)
        # Transpose the list of transitions into per-field tuples.
        states, actions, rewards, next_states, dones = zip(*batch)
        states_tensor = self._stack_observations(states)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32)
        next_states_tensor = self._stack_observations(next_states)
        dones_tensor = torch.tensor(dones, dtype=torch.float32)
        # Column vector of action indices.
        actions_tensor = torch.tensor(actions, dtype=torch.long).unsqueeze(1)
        return states_tensor, rewards_tensor, next_states_tensor, dones_tensor, actions_tensor