In [14]:
import copy
import random

from base64 import b64encode
from collections import deque, namedtuple

import gym
import numpy as np
import torch
import torch.nn.functional as F

from IPython.display import HTML

from torch import Tensor, nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import IterableDataset
from torch.optim import AdamW

from pytorch_lightning import LightningModule, Trainer

from gym.wrappers import RecordVideo, RecordEpisodeStatistics

In [15]:
def display_video(episode=0, video_folder='/content/videos'):
  """Embed a recorded episode video in the notebook output.

  Args:
    episode: Index used in the file name ``rl-video-episode-{episode}.mp4``.
    video_folder: Directory that holds the recorded videos. The default
      keeps the original hard-coded location; pass the folder given to the
      video recorder when it differs.

  Returns:
    An ``IPython.display.HTML`` object with the video inlined as a base64
    ``data:`` URL.

  Raises:
    OSError: If the video file cannot be opened for reading.
  """
  video_path = f'{video_folder}/rl-video-episode-{episode}.mp4'
  # Read-only binary mode is enough here, and the context manager makes sure
  # the file handle is closed even if reading fails.
  with open(video_path, "rb") as video_file:
    encoded_video = b64encode(video_file.read()).decode()
  video_url = f"data:video/mp4;base64,{encoded_video}"
  return HTML(f"<video width=600 controls><source src='{video_url}'></video>")

In [None]:
# Use the first CUDA device when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(device)

- Create the Deep Q-Network (DQN)

In [17]:
class DQN(nn.Module):
    """Fully connected network that outputs one value per action.

    Args:
        n_actions: Width of the output layer.
        hidden_size: Width of each of the two hidden layers.
        obs_size: Width of the input layer.
    """

    def __init__(self, n_actions, hidden_size, obs_size):
        super().__init__()
        layers = [
            nn.Linear(obs_size, hidden_size),     # input -> first hidden layer
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),  # first -> second hidden layer
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),    # second hidden layer -> output
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Cast ``x`` to float32 and return the network output for it."""
        inputs = x.float()
        return self.net(inputs)

- Create a Policy

In [18]:
## Create a Policy
# Epsilon-greedy is a widely used technique in Reinforcement Learning (RL) for
# balancing exploration and exploitation while an agent is being trained.
def epsilon_greedy(state, env, net, epsilon=0.0):
    """Choose an action with the epsilon-greedy rule.

    Args:
        state: Current observation; anything ``np.asarray`` can convert.
        env: Environment exposing ``action_space.sample()``.
        net: Module mapping a batch of states to one value per action.
        epsilon: Probability of taking a random action instead of the
            action with the highest predicted value.

    Returns:
        The sampled action when exploring, otherwise the index of the
        highest value predicted by ``net`` as a Python ``int``.
    """
    if np.random.random() < epsilon:
        return env.action_space.sample()  # explore: random action

    # Run the state on whatever device the network currently lives on, so the
    # call works regardless of where the network has been moved.
    first_param = next(net.parameters(), None)
    net_device = first_param.device if first_param is not None else torch.device("cpu")

    # Convert through a single ndarray (fast path) and add the batch dimension.
    state_batch = torch.as_tensor(np.asarray(state), device=net_device).unsqueeze(0)
    q_values = net(state_batch)
    _, best_action = torch.max(q_values, dim=1)  # exploit: highest-valued action
    return int(best_action.item())

- Create the replay buffer

In [19]:
# Create the replay buffer
# The main reason for using a replay buffer in Reinforcement Learning (RL) algorithms,
# especially Deep Q-Learning (DQN), is to break the correlation between training samples,
# which leads to more stable and efficient learning.
# It also lets experiences be reused, reducing the need to interact continuously
# with the environment.

class ReplayBuffer:
    """Fixed-capacity store of experiences with uniform random sampling.

    Relies on the standard-library ``random`` module being imported at the
    top of the notebook.
    """

    def __init__(self, capacity):
        # A bounded deque drops the oldest experience once `capacity` is reached.
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def append(self, experience):
        """Store one experience, evicting the oldest one when the buffer is full."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                experiences (raised by ``random.sample``).
        """
        return random.sample(self.buffer, batch_size)

In [20]:
class RLDataset(IterableDataset):
    """Iterable dataset that streams a fresh random sample from a buffer.

    Args:
        buffer: Object exposing ``sample(n)`` that returns an iterable of
            experiences.
        sample_size: Number of experiences requested from the buffer on
            each pass over the dataset.
    """

    def __init__(self, buffer, sample_size=200):
        self.buffer = buffer
        self.sample_size = sample_size

    def __iter__(self):
        # Every new iteration draws a new sample from the buffer.
        drawn = self.buffer.sample(self.sample_size)
        yield from drawn

In [21]:
# ##Create Enviroment

# def create_enviroment(name):
#     env = gym.make(name,  render_mode="rgb_array")
#     env = RecordVideo(env, video_folder='./videos')
#     return env


# env = create_enviroment("LunarLander-v2")

# # Reset the environment to start
# state = env.reset()
# # Run for 1000 timesteps
# for _ in range(1000):
#       # Render the environment
#     action = env.action_space.sample()  # Take a random action
#     # print("Action taken:", action)

#     # Do this action in the environment and get
#     # next_state, reward, done and info
#     _, observation, reward, done, info = env.step(action)
#     # print('Observation Space: ', observation)
#     # print('Reward: ', reward)

#     # If the episode is done (CartPole has fallen), reset the environment
#     if done:
#         state = env.reset()

# env.close()  # Close the rendering window




In [30]:
import gym

def create_enviroment(name):
    """Build a Gym environment wrapped for video recording and episode statistics.

    Args:
        name: Environment id passed to ``gym.make``.

    Returns:
        The environment wrapped first in ``RecordVideo`` (writing to
        ``./videos``, one recording every 50th episode) and then in
        ``RecordEpisodeStatistics``.
    """
    env = gym.make(name, render_mode="rgb_array")
    # `==` is a comparison; the trigger must return a boolean for each episode index.
    env = RecordVideo(env, video_folder='./videos', episode_trigger=lambda x: x % 50 == 0)
    # Rebind `env` so the statistics wrapper is actually part of the returned stack.
    env = RecordEpisodeStatistics(env)
    return env

# Smoke test: build the wrapped environment and step it with random actions
# until the episode is reported as finished (or 10,000 steps have been taken).
env = create_enviroment("LunarLander-v2")
env.reset()

for _ in range(10_000):
    action = env.action_space.sample()
    print(action)

    observation, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        # Stop as soon as the environment flags the episode as over.
        break

env.close()

  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


0
0
3
3
2
2
0
0
2
1
1
2
3
0
1
0
3
0
2
1
0
2
2
0
3
1
3
1
2
0
3
2
2
0
1
0
2
0
2
3
3
2
3
1
2
0
2
3
2
1
3
0
2
3
0
3
0
3
0
1
0
0
2
3
0
2
0
2
3
0
3
0
3
3
3
1
2
1
MoviePy - Building video c:\Users\igor8\Desktop\project_reinforcemen_learning\RL-LunarLander-v2\videos\rl-video-episode-0.mp4.
MoviePy - Writing video c:\Users\igor8\Desktop\project_reinforcemen_learning\RL-LunarLander-v2\videos\rl-video-episode-0.mp4



                                                                       

MoviePy - Done !
MoviePy - video ready c:\Users\igor8\Desktop\project_reinforcemen_learning\RL-LunarLander-v2\videos\rl-video-episode-0.mp4




In [None]:
class DeepQLearning(LightningModule):
    """Deep Q-Learning agent organised as a PyTorch Lightning module.

    The constructor builds the environment, an online Q-network, a deep copy
    of it used as target network, and a replay buffer that is pre-filled by
    playing episodes until it holds ``samples_per_epoch`` experiences.

    Args:
        env_name: Environment id forwarded to ``create_enviroment``.
        policy: Callable ``policy(state, env, net, epsilon=...)`` returning an
            action; when falsy, actions are sampled from the action space.
        capacity: Maximum number of experiences kept in the replay buffer.
        batch_size: Batch size of the training dataloader.
        lr: Learning rate passed to the optimizer.
        hidden_size: Hidden layer width of the Q-network.
        gamma, loss_fn, eps_start, eps_end, eps_last_episode, sync_rate:
            Stored as hyperparameters; apart from ``eps_start`` (used while
            pre-filling the buffer) they are not yet consumed by this class.
        optim: Optimizer class instantiated in ``configure_optimizers``.
        samples_per_epoch: Experiences drawn from the buffer per epoch, and
            the minimum buffer fill level reached before training starts.

    Raises:
        ValueError: If ``capacity`` is smaller than ``samples_per_epoch``,
            because the buffer could then never reach the required fill level.
    """

    # Initialize
    def __init__(self, env_name, policy=epsilon_greedy, capacity=100_000, batch_size=256,
                 lr=1e-3, hidden_size=128, gamma=0.99, loss_fn=F.smooth_l1_loss, optim=AdamW,
                 eps_start=1.0, eps_end=0.15, eps_last_episode=100, samples_per_epoch=10_000, sync_rate=10
                 ):
        super().__init__()

        # The buffer is bounded by `capacity`; without this check the warm-up
        # loop below would spin forever.
        if capacity < samples_per_epoch:
            raise ValueError(
                f"capacity ({capacity}) must be >= samples_per_epoch ({samples_per_epoch})"
            )

        self.env = create_enviroment(env_name)

        obs_size = self.env.observation_space.shape[0]
        n_actions = self.env.action_space.n

        # Keyword arguments keep the call aligned with DQN's
        # (n_actions, hidden_size, obs_size) parameter order.
        self.q_net = DQN(n_actions=n_actions, hidden_size=hidden_size, obs_size=obs_size)
        self.target_q_net = copy.deepcopy(self.q_net)

        self.policy = policy
        self.buffer = ReplayBuffer(capacity=capacity)

        self.save_hyperparameters()

        # Warm up the replay buffer before any training batch is requested.
        while len(self.buffer) < self.hparams.samples_per_epoch:
            print(f"{len(self.buffer)} Sample in experience buffer. Filling...")
            self.play_episode(epsilon=self.hparams.eps_start)

    @torch.no_grad()
    def play_episode(self, epsilon=0.):
        """Play one full episode and append every transition to the buffer.

        Args:
            epsilon: Exploration rate forwarded to the policy.
        """
        # reset() returns (observation, info) in the Gym API this notebook uses.
        state, _ = self.env.reset()
        done = False

        while not done:
            if self.policy:
                action = self.policy(state, self.env, self.q_net, epsilon=epsilon)
            else:
                action = self.env.action_space.sample()

            # step() returns five values; the episode is over when either flag is set.
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            done = bool(terminated or truncated)

            exp = (state, action, reward, done, next_state)
            self.buffer.append(exp)
            state = next_state

    # Forward
    def forward(self, x):
        """Return the online Q-network output for ``x``."""
        return self.q_net(x)

    # Configure optimizers
    def configure_optimizers(self):
        """Create the optimizer for the online Q-network's parameters."""
        q_net_optimizer = self.hparams.optim(self.q_net.parameters(), lr=self.hparams.lr)
        return [q_net_optimizer]

    # Create dataloader
    def train_dataloader(self):
        """Return a dataloader that batches samples drawn from the replay buffer."""
        dataset = RLDataset(self.buffer, self.hparams.samples_per_epoch)
        dataloader = DataLoader(
            dataset=dataset,
            batch_size=self.hparams.batch_size
        )
        return dataloader

    # Training step
    def training_step(self, batch, batch_idx):
        """Unpack a batch of transitions and add a trailing dimension to each scalar field.

        NOTE(review): the loss computation is not implemented yet, so this
        method currently returns ``None``.
        """
        states, actions, rewards, dones, next_states = batch
        # unsqueeze() requires an explicit dim; dim=1 presumably turns (batch,)
        # into (batch, 1) - TODO confirm against the collated batch shapes.
        actions = actions.unsqueeze(1)
        rewards = rewards.unsqueeze(1)
        dones = dones.unsqueeze(1)
        # TODO: compute the loss from these tensors and return it.

    # Training epoch end


    

 