# Instalando dependências

In [None]:
from IPython.display import clear_output
import sys

# Flag for the setup cells: True when the `google.colab` module is already loaded in this interpreter.
IN_COLAB = "google.colab" in sys.modules

In [None]:
if IN_COLAB:
    !git clone https://github.com/LucaLemos/UFRPE_AprendizagemReforco
    sys.path.append("/content/UFRPE_AprendizagemReforco")

    clear_output()
else:
    from os import path
    sys.path.append( path.dirname( path.dirname( path.abspath("__main__") ) ) )

In [None]:
if IN_COLAB:
    # System and Python packages that are only installed when running on Colab.
    # ffmpeg: for saving videos
    !apt-get install ffmpeg
    # TODO (from the trailing note below, "conferir se precisa"): check whether this pin is needed.
    !pip install gymnasium==1.0.0   # conferir se precisa
    #!pip install tianshou # To create the Replay_Buffer
    #!pip install d3rlpy==2.7.0
    # NOTE(review): the original trailing comment said "clone repository", but nothing
    # is cloned in this cell — the `git clone` runs in the previous cell.

# Criando Dataset

In [None]:
import gymnasium as gym
import torch
from util.algorithms import run_sarsa
from util.network import ReplayBuffer
from IPython.display import clear_output

In [None]:
DATASET_SIZE = 200_000  # Size of the dataset (replay buffer)
LEARNING_RATE = 1e-3  # Learning rate for the optimizer
GAMMA = 0.99  # Discount factor
BATCH_SIZE = 128  # Batch size for neural-network training

In [None]:
# Step 1: collect a fixed set of transitions (replay buffer) for every environment
ENV_NAMES = ["FrozenLake-v1", "Taxi-v3", "CliffWalking-v0"]
ENVS_REPLAY_BUFFER = []

# The device does not depend on the environment, so it is selected once.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

for env_name in ENV_NAMES:
    replay_buffer = ReplayBuffer(DATASET_SIZE, BATCH_SIZE, device)
    env = gym.make(env_name, render_mode="rgb_array")

    # run_sarsa receives the buffer and returns the per-episode reward sums and `q`.
    sum_rewards_per_ep, q = run_sarsa(env, replay_buffer, DATASET_SIZE, LEARNING_RATE, GAMMA)
    ENVS_REPLAY_BUFFER.append((env_name, env, replay_buffer, sum_rewards_per_ep, q))

    # Forward slashes are valid path separators on Windows, Linux and macOS alike.
    # The previous backslash path relied on invalid escape sequences (\d, \s, \{)
    # and only pointed into the config/dataset/sarsa directory on Windows.
    replay_buffer.save_config(f"config/dataset/sarsa/{env_name}.json")


# Treinando o Modelo

In [None]:
import gymnasium as gym
from util.network import ReplayBuffer


In [None]:
# Environments iterated over when the saved datasets are loaded.
ENV_NAMES = [
    "FrozenLake-v1",
    "Taxi-v3",
    "CliffWalking-v0",
]

In [None]:
# Rebuild one (env_name, env, replay_buffer) entry per environment from the saved dataset files.
ENVS_REPLAY_BUFFER = []
for env_name in ENV_NAMES:
    # Forward slashes are valid path separators on Windows, Linux and macOS alike.
    # The previous backslash path relied on invalid escape sequences (\d, \s, \{)
    # and only pointed into the config/dataset/sarsa directory on Windows.
    replay_buffer = ReplayBuffer.load_config(f"config/dataset/sarsa/{env_name}.json")
    env = gym.make(env_name)
    ENVS_REPLAY_BUFFER.append((env_name, env, replay_buffer))
    

## Treino

In [None]:
# Values forwarded to get_config() to build the training configuration.
ENV_NAMES = ["FrozenLake-v1", "Taxi-v3", "CliffWalking-v0"]
COUNT_EPISODES = 500
BUFFER_SIZE = 200_000
SEED = 777
MIN_EPS = 1e-2
# NOTE(review): this is a float, while the matching --eps_frames option is declared with type=int.
EPS_FRAMES = 1e4
LOG_VIDEO = 0
SAVE_EVERY = 100

# Hyperparameters
LEARNING_RATE = 1e-3  # Learning rate for the optimizer
GAMMA = 0.99  # Discount factor
TAU = 1e-3      # Update rate of the target network
ALPHA = 0.5      # Weight of the CQL term
# Extra:
TARGET_UPDATE_FREQ = 200  # Update the target network every 200 iterations (the original comment said "every 2" — confirm the intended value)


In [None]:
import argparse

In [None]:
def get_config(env_name, count_episodes, buffer_size, seed, min_eps, eps_frames, log_video, save_every, gamma, tau, alpha, target_update_freq, lr):
    """Build the training configuration as an ``argparse.Namespace``.

    Each argument becomes the default of one command-line option. Options
    found in ``sys.argv`` override those defaults; unrecognised options are
    ignored because ``parse_known_args`` is used.

    Args:
        env_name: Gym environment id; also used to derive the default run name.
        count_episodes: Default for ``--episodes``.
        buffer_size: Default for ``--buffer_size``.
        seed: Default for ``--seed``.
        min_eps: Default for ``--min_eps``.
        eps_frames: Default for ``--eps_frames``; coerced to ``int``.
        log_video: Default for ``--log_video`` (0 or 1).
        save_every: Default for ``--save_every``.
        gamma: Default for ``--gamma``.
        tau: Default for ``--tau``.
        alpha: Default for ``--alpha``.
        target_update_freq: Default for ``--target_update_freq``; coerced to ``int``.
        lr: Default for ``--lr``.

    Returns:
        argparse.Namespace with attributes ``run_name``, ``env``, ``episodes``,
        ``buffer_size``, ``seed``, ``min_eps``, ``eps_frames``, ``log_video``,
        ``save_every``, ``gamma``, ``tau``, ``alpha``, ``target_update_freq``
        and ``lr``.
    """
    parser = argparse.ArgumentParser(description='RL')

    # "%(default)s" lets argparse print the real default instead of a hand-written (stale) one.
    parser.add_argument("--run_name", type=str, default=f"{env_name}-DQN-CQL", help="Run name (default: %(default)s)")
    parser.add_argument("--env", type=str, default=env_name, help="Gym environment name (default: %(default)s)")
    parser.add_argument("--episodes", type=int, default=count_episodes, help="Number of episodes (default: %(default)s)")
    parser.add_argument("--buffer_size", type=int, default=buffer_size, help="Maximal training dataset size (default: %(default)s)")
    parser.add_argument("--seed", type=int, default=seed, help="Random seed (default: %(default)s)")
    parser.add_argument("--min_eps", type=float, default=min_eps, help="Minimal epsilon (default: %(default)s)")
    # argparse applies `type` only to string defaults, so a float such as 1e4 is converted here
    # to keep the attribute an int no matter where the value comes from.
    parser.add_argument("--eps_frames", type=int, default=int(eps_frames), help="Number of steps for annealing epsilon down to the minimal epsilon (default: %(default)s)")
    parser.add_argument("--log_video", type=int, default=log_video, help="Record videos of the agent when set to 1 (default: %(default)s)")
    parser.add_argument("--save_every", type=int, default=save_every, help="Save the network every x episodes (default: %(default)s)")

    parser.add_argument("--gamma", type=float, default=gamma, help="Discount factor (default: %(default)s)")
    parser.add_argument("--tau", type=float, default=tau, help="Update rate of the target network (default: %(default)s)")
    parser.add_argument("--alpha", type=float, default=alpha, help="Weight of the CQL term (default: %(default)s)")
    # An update frequency is a count of iterations, hence int rather than float.
    parser.add_argument("--target_update_freq", type=int, default=int(target_update_freq), help="Number of iterations between target-network updates (default: %(default)s)")
    parser.add_argument("--lr", type=float, default=lr, help="Learning rate of the optimizer (default: %(default)s)")

    args, _ = parser.parse_known_args()
    return args

In [None]:
# Training configuration for the environment at index 1 of ENV_NAMES, built from the notebook-level constants.
config = get_config(
    env_name=ENV_NAMES[1],
    count_episodes=COUNT_EPISODES,
    buffer_size=BUFFER_SIZE,
    seed=SEED,
    min_eps=MIN_EPS,
    eps_frames=EPS_FRAMES,
    log_video=LOG_VIDEO,
    save_every=SAVE_EVERY,
    gamma=GAMMA,
    tau=TAU,
    alpha=ALPHA,
    target_update_freq=TARGET_UPDATE_FREQ,
    lr=LEARNING_RATE,
)

In [None]:
import wandb
from collections import deque
from util.network import CQLAgent, save, to_one_hot
import numpy as np
import random
import torch

In [None]:
def train_DQN_CQL(config, buffer):
    """Train a CQLAgent on ``config.env`` and log the run to Weights & Biases.

    For every environment step the agent picks an epsilon-greedy action, the
    transition is appended to ``buffer``, and the agent learns from one batch
    sampled from that same buffer. Epsilon is annealed linearly from 1 to
    ``config.min_eps`` over ``config.eps_frames`` steps. The network is saved
    once, after the last episode.

    Args:
        config: Namespace produced by ``get_config``. Fields read here: ``seed``,
            ``env``, ``min_eps``, ``run_name``, ``tau``, ``gamma``, ``lr``,
            ``log_video``, ``episodes`` and ``eps_frames``. ``alpha``,
            ``target_update_freq``, ``buffer_size`` and ``save_every`` are not
            used by this function.
        buffer: Replay buffer providing ``add``, ``sample`` and ``__len__``. It is
            mutated: every transition seen during training is added to it.

    Returns:
        None.
    """
    np.random.seed(config.seed)
    random.seed(config.seed)
    torch.manual_seed(config.seed)

    # Frames are only rendered when a video is requested (RecordVideo needs them).
    make_kwargs = {"render_mode": "rgb_array"} if config.log_video else {}
    env = gym.make(config.env, **make_kwargs)
    env.action_space.seed(config.seed)
    if config.log_video:
        # gymnasium has no `wrappers.Monitor` (old gym API); RecordVideo is the replacement.
        # As before, every 10th episode is recorded into ./video.
        env = gym.wrappers.RecordVideo(env, video_folder="./video", episode_trigger=lambda episode: episode % 10 == 0)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    eps = 1.
    d_eps = 1 - config.min_eps
    steps = 0
    average10 = deque(maxlen=10)

    try:
        with wandb.init(project="CQL", name=config.run_name, config=config):

            agent = CQLAgent(env.observation_space.n, env.action_space.n, config.tau, config.gamma, config.lr, device=device)

            wandb.watch(agent.network, log="gradients", log_freq=10)

            for i in range(1, config.episodes+1):
                # Seeding only the first reset seeds the environment's RNG for the whole run;
                # later resets continue from that generator.
                state, _ = env.reset(seed=config.seed if i == 1 else None)
                rewards = 0
                while True:
                    action = agent.get_action(state, eps)
                    # get_action returns a sequence; the environment expects its first element.
                    next_state, reward, terminated, trunc, _ = env.step(action[0])
                    done = terminated or trunc
                    buffer.add(state, action, reward, next_state, done)

                    loss, cql_loss, bellmann_error = agent.learn(buffer.sample())
                    state = next_state
                    rewards += reward
                    # Linear epsilon decay, clamped at the configured minimum.
                    eps = max(1 - ((steps*d_eps)/config.eps_frames), config.min_eps)
                    steps += 1
                    if done:
                        break

                average10.append(rewards)
                print("Episode: {} | Reward: {} | Q Loss: {} | Steps: {}".format(i, rewards, loss, steps,))

                # "Steps" used to appear twice in this dict; only the last entry (the global
                # step counter) ever took effect, so that is the one kept.
                wandb.log({"Reward": rewards,
                           "Average10": np.mean(average10),
                           "Q Loss": loss,
                           "CQL Loss": cql_loss,
                           "Bellmann error": bellmann_error,
                           "Steps": steps,
                           "Epsilon": eps,
                           "Episode": i,
                           "Buffer size": len(buffer)})

            # Equals the last loop index when at least one episode ran, and stays defined when none did.
            save(config, save_name="CQL-DQN", model=agent.network, wandb=wandb, ep=config.episodes)
    finally:
        # Release the environment (and finalise any video recording) even if training fails.
        env.close()

In [None]:
# Train with the replay buffer stored at position 2 of entry 1 in ENVS_REPLAY_BUFFER
# (index 1 is also the ENV_NAMES index used to build `config`).
selected_buffer = ENVS_REPLAY_BUFFER[1][2]
train_DQN_CQL(config, selected_buffer)

In [None]:
def to_one_hot(states, state_size):
    """One-hot encode discrete state indices.

    NOTE(review): a function with this name is also imported from
    ``util.network`` earlier in the notebook; running this cell shadows it.

    Args:
        states: State indices. A 1-D tensor of shape ``(B,)`` behaves exactly as
            before; a single Python int, a list of ints, or a column tensor of
            shape ``(B, 1)`` is accepted as well. Any input is flattened to one
            index per output row.
        state_size: Number of distinct states, i.e. the width of the encoding.

    Returns:
        Float tensor of shape ``(number_of_indices, state_size)`` on the same
        device as ``states``, with a single 1 per row.
    """
    # as_tensor leaves tensors untouched and wraps ints/lists; reshape(-1, 1) gives
    # scatter_ the (rows, 1) index layout it needs, whatever the input rank was.
    indices = torch.as_tensor(states).long().reshape(-1, 1)
    one_hot_states = torch.zeros((indices.shape[0], state_size), device=indices.device)
    one_hot_states.scatter_(1, indices, 1)
    return one_hot_states

In [None]:
# Scratch check on the first entry of ENVS_REPLAY_BUFFER: sample a batch, one-hot
# encode the next states and feed them through a freshly created agent's target network.
states, actions, rewards, next_states, dones = ENVS_REPLAY_BUFFER[0][2].sample()
state = states[0]

print(next_states)
env = ENVS_REPLAY_BUFFER[0][1]
print(env.observation_space)

next_states = to_one_hot(next_states, env.observation_space.n)
print(next_states)
agent = CQLAgent(env.observation_space.n, env.action_space.n, config.tau, config.gamma, config.lr, hidden_size=128)
agent.target_net(next_states)

# Encode a single freshly reset state the same way, as a batch of one.
state, _ = env.reset()
print(state)
state_tensor = torch.tensor([state], dtype=torch.long)  # Build a tensor with batch_size=1
# to_one_hot returns a torch tensor already; torch.from_numpy only accepts NumPy
# arrays and raised TypeError here, so the tensor is used directly.
state = to_one_hot(state_tensor, env.observation_space.n)[0].float().unsqueeze(0).to(agent.device)



In [None]:
def extract_policy(q_network, num_states):
    """Return the greedy action for every discrete state.

    The previous version passed a plain Python int to ``to_one_hot``, which
    reads ``.shape`` from its argument. The encodings are now built here
    directly and evaluated in a single batched forward pass.

    Args:
        q_network: Callable mapping a ``(num_states, num_states)`` float tensor
            of one-hot states to Q-values with one row per state (for example
            a ``torch.nn.Module``).
        num_states: Number of discrete states; states are ``0 .. num_states - 1``.

    Returns:
        List of ``num_states`` ints; entry ``s`` is the index of the largest
        Q-value for state ``s``.
    """
    if num_states <= 0:
        return []

    # Run on the network's own device when it exposes parameters, otherwise on the CPU.
    parameters = getattr(q_network, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Row s of the identity matrix is exactly the one-hot encoding of state s.
    one_hot_states = torch.eye(num_states, device=device)

    # Pure inference: no gradients are needed to read off the arg-max actions.
    with torch.no_grad():
        q_values = q_network(one_hot_states)

    return torch.argmax(q_values.reshape(num_states, -1), dim=1).tolist()

In [None]:
# Four independent empty lists, presumably filled with per-run metrics later on
# (the code that appends to them is not visible in this section).
losses, success_rates, q_values_means, rewards_variances = [], [], [], []