**Comentarios**

1. No se implementó ni NES ni GA.
2. Algo de mi implementación siento que no me permite graficar de buena forma los resultados. Ayuda por favor.

In [1]:
import gymnasium as gym
import math, random, matplotlib, warnings, torch, ale_py, stable_baselines3, collections
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import typing as tt

from torch.distributions import Categorical
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.env_util import make_vec_env

dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(dev)

cuda


In [2]:
donkey_kong = gym.make("ALE/DonkeyKong-v5")
breakout = gym.make("ALE/Breakout-v5")

print(f"Donkey Kong. Observation Space: {donkey_kong.observation_space.shape}. Action Space: {donkey_kong.action_space.n}")
print(f"Breakout. Obs Space {breakout.observation_space.shape}. Action Space: {breakout.action_space.n}")

Donkey Kong. Observation Space: (210, 160, 3). Action Space: 18
Breakout. Obs Space (210, 160, 3). Action Space: 4


## Graficar

In [3]:
def print_plot(reward_recs, title):
    avg_reward = []
    for _ in range(len(reward_recs)):
        avg = np.empty(shape=(1,), dtype =int)
        if _ < 20:
            avg = reward_recs[:_+1]
        else:
            avg = reward_recs[_-19:_+1]
        avg_reward.append(np.average(avg))
    plt.plot(reward_recs)
    plt.plot(avg_reward)
    plt.title(title)
    plt.ylabel("Recompensa")

## Clases para la implementación

Se tiene una implementación independiente para cada algoritmo en cuestión. Si bien son implementaciones bastante similares, cada clase presenta diferencias de implementación en función del algortimo para el que se estén planteando.

### Políticas (Redes neuronales)

In [4]:
# Política para PPO
class Conv_Policy(nn.Module):
    """
    Esta clase implementa todo lo necesario para que el agente interactúe en el ambiente.
    El entrenamiento de las redes correspondientes (self.actor, self.value) se realiza 
    durante la implementación misma del algoritmo.
    """
    def __init__(
        self,
        input_shape: tt.Tuple[int, ...],
        n_actions: int
    ):
        super(Conv_Policy, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 64, kernel_size = 3, stride = 4),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size = 1, stride = 2),
            nn.ReLU(),
            nn.Flatten(0)
        )
        
        with torch.no_grad():
            size = self.conv(torch.zeros(1, *input_shape)).flatten().size()[-1]
            
        self.actor = nn.Sequential(
            self.conv,
            nn.Linear(size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
            nn.Softmax(dim=-1)
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(input_shape[0], 64, kernel_size = 3, stride = 4),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size = 1, stride = 2),
            nn.ReLU(),
            nn.Flatten(0)
        )
        
        self.critic = nn.Sequential(
            self.conv2,
            nn.Linear(size, 256),
            nn.ReLU(),
            nn.Linear(256, 32),
            nn.ReLU(),
            nn.Linear(32,1)
        )

    def forward(self):
        raise NotImplementedError

    def act(self, state):
        """
        Nos permite encontrar una acción siguiendo nuestra política.
        """
        
        action_proba = self.actor(state)
        distribution = Categorical(action_proba)

        action = distribution.sample()
        action_log_prob = distribution.log_prob(action)
        state_value = self.critic(state) 

        return action.detach(), action_log_prob.detach(), state_value.detach()

    def evaluate(self, state, action):
        action_probs = self.actor(state)
        distribution = Categorical(action_probs)
        action_log_prob = distribution.log_prob(action)
        distribution_entropy = distribution.entropy()
        state_values = self.critic(state)

        return action_log_prob, state_values, distribution_entropy

In [5]:
# Política para DDQN
class Conv_PolicyDDQN(nn.Module):
    """
    Se modifica la clase de Conv_Policy. Ya no se tiene una red de actor y otra de crítico.
    También se simplifica el proceso de elegir una acción en función de la red. 
    """
    def __init__(
        self,
        input_shape: tt.Tuple[int, ...],
        n_actions: int
    ):
        super(Conv_PolicyDDQN, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 64, kernel_size = 3, stride = 4),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size = 1, stride = 2),
            nn.ReLU(),
            nn.Flatten(0)
        )
        
        with torch.no_grad():
            size = self.conv(torch.zeros(1, *input_shape)).flatten().size()[-1]
            
        self.Qnetwork = nn.Sequential(
            self.conv,
            nn.Linear(size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
            nn.Softmax(dim=-1) 
        )

    def forward(self):
        raise NotImplementedError

    def act(self, state):
        """
        Nos permite encontrar una acción siguiendo nuestra política.
        Como buscamos muestrear podemos no considerar state_value, ni log_prob.
        """
        action_proba = self.Qnetwork(state)
        distribution = Categorical(action_proba)

        action = distribution.sample()
        return action.detach()

### Buffers

In [6]:
class Buffers:
    """
    Esta clase nos permite guardar información sobre la ejecución de nuestro algoritmo.
    """
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.state_values = []
        self.is_terminals = []

    def clear(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.state_values[:]
        del self.is_terminals[:]


class Buffers_DDQN:
    """
    Esta clase nos permite guardar información sobre la ejecución de nuestro algoritmo.
    A diferencia de PPO, no necesitamos los valores del estado ni las logprobs.
    """
    def __init__(self):
        self.actions = []
        self.states = []
        self.rewards = []
        self.is_terminal = []

    def clear(self):
        del self.actions[:]
        del self.states[:]
        del self.rewards[:]
        del self.is_terminal[:]

    def sample(self):
        """
        Vamos a muestrear a partir del buffer cargado dentro de una interacción con el ambiente.
        """
        # Extracción aleatoria a partir de la muestra
        max_range = len(self.states)
        random_index = random.randint(0, max_range-2)
        action = self.actions[random_index] 
        state = self.states[random_index]
        next_state = self.states[random_index+1]
        reward = [self.rewards[random_index]]
        term = [self.is_terminal[random_index]]

        # Formateo de datos para procesamiento posterior.
        # Odio a pytorch
        if type(action) == int:
            action = torch.tensor(action, dtype = torch.float32)
        else:
            action = action.to(torch.float32).to(dev)
        state = torch.from_numpy(state).to(dev)
        state = state.to(torch.float32)

        next_state = torch.from_numpy(next_state).to(dev)
        next_state = next_state.to(torch.float32)

        reward = torch.Tensor(reward).to(dev)
        term = torch.Tensor(term).to(dev)
            
        return state, action, reward, next_state, term

### Algoritmos

**PPO**

In [7]:
class PPO_Self:
    """
    La clase PPO contiene la información relevante para la implementación del algoritmo mismo.
    El entrenamiento se hace mediante una función externa que trabaja sobre una instancia de esta clase.
    """
    def __init__(
        self,
        state_dim,
        action_dim,
        lr_actor,
        lr_critic,
        gamma,
        epochs,
        clip,
    ):
        self.gamma = gamma
        self.clip = clip
        self.epochs = epochs

        # Definimos políticas, 
        
        self.buffer = Buffers()
        self.policy = Conv_Policy(state_dim, action_dim).to(dev)
        self.optimizer = torch.optim.Adam([
                {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                {'params': self.policy.critic.parameters(), 'lr': lr_critic}
            ])
        self.policy_prev = Conv_Policy(state_dim, action_dim).to(dev)
        self.Mseloss = nn.MSELoss()

    def select_action(self, state):
        """
        Elección de acción en función de la política previa.
        """
        with torch.no_grad():
            state = torch.FloatTensor(state).to(dev)
            action, action_log_prob, state_val = self.policy_prev.act(state)
            
        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_log_prob)
        self.buffer.state_values.append(state_val)
        return action.item()

    def update(self):
        """
        Actualización de pesos, esta función se llama durante el entrenamiento.
        """
        rewards = []
        disc_reward = 0
        for reward, is_term in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_term:
                disc_reward = 0
            disc_reward = reward + (self.gamma * disc_reward)
            rewards.insert(0, disc_reward)

        rewards = torch.tensor(rewards, dtype = torch.float32).to(dev)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-6)

        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(dev)
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(dev)
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(dev)
        old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach().to(dev)

        advantage = rewards.detach() - old_state_values.detach()
        
        for _ in range(self.epochs):
            for state in old_states:
                logprobs, state_values, dist_entropy = self.policy.evaluate(state, old_actions)
                state_values = torch.squeeze(state_values)
                ratios = torch.exp(logprobs - old_logprobs.detach())

                # CLIP
                surrogate1 = ratios*advantage
                surrogate2 = torch.clamp(ratios, 1-self.clip, 1+self.clip) * advantage
    
                loss = -torch.min(surrogate1, surrogate2) + 0.5*self.Mseloss(state_values, rewards) - 0.01*dist_entropy

                # Actualización de pesos
                self.optimizer.zero_grad()
                loss.mean().backward()
                self.optimizer.step()

        self.policy_prev.load_state_dict(self.policy.state_dict())
        self.buffer.clear()

**DDQN**

In [8]:
class DDQN_Self:
    """
    Esta implementación se hace sobre la segunda versión de DDQN propuesta por Hasselt en el 2015.
    En vez de trabajar con dos políticas y modificarlas de manera aleatoria, se tiene una política base
    Y una política objetivo que se busca aproximar. 
    """
    def __init__(
        self,
        state_dim,
        action_dim,
        lr,
        gamma,
        tau,
        epochs
    ):
        self.gamma = gamma
        self.epochs = epochs
        self.tau = tau
        
        self.action_dim = action_dim #hmmmmmmm
        
        self.buffer = Buffers_DDQN()
        self.model = Conv_PolicyDDQN(state_dim, action_dim).to(dev)
        self.target = Conv_PolicyDDQN(state_dim, action_dim).to(dev)

        for target_net, net in zip(self.model.parameters(), self.target.parameters()):
            target_net.data.copy_(net)

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr = lr)

    def select_action(self, state):
        explore = 0.1
        with torch.no_grad():
            state = torch.FloatTensor(state).to(dev)
            action = self.model.act(state) 
            if np.random.rand() < explore:
                action = random.randrange(self.action_dim)
            return action

    def q_loss(self):
        state, action, reward, next_state, term = self.buffer.sample()
        
        Q_actual = self.model.act(state)
        Q_sig = self.target.act(next_state)
        Q_Esperado = reward + (1-term) * self.gamma * Q_sig

        q_loss = F.mse_loss(Q_actual, Q_Esperado.detach())
        return q_loss

    def update(self):
        """
        Actualización de los parámetros
        """
        loss = self.q_loss()

        self.optimizer.zero_grad()
        loss.requires_grad = True
        loss.backward()
        self.optimizer.step()

        # Actualización siguiendo Hasselt (2015)
        for target_net, net in zip(self.model.parameters(), self.target.parameters()):
            target_net.data.copy_(self.tau * net + (1 - self.tau) * target_net)

### Entrenamiento

In [9]:
def train_loopPPO(max_steps, PPOAgent, env):
    """
    Genera el proceso de entrenamiento.
    
    max_steps = int ; Pasos de entrenamiento totales.
    PPOAgent = PPO() ; Instancia de PPO por trabajar.
    env = gym.env() ; Entorno del gym a trabajar. Por construcción de esta implementación debe de ser uno de estados con representación visual.
    Esto es debido a que la red neuronal de nuestra política es convolucional y trabaja sobre los estados visuales del entorno.
    """
    # Hiperparámetros del entrenamiento
    max_ep_len = 1000
    update_timestep = 5
    print_timestep = 20
    episode = 0

    # Parámetros para imprimir información del entrenamiento
    print_running_reward = 0
    print_running_episodes = 0
    print_freq = 100

    best_train_reward = []
    best_time_steps = []

    reward_records =[] # Para graficar la recompensa
    
    # Loop de entrenamiento
    for episode_count in range(PPOAgent.epochs):
        
        time_step = 0
        epoch_rewards = []
        epoch_timesteps = []
        
        print(f"Iniciando época: {episode_count+1}")
        
        while time_step <= max_steps:
            
            state, obs = env.reset()
            current_ep_reward = 0
            
            for t in range(1, max_ep_len+1):
                # Loop de entrenamiento clásico de Gymnasium
                action = PPOAgent.select_action(state)
                state, reward, term, trunc, info = env.step(action)
    
                PPOAgent.buffer.rewards.append(reward)
                PPOAgent.buffer.is_terminals.append(term)
    
                time_step += 1
                current_ep_reward += reward
                
                if t % update_timestep == 0:
                    PPOAgent.update()
                    epoch_rewards.append(current_ep_reward)
                    epoch_timesteps.append(time_step)
    
                if t % print_freq == 0:
                    if print_running_episodes != 0:
                        print_avg_reward = print_running_reward / print_running_episodes
                        print_avg_reward = round(print_avg_reward, 2)
                        print("Episodio : {} \t\t Timestep : {} \t\t Avg Reward : {}".format(episode+1, time_step, print_avg_reward))
                        print_running_reward = 0
                        print_running_episodes = 0
    
                if term:
                    print("Finished at time step: {}".format(time_step))
                    break

    
            print_running_reward += current_ep_reward
            print_running_episodes += 1

        if len(best_train_reward) == 0:
            best_train_reward = epoch_rewards

        best_train = np.array(best_train_reward)
        epoch_r = np.array(epoch_rewards)
        print(f"Premios previos: {best_train.mean()}. Previos actuales: {epoch_r.mean()}")

        if best_train.mean() < epoch_r.mean():
            best_train_reward = epoch_rewards
            best_time_steps = epoch_timesteps

        
        reward_records.append(epoch_rewards) # Graficar

        episode += 1
        print("Fin episodio {}".format(episode))
        print("----------------------------")

    print("Acabamos")
    print(reward_records)
    print_plot(reward_records, "PPO")
    env.close()

In [10]:
def train_loopDDQN(update_steps, DDQNAgent, env):
    explore_steps = 5000 # Modificable
    save_timestep = 5
    print_timestep = 20
    episode = 0

    running_reward = 0
    running_eps = 0

    best_train_reward = []
    
    reward_records =[] # Para graficar la recompensa
    
    for episode_count in range(DDQNAgent.epochs):
        time_step = 0
        print(f"Época: {episode_count+1}")
        state, obs = env.reset()
        current_ep_reward = 0
        epoch_rewards = []
        # Primer for (Muestreo)
        for t in range(1, explore_steps+1):
            action = DDQNAgent.select_action(state)
            state, reward, term, trunc, info = env.step(action)
            DDQNAgent.buffer.actions.append(action)
            DDQNAgent.buffer.states.append(state)
            DDQNAgent.buffer.rewards.append(reward)
            DDQNAgent.buffer.is_terminal.append(term)
            current_ep_reward += reward

            if t % save_timestep == 0:
                epoch_rewards.append(current_ep_reward)

        
        reward_arr = np.array(epoch_rewards)
        print(f"Recompensa promedio: {reward_arr.mean()}")
        print("--------------------")

        # Segundo for (Update)
        for t in range(1, update_steps):
            DDQNAgent.update()

        # Se resetea el buffer para la siguiente época
        DDQNAgent.buffer.clear()
        reward_records.append(epoch_rewards)

    print("Acabamos")
    #print(reward_records)
    print_plot(reward_records, "DDQN")
    env.close()

### PlayLoop

In [11]:
def play_test(Agent, total_episodes, env):
    """
    Hace un loop de prueba de la política aprendida

    PPOAgent = PPO() ; Una instancia de PPO ENTRENADA
    total_episodes = int ; Cantidad de episodios que se busca probar.
    env = gym.env() ; Entorno del gymnasium (Claramente debe de ser el mismo en donde se entrenó PPOAgent())
    """
    max_ep_len = 3000
    test_reward = 0
    for ep in range(1, total_episodes + 1):
        ep_reward = 0
        obs, info = env.reset()
        for t in range(1, max_ep_len+1):
            action = Agent.select_action(obs)
            state, reward, term, trunc, _ = env.step(action)
            ep_reward += reward

            if term or trunc:
                break

        Agent.buffer.clear()
        test_reward += ep_reward
    env.close()
    print(f"Recompensa promedio: {round(test_reward/total_episodes, 3)}")

## Ejecuciones completas

In [None]:
%%time

# PPO DonkeyKong

epochs = 15
clip = 0.175 # El mejor valor empírico del clip termina siendo 0.15
gamma = 0.99
lr_actor = 0.0003
lr_critic = 0.0001

PPO_DK = PPO_Self(donkey_kong.observation_space.shape, donkey_kong.action_space.n, lr_actor, lr_critic, gamma, epochs, clip)
train_loopPPO(2000, PPO_DK, donkey_kong) # max_steps tiene que ser aprox 3000, menos de eso no aprende, más de eso decae y toma demasiado tiempo
play_test(PPO_DK, 10, donkey_kong)

Iniciando época: 1


  return F.mse_loss(input, target, reduction=self.reduction)


Finished at time step: 740
Episodio : 1 		 Timestep : 840 		 Avg Reward : 0.0
Finished at time step: 1479


  return F.mse_loss(input, target, reduction=self.reduction)


Episodio : 1 		 Timestep : 1579 		 Avg Reward : 0.0
Premios previos: 0.0. Previos actuales: 0.0
Fin episodio 1
----------------------------
Iniciando época: 2
Episodio : 2 		 Timestep : 100 		 Avg Reward : 0.0
Episodio : 2 		 Timestep : 1100 		 Avg Reward : 0.0
Episodio : 2 		 Timestep : 2100 		 Avg Reward : 0.0
Premios previos: 0.0. Previos actuales: 0.0
Fin episodio 2
----------------------------
Iniciando época: 3
Episodio : 3 		 Timestep : 100 		 Avg Reward : 0.0
Episodio : 3 		 Timestep : 1100 		 Avg Reward : 0.0
Episodio : 3 		 Timestep : 2100 		 Avg Reward : 0.0
Premios previos: 0.0. Previos actuales: 0.0
Fin episodio 3
----------------------------
Iniciando época: 4
Episodio : 4 		 Timestep : 100 		 Avg Reward : 0.0
Episodio : 4 		 Timestep : 1100 		 Avg Reward : 0.0
Episodio : 4 		 Timestep : 2100 		 Avg Reward : 0.0
Premios previos: 0.0. Previos actuales: 0.0
Fin episodio 4
----------------------------
Iniciando época: 5
Episodio : 5 		 Timestep : 100 		 Avg Reward : 0.0
Epis

In [None]:
%%time

# PPO Breakout

PPO_Breakout = PPO_Self(breakout.observation_space.shape, breakout.action_space.n, lr_actor, lr_critic, gamma, epochs, clip)
train_loopPPO(1500, PPO_Breakout, breakout) # 1000 max_steps hace que entrene en fa, de 2000 en adelante toma años.
play_test(PPO_Breakout, 10, breakout)

In [None]:
%%time

# DDQN DonkeyKong

lr = 0.0003
tau = 0.001
gamma = 0.99
epochs = 25 # Tienen que ser varias épocas para que jale chido

DDQN_DonkeyHijoDePerraKong = DDQN_Self(donkey_kong.observation_space.shape, donkey_kong.action_space.n, lr, gamma, tau, epochs)
train_loopDDQN(25, DDQN_DonkeyHijoDePerraKong, donkey_kong) 
play_test(DDQN_DonkeyHijoDePerraKong, 10, donkey_kong)

In [None]:
%%time

# DDQN Breakout

DDQN_Breakout = DDQN_Self(breakout.observation_space.shape, breakout.action_space.n, lr, gamma, tau, 50)
train_loopDDQN(30, DDQN_Breakout, breakout)
play_test(DDQN_Breakout, 10, breakout)

## Baselines

In [None]:
def StableBaselines3(algoritmo, env, policy, timesteps):
    model = algoritmo(policy, env)
    model.learn(total_timesteps = timesteps)
    print("Done")
    return model

def SB3_playloop(model, env, loop_iter):
    """
    model = PPO/DQN ; Modelo de Stabe-Baselines3 entrenado
    env = gym.make() ; Entorno del gym.
    loop_iter = int ;  Cantidad de veces que probaremos al modelo.
    """
    obs, info = env.reset()
    avg_reward = 0
    for _ in range(loop_iter):
        iterations_reward = 0
        done = False
        while not done:
            action = model.predict(obs)
            state, reward, term, trunc = env.step(action)
            iterations_reward += reward
            done = term or trunc
        avg_reward += iterations_reward
        
    print(f"Ganancia promedio después de {loop_iter} iteraciones: {avg_reward/loop_iter}")

In [None]:
%%time
ppo_dk = StableBaselines3(PPO, donkey_kong, "CnnPolicy", 200)
#dqn_dk = StableBaselines3(DQN, donkey_kong, "CnnPolicy", 200)

ppo_breakout = StableBaselines3(PPO, breakout, "CnnPolicy", 200)
#dqn_breakout = StableBaselines3(DQN, breakout, "CnnPolicy", 200)

In [None]:
SB3_playloop(ppo_dk, donkey_kong, 10)
SB3_playloop(ppo_breakout, breakout, 10)