Héctor Asorey de Pablos

# APRENDIZAJE AUTOMÁTICO II - TRABAJO 2

Para esta práctica se ha optado por utilizar aprendizaje por refuerzo para completar el nivel 1-1 del juego Super Mario Bros para la Nintendo Entertainment System, lanzado en 1985.

Para ello, se va a utilizar el entorno de Gym "gym_super_mario_bros", y se van a entrenar modelos DQNAgent, Double DQNAgent y PPO.

<hr>

### Importar las librerías necesarias

In [None]:
import os
import gym
import numpy as np
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.agents.dqn import DQNAgent
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

<hr>

### Crear el entorno del juego

In [None]:
# Create the level 1-1 environment and wrap it so the agent only sees the
# RIGHT_ONLY action list.
env = JoypadSpace(gym.make('SuperMarioBros-1-1-v0'), RIGHT_ONLY)

<hr>

## Modelos DQNAgent o DDQNAgent

<hr>

### Definir parámetros del entorno para los modelos

In [None]:
# One discrete action per entry of the RIGHT_ONLY action list.
n_actions = len(RIGHT_ONLY)

# Unpack the three dimensions of the observation space's shape.
(height, width, n_channels) = env.observation_space.shape

<hr>

### Construir modelo CNN

In [None]:
def build_cnn_model(window_length=5):
    """Build the convolutional Q-network used by the DQN agent.

    Args:
        window_length: Number of stacked observations in each input sample.
            It has to match the ``window_length`` of the replay memory that
            feeds the agent. Defaults to 5, the value that used to be
            hard-coded in the input shape.

    Returns:
        An uncompiled ``Sequential`` model that takes inputs of shape
        ``(window_length, height, width, n_channels)`` and ends in a linear
        layer with ``n_actions`` outputs.
    """
    return Sequential([
        Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
               input_shape=(window_length, height, width, n_channels)),
        Conv2D(64, (4, 4), strides=(2, 2), activation='relu'),
        Conv2D(64, (3, 3), activation='relu'),
        Flatten(),
        Dense(512, activation='relu'),
        # Linear head: one output per available action.
        Dense(n_actions, activation='linear'),
    ])


model = build_cnn_model()

<hr>

### Definir la memoria y la política de exploración del modelo

In [None]:
# Replay buffer limited to one million entries, with a window of 5
# observations per state.
memory = SequentialMemory(limit=1_000_000, window_length=5)

# Epsilon-greedy policy whose `eps` attribute is annealed linearly from 1.0
# down to 0.1 over 37000 steps; 0.05 is the value used for testing.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=37000,
)

<hr>

### Construir el modelo DQNAgent o DDQNAgent

In [None]:
# Agent wired to the CNN, the replay memory and the exploration policy.
dqn = DQNAgent(
    model=model,
    nb_actions=n_actions,
    memory=memory,
    policy=policy,
    nb_steps_warmup=4000,  # alternative value noted by the author: 50000
    target_model_update=2000,  # alternative value noted by the author: 10000
    # Remove the line below if Double DQN is not wanted.
    enable_double_dqn=True,
)

dqn.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=1e-4))

<hr>

### Entrenamiento del modelo

In [None]:
# Train the agent for 33000 steps without rendering the game.
dqn.fit(
    env,
    nb_steps=33000,
    visualize=False,
    verbose=1,
)

<hr>

### Forma alternativa de entrenar el modelo

Como alternativa, se puede entrenar el modelo usando un callback que muestre la recompensa (reward) que obtiene el modelo en cada paso.

<hr>

### Definición del callback

In [None]:
class PrintRewardCallback(tf.keras.callbacks.Callback):
    """Callback that prints the reward reported at the end of each step."""

    def on_step_end(self, step, logs=None):
        """Print the reward of the finished step when the logs carry one.

        Args:
            step: Index of the step that just ended.
            logs: Optional mapping of values for this step; only the
                ``'reward'`` key is read. ``None`` is treated as empty.
        """
        # A `None` default avoids sharing one mutable dict between calls.
        if logs and 'reward' in logs:
            print(f"Step {step}: reward = {logs['reward']}")

<hr>

### Entrenamiento del modelo

In [None]:
# Train for 42000 steps with the reward-printing callback attached.
dqn.fit(
    env,
    nb_steps=42000,
    visualize=False,
    verbose=2,
    callbacks=[PrintRewardCallback()],
)

<hr>

### Guardar el modelo

In [None]:
# Write the trained network to an HDF5 file.
model.save(filepath='mario_33_dqn.h5')

<hr>

### Probar el desempeño del modelo en el juego

In [None]:
from tensorflow.keras.models import load_model

modelToTest = load_model('mario_dqn_2000.h5')

# keras-rl feeds the network batches shaped (batch, window_length, ...), so the
# frame-stack size is read from the loaded model instead of being hard-coded.
# It used to be 3 here, while the network built in this notebook expects 5.
test_window_length = modelToTest.input_shape[1]

# Define the memory buffer and the exploration policy
memory = SequentialMemory(limit=1000000, window_length=test_window_length)
policy = EpsGreedyQPolicy()

# Create the DQN agent around the loaded network
dqn = DQNAgent(model=modelToTest,
               nb_actions=n_actions,
               memory=memory,
               nb_steps_warmup=0,
               target_model_update=1000,
               policy=policy,
               enable_double_dqn=True)

# Compile the DQN agent
dqn.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=1e-4))

# Evaluate the agent for 10 rendered episodes
dqn.test(env, nb_episodes=10, visualize=True)

<hr>

### Reentrenar el modelo donde lo dejó en caso de ser necesario

In [None]:
from tensorflow.keras.models import load_model

# Reload the network from the file written by the save cell of this notebook.
model = load_model('mario_33_dqn.h5')

# Fresh replay buffer (window of 5 observations) and a plain epsilon-greedy
# policy.
memory = SequentialMemory(window_length=5, limit=1_000_000)
policy = EpsGreedyQPolicy()

# Rebuild the agent around the reloaded network: no warm-up steps and
# `target_model_update` set to 1000.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=n_actions,
    nb_steps_warmup=0,
    target_model_update=1000,
)
dqn.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=1e-4))

# NOTE(review): this cell only evaluates the reloaded agent (10 rendered
# episodes) and never calls `fit`, although the section is titled as a
# retraining step -- confirm which of the two is intended.
dqn.test(env, nb_episodes=10, visualize=True)

<hr>

## Modelo PPO

<hr> 

### Importar librerías necesarias

In [None]:
from stable_baselines3 import PPO

<hr>

### Creación del modelo

In [None]:
# PPO agent with the 'CnnPolicy' policy on `env`; TensorBoard logs go to
# ./logs/.
model2 = PPO(
    'CnnPolicy',
    env,
    verbose=1,
    tensorboard_log='./logs/',
    learning_rate=1e-6,
    n_steps=512,
)

<hr>

### Creación de callbacks

Este callback guarda el modelo cada `check_freq` pasos en el directorio indicado por `save_path`.

In [None]:
from stable_baselines3.common.callbacks import BaseCallback

# Directory handed to the checkpoint callback as its save path.
CHECKPOINT_DIR = "./train/"
# Log directory name; not referenced by the rest of the visible code.
LOG_DIR = "./logs/"

class TrainAndLoggingCallback(BaseCallback):
    """Save a checkpoint of the model every ``check_freq`` callback calls.

    Args:
        check_freq: Number of ``_on_step`` calls between two checkpoints.
        save_path: Directory that receives the checkpoints, or ``None`` to
            disable saving.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory when a save path is configured."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Write a checkpoint on every ``check_freq``-th call.

        Returns:
            Always ``True``.
        """
        # `_init_callback` accepts `save_path=None`; honour that here as well
        # instead of failing inside `os.path.join`.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}_v4'.format(self.n_calls))
            self.model.save(model_path)

        return True
    
# Checkpoint callback: saves into CHECKPOINT_DIR every 100000 calls.
callback = TrainAndLoggingCallback(
    check_freq=100_000,
    save_path=CHECKPOINT_DIR,
)

<hr>

### Entrenamiento del modelo

In [None]:
# Train the PPO model for 200000 timesteps with the checkpoint callback.
model2.learn(
    total_timesteps=200_000,
    callback=callback,
)

<hr>

### Guardar el modelo

In [None]:
# Save the PPO model.
# NOTE(review): the file name says 20000 while the training cell of this
# notebook uses total_timesteps=200000 -- confirm the intended name.
model2.save(path='model_20000')

<hr>

### Cargar y probar el modelo entrenado en el juego

In [None]:
import stable_baselines3

In [None]:
 model3 = PPO.load('./train/best_model_100000_v4')

In [None]:
# Start the game
state = env.reset()

# Play until the cell is interrupted manually
while True:
    # Predict the action from a copy of the current state array
    action, _ = model3.predict(state.copy())

    # Pass the action as a plain integer index: `predict` returns a NumPy
    # value, which the JoypadSpace wrapper is not expected to accept as-is
    state, reward, done, info = env.step(int(action))

    env.render()

    # Start a new episode once the current one ends, rather than stepping
    # an environment that has already finished
    if done:
        state = env.reset()


<hr>

### Reentrenar el modelo PPO donde lo dejó en caso de ser necesario

In [None]:
# Recreate the level 1-1 environment restricted to the RIGHT_ONLY actions.
env = JoypadSpace(gym.make('SuperMarioBros-1-1-v0'), RIGHT_ONLY)

# Attach the new environment to the loaded PPO model and train it for another
# 200000 timesteps, reusing the checkpoint callback.
model3.set_env(env)
model3.learn(total_timesteps=200_000, callback=callback)