# Monte-Carlo Off-Police

### Instalação de pacotes

In [2]:
from IPython.display import clear_output
!apt-get install ffmpeg freeglut3-dev xvfb  
!pip install gym[all]==00.25.1
!pip install gym[atari,accept-rom-license]==00.25.1
!pip install pyglet
!pip install stable-baselines3[extra]
!pip install optuna
clear_output()

In [3]:
!mkdir log_project

### Imports


In [4]:
import gym
import numpy as np
import tensorboard
import matplotlib.pyplot as plt
import time

%load_ext tensorboard

import gym
import numpy as np


### Para salvar vídeo

In [5]:
# Set up fake display; otherwise rendering will fail
import os
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ['DISPLAY'] = ':1'

A gravação é feita com o wrapper [VecVideoRecorder](https://stable-baselines.readthedocs.io/en/master/guide/vec_envs.html#vecvideorecorder).

In [7]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  :param env_id: (str)
  :param model: (RL model)
  :param video_length: (int)
  :param prefix: (str)
  :param video_folder: (str)
  """
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start the video at step=0 and record the given number of steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  obs = eval_env.reset()
  for _ in range(video_length):
    action, _ = model.predict(obs)
    obs, _, _, _ = eval_env.step(action)

  # Close the video recorder
  eval_env.close()

In [8]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """
  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Filter the video, showing only the only starting with this prefix
  """
  html = []
  for mp4 in Path(video_path).glob("{}*.mp4".format(prefix)):
      video_b64 = base64.b64encode(mp4.read_bytes())
      html.append('''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''.format(mp4, video_b64.decode('ascii')))
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))

# Plot Result

In [9]:
def test_greedy_Q_policy(env, Q, num_episodes=100, render=False, render_wait=0.01):
    """
    Avalia a política gulosa (greedy) definida implicitamente por uma Q-table.
    Ou seja, executa, em todo estado s, a ação "a = argmax Q(s,a)".
    - env: o ambiente
    - Q: a Q-table (tabela Q) que será usada
    - num_episodes: quantidade de episódios a serem executados
    - render: defina como True se deseja chamar `env.render()` a cada passo
    - render_wait: intervalo de tempo entre as chamadas a `env.render()`
    
    Retorna:
    - um par contendo o valor escalar do retorno médio por episódio e 
       a lista de retornos de todos os episódios
    """
    episode_returns = []
    total_steps = 0
    for i in range(num_episodes):
        print(f"Episode {i+1}")
        obs = env.reset()
        if render:
            env.render()
            time.sleep(render_wait)
        done = False
        episode_returns.append(0.0)
        while not done:
            action = np.argmax(Q[obs])
            obs, reward, done, _ = env.step(action)
            if render:
                env.render()
                time.sleep(render_wait)
            total_steps += 1
            episode_returns[-1] += reward
        print("- retorno:", episode_returns[-1])
    mean_return = round(np.mean(episode_returns), 1)
    print("Retorno médio (por episódio):", mean_return, end="")
    print(", episódios:", len(episode_returns), end="")
    print(", total de passos:", total_steps)
    return mean_return, episode_returns

In [10]:
def smooth(data, window):
  data = np.array(data)
  n = len(data)
  y = np.zeros(n)
  for i in range(n):
    start = max(0, i-window+1)
    y[i] = data[start:(i+1)].mean()
  return y

def plot_result(returns, ymax_suggested=None, window=100, filename=None):
    '''Exibe um gráfico "retornos x recompensas", fazendo a média a cada 100 retornos, para suavizar.     
    Se o parâmetro filename for fornecido, salva o gráfico em arquivo ao invés de exibir.
    
    Parâmetros:
    - returns: lista de retornos a cada episódio
    - ymax_suggested (opcional): valor máximo de retorno (eixo y), se tiver um valor máximo conhecido previamente
    - filename: indique um nome de arquivo, se quiser salvar a imagem do gráfico; senão, o gráfico será apenas exibido
    '''
    plt.figure(figsize=(14,8))
    smoothed_returns = smooth(returns, window)
    xvalues = np.arange(1, len(returns)+1)
    plt.plot(xvalues, smoothed_returns)
    plt.xlabel('Episódios')
    plt.ylabel('Retorno')
    if ymax_suggested is not None:
        ymax = np.max([ymax_suggested, np.max(smoothed_returns)])
        plt.ylim(top=ymax)
    plt.title(f"Retorno médio a cada {window} episódios")
    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
        print("Arquivo salvo:", filename)
    plt.close()


## Código

In [53]:
from numpy.random.mtrand import gamma

import optuna

#from wrappers import DiscreteObservationWrapper


ENV = gym.make("Taxi-v3")


# Esta função faz um treinamento com o Expected-SARSA, usando parâmetros sugeridos pelo Optuna.
# Retorna a média dos retornos dos últimos 100 episódios.
def train_values(trial : optuna.Trial):
    
    # chama os métodos do "trial" (tentativa) para sugerir valores para os parâmetros
    gamma = trial.suggest_uniform('gamma', 0.02, 1.0)
    eps = trial.suggest_uniform('epsilon', 0.01, 0.2)
    #bins1 = trial.suggest_int('bins1', 5, 100)
    #bins2 = trial.suggest_int('bins2', 5, 100)
    
    print(f"\nTRIAL #{trial.number}: eps={eps}, gamma={gamma}")

    # roda o algoritmo e recebe os retornos não-descontados
    #env_wrapper = DiscreteObservationWrapper(ENV, [bins1,bins2])
    (returns, _) = run_montecarloOffP(env, 20000, gamma, eps, render=False)
    return sum(returns[-100:])/100 

In [None]:
study = optuna.create_study(direction='maximize', 
                            storage='sqlite:///optuna_studies.db', 
                            study_name= 'new_MC_offpolice', 
                            load_if_exists=True)
study.optimize(train_values, n_trials=20) 

In [60]:
ENV_NAME = "Taxi-v3"  
#ENV_NAME = "MountainCarContinuous-v0"  
#ENV_NAME = "LunarLander-v2"  
env = gym.make(ENV_NAME)

### Off-policy

In [51]:
def choose_action(Q, state):
    return np.argmax(Q[state])

def choose_actionB(num_actions):
    return np.random.randint(0, num_actions)

# Algoritmo Monte-Carlo de Controle, variante "toda-visita".
# Atenção: os espaços de estados e de ações precisam ser discretos, dados por valores inteiros
def run_montecarloOffP(env, episodes, gamma=0.95, epsilon=0.1, render=False):
    assert isinstance(env.observation_space, gym.spaces.Discrete)
    assert isinstance(env.action_space, gym.spaces.Discrete)
    
    num_actions = env.action_space.n
    
    # inicializa a tabela Q toda com zero,
    # usar o estado como índice das linhas e a ação como índice das colunas
    Q = np.zeros(shape = (env.observation_space.n, num_actions))
    C = np.zeros(shape = (env.observation_space.n, num_actions))

    # para cada episódio, guarda sua soma de recompensas (retorno não-discontado)
    sum_rewards_per_ep = []

    # loop principal
    for i in range(episodes):
        done = False
        sum_rewards, reward = 0, 0
        ep_trajectory = []
        
        state = env.reset()
    
        # PARTE 1: executa um episódio completo
        while done != True:   
            # exibe/renderiza os passos no ambiente, durante 1 episódio a cada mil e também nos últimos 5 episódios 
            if render and (i >= (episodes - 5) or (i+1) % 1000 == 0):
                env.render()
                
            # escolhe a próxima ação -- usa epsilon-greedy
            action = choose_action(Q, state)
        
            # realiza a ação, ou seja, dá um passo no ambiente
            next_state, reward, done, _ = env.step(action)
            
            # adiciona a tripla que representa este passo
            ep_trajectory.append( (state, action, reward) )
            
            sum_rewards += reward
            state = next_state
        
        sum_rewards_per_ep.append(sum_rewards)

        # a cada 100 episódios, imprime informação sobre o progresso 
        if (i+1) % 100 == 0:
            avg_reward = np.mean(sum_rewards_per_ep[-100:])
            print(f"Episode {i+1} Average Reward (last 100): {avg_reward:.3f}")

        # PARTE 2: atualiza Q (e a política, implicitamente)
        Gt = 0
        W = 1
        for (s, a, r) in reversed(ep_trajectory):
            Gt = r + gamma*Gt
            C[s,a] = C[s,a] + W
            delta = W * (Gt - Q[s,a])
            Q[s,a] = Q[s,a] + (1/C[s,a])* delta
            best = choose_action(Q,s)
            if best == choose_actionB(num_actions): break
            W = W*(1/(1/num_actions))

    return sum_rewards_per_ep, Q


### On-Policy

In [54]:
# Esta é a política. Neste caso, escolhe uma ação com base nos valores
# da tabela Q, usando uma estratégia epsilon-greedy.
def pi_policy(Q, state, num_actions, epsilon):
    if np.random.random() < epsilon:
        return np.random.randint(0, num_actions)
    else:
        return np.argmax(Q[state])


# Algoritmo Monte-Carlo de Controle, variante "toda-visita".
# Atenção: os espaços de estados e de ações precisam ser discretos, dados por valores inteiros
def run_montecarloOnP(env, episodes, lr=0.1, gamma=0.95, epsilon=0.1, render=False):
    assert isinstance(env.observation_space, gym.spaces.Discrete)
    assert isinstance(env.action_space, gym.spaces.Discrete)
    
    num_actions = env.action_space.n
    
    # inicializa a tabela Q toda com zero,
    # usar o estado como índice das linhas e a ação como índice das colunas
    Q = np.zeros(shape = (env.observation_space.n, num_actions))

    # para cada episódio, guarda sua soma de recompensas (retorno não-discontado)
    sum_rewards_per_ep = []

    # loop principal
    for i in range(episodes):
        done = False
        sum_rewards, reward = 0, 0
        ep_trajectory = []
        
        state = env.reset()
    
        # PARTE 1: executa um episódio completo
        while done != True:   
            # exibe/renderiza os passos no ambiente, durante 1 episódio a cada mil e também nos últimos 5 episódios 
            if render and (i >= (episodes - 5) or (i+1) % 1000 == 0):
                env.render()
                
            # escolhe a próxima ação -- usa epsilon-greedy
            action = pi_policy(Q, state, num_actions, epsilon)
        
            # realiza a ação, ou seja, dá um passo no ambiente
            next_state, reward, done, _ = env.step(action)
            
            # adiciona a tripla que representa este passo
            ep_trajectory.append( (state, action, reward) )
            
            sum_rewards += reward
            state = next_state
        
        sum_rewards_per_ep.append(sum_rewards)

        # a cada 100 episódios, imprime informação sobre o progresso 
        if (i+1) % 100 == 0:
            avg_reward = np.mean(sum_rewards_per_ep[-100:])
            print(f"Episode {i+1} Average Reward (last 100): {avg_reward:.3f}")

        # PARTE 2: atualiza Q (e a política, implicitamente)
        Gt = 0
        for (s, a, r) in reversed(ep_trajectory):
            Gt = r + gamma*Gt
            delta = Gt - Q[s,a]
            Q[s,a] = Q[s,a] + lr * delta

    return sum_rewards_per_ep, Q

### Execução Off-Policy

In [42]:
if __name__ == "__main__":
    r_max_plot = 10

    EPISODES = 100000
    LR = 0.01
    GAMMA = 0.830525147061507
    EPSILON = 0.05919712699520377

    
    # Roda o algoritmo Monte-Carlo para o problema de controle (ou seja, para achar a política ótima)
    rewards, Qtable = run_montecarloOffP(env, EPISODES, GAMMA, EPSILON, render=False)
    print("Últimos resultados: media =", np.mean(rewards[-20:]), ", desvio padrao =", np.std(rewards[-20:]))

    # Mostra um gráfico de episódios x retornos (não descontados)
    # Se quiser salvar, passe o nome do arquivo no 3o parâmetro
    filename = f"results/montecarloOffP-{ENV_NAME.lower()[0:8]}-ep{EPISODES}.png"
    # plot_result(rewards, r_max_plot, None)

    # test_greedy_Q_policy(env, Qtable, 10, True)
    env.close()

Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -235.820
Episode 300 Average Reward (last 100): -235.550
Episode 400 Average Reward (last 100): -504.020
Episode 500 Average Reward (last 100): -539.570
Episode 600 Average Reward (last 100): -594.830
Episode 700 Average Reward (last 100): -756.560
Episode 800 Average Reward (last 100): -936.470
Episode 900 Average Reward (last 100): -1169.300
Episode 1000 Average Reward (last 100): -1097.570
Episode 1100 Average Reward (last 100): -971.660
Episode 1200 Average Reward (last 100): -1042.580
Episode 1300 Average Reward (last 100): -738.920
Episode 1400 Average Reward (last 100): -738.020
Episode 1500 Average Reward (last 100): -845.750
Episode 1600 Average Reward (last 100): -720.650
Episode 1700 Average Reward (last 100): -773.840
Episode 1800 Average Reward (last 100): -666.470
Episode 1900 Average Reward (last 100): -451.730
Episode 2000 Average Reward (last 100): -559.550
Episode 2100 Average Rewar

### Execução On-Policy

#### Otimiza Parâmetros

In [57]:
from numpy.random.mtrand import gamma

import optuna

#from wrappers import DiscreteObservationWrapper


ENV = gym.make("Taxi-v3")


# Esta função faz um treinamento com o Expected-SARSA, usando parâmetros sugeridos pelo Optuna.
# Retorna a média dos retornos dos últimos 100 episódios.
def train_values(trial : optuna.Trial):
    
    # chama os métodos do "trial" (tentativa) para sugerir valores para os parâmetros
    lr = trial.suggest_uniform('learning_rate', 0.001, 1.0)
    gamma = trial.suggest_uniform('gamma', 0.02, 1.0)
    eps = trial.suggest_uniform('epsilon', 0.01, 0.2)
    #bins1 = trial.suggest_int('bins1', 5, 100)
    #bins2 = trial.suggest_int('bins2', 5, 100)
    
    print(f"\nTRIAL #{trial.number}: eps={eps}, gamma={gamma}")

    # roda o algoritmo e recebe os retornos não-descontados
    #env_wrapper = DiscreteObservationWrapper(ENV, [bins1,bins2])
    (returns, _) = run_montecarloOnP(ENV, 20000, lr, gamma, eps, render=False)
    return sum(returns[-100:])/100 

In [58]:
study = optuna.create_study(direction='maximize', 
                            storage='sqlite:///optuna_studies.db', 
                            study_name= 'new_MC_onpolice', 
                            load_if_exists=True)
study.optimize(train_values, n_trials=20) 

[32m[I 2022-10-01 05:40:30,145][0m Using an existing study with name 'new_MC_onpolice' instead of creating a new one.[0m
  lr = trial.suggest_uniform('learning_rate', 0.001, 1.0)
  gamma = trial.suggest_uniform('gamma', 0.02, 1.0)
  eps = trial.suggest_uniform('epsilon', 0.01, 0.2)



TRIAL #1: eps=0.0944534808641455, gamma=0.14260763782298835
Episode 100 Average Reward (last 100): -486.920
Episode 200 Average Reward (last 100): -570.570
Episode 300 Average Reward (last 100): -444.350
Episode 400 Average Reward (last 100): -358.760
Episode 500 Average Reward (last 100): -301.610
Episode 600 Average Reward (last 100): -282.390
Episode 700 Average Reward (last 100): -265.430
Episode 800 Average Reward (last 100): -268.300
Episode 900 Average Reward (last 100): -251.750
Episode 1000 Average Reward (last 100): -271.670
Episode 1100 Average Reward (last 100): -257.330
Episode 1200 Average Reward (last 100): -270.470
Episode 1300 Average Reward (last 100): -258.230
Episode 1400 Average Reward (last 100): -254.540
Episode 1500 Average Reward (last 100): -279.830
Episode 1600 Average Reward (last 100): -273.890
Episode 1700 Average Reward (last 100): -257.640
Episode 1800 Average Reward (last 100): -251.610
Episode 1900 Average Reward (last 100): -256.200
Episode 2000 Aver

[32m[I 2022-10-01 05:41:01,593][0m Trial 1 finished with value: -255.65 and parameters: {'learning_rate': 0.900710732616076, 'gamma': 0.14260763782298835, 'epsilon': 0.0944534808641455}. Best is trial 1 with value: -255.65.[0m


Episode 20000 Average Reward (last 100): -255.650

TRIAL #2: eps=0.12202823895690514, gamma=0.9746717628447142
Episode 100 Average Reward (last 100): -495.200
Episode 200 Average Reward (last 100): -763.580
Episode 300 Average Reward (last 100): -721.100
Episode 400 Average Reward (last 100): -728.390
Episode 500 Average Reward (last 100): -705.390
Episode 600 Average Reward (last 100): -561.890
Episode 700 Average Reward (last 100): -647.030
Episode 800 Average Reward (last 100): -626.960
Episode 900 Average Reward (last 100): -673.850
Episode 1000 Average Reward (last 100): -600.680
Episode 1100 Average Reward (last 100): -771.320
Episode 1200 Average Reward (last 100): -668.270
Episode 1300 Average Reward (last 100): -604.910
Episode 1400 Average Reward (last 100): -633.620
Episode 1500 Average Reward (last 100): -649.370
Episode 1600 Average Reward (last 100): -550.670
Episode 1700 Average Reward (last 100): -540.200
Episode 1800 Average Reward (last 100): -567.130
Episode 1900 Ave

[32m[I 2022-10-01 05:41:30,422][0m Trial 2 finished with value: -567.68 and parameters: {'learning_rate': 0.8190269403881686, 'gamma': 0.9746717628447142, 'epsilon': 0.12202823895690514}. Best is trial 1 with value: -255.65.[0m



TRIAL #3: eps=0.1645264595438585, gamma=0.8319297384053119
Episode 100 Average Reward (last 100): -488.020
Episode 200 Average Reward (last 100): -928.100
Episode 300 Average Reward (last 100): -909.330
Episode 400 Average Reward (last 100): -789.670
Episode 500 Average Reward (last 100): -447.380
Episode 600 Average Reward (last 100): -490.050
Episode 700 Average Reward (last 100): -361.300
Episode 800 Average Reward (last 100): -361.370
Episode 900 Average Reward (last 100): -303.990
Episode 1000 Average Reward (last 100): -311.470
Episode 1100 Average Reward (last 100): -269.460
Episode 1200 Average Reward (last 100): -271.390
Episode 1300 Average Reward (last 100): -277.100
Episode 1400 Average Reward (last 100): -259.840
Episode 1500 Average Reward (last 100): -245.290
Episode 1600 Average Reward (last 100): -217.240
Episode 1700 Average Reward (last 100): -203.640
Episode 1800 Average Reward (last 100): -195.980
Episode 1900 Average Reward (last 100): -193.860
Episode 2000 Avera

[32m[I 2022-10-01 05:41:39,080][0m Trial 3 finished with value: -5.47 and parameters: {'learning_rate': 0.019082685103333705, 'gamma': 0.8319297384053119, 'epsilon': 0.1645264595438585}. Best is trial 3 with value: -5.47.[0m



TRIAL #4: eps=0.023196543828307038, gamma=0.9170193474911136
Episode 100 Average Reward (last 100): -295.040
Episode 200 Average Reward (last 100): -558.020
Episode 300 Average Reward (last 100): -1008.240
Episode 400 Average Reward (last 100): -1183.700
Episode 500 Average Reward (last 100): -984.980
Episode 600 Average Reward (last 100): -570.440
Episode 700 Average Reward (last 100): -471.620
Episode 800 Average Reward (last 100): -450.110
Episode 900 Average Reward (last 100): -320.150
Episode 1000 Average Reward (last 100): -298.190
Episode 1100 Average Reward (last 100): -267.500
Episode 1200 Average Reward (last 100): -223.670
Episode 1300 Average Reward (last 100): -222.320
Episode 1400 Average Reward (last 100): -232.220
Episode 1500 Average Reward (last 100): -254.540
Episode 1600 Average Reward (last 100): -214.490
Episode 1700 Average Reward (last 100): -213.860
Episode 1800 Average Reward (last 100): -213.590
Episode 1900 Average Reward (last 100): -231.320
Episode 2000 A

[32m[I 2022-10-01 05:42:09,305][0m Trial 4 finished with value: -212.33 and parameters: {'learning_rate': 0.2949031123870298, 'gamma': 0.9170193474911136, 'epsilon': 0.023196543828307038}. Best is trial 3 with value: -5.47.[0m


Episode 20000 Average Reward (last 100): -212.330

TRIAL #5: eps=0.06866578564897574, gamma=0.08742960211556078
Episode 100 Average Reward (last 100): -430.940
Episode 200 Average Reward (last 100): -665.030
Episode 300 Average Reward (last 100): -538.940
Episode 400 Average Reward (last 100): -390.620
Episode 500 Average Reward (last 100): -293.150
Episode 600 Average Reward (last 100): -248.780
Episode 700 Average Reward (last 100): -260.120
Episode 800 Average Reward (last 100): -268.400
Episode 900 Average Reward (last 100): -265.700
Episode 1000 Average Reward (last 100): -244.820
Episode 1100 Average Reward (last 100): -259.310
Episode 1200 Average Reward (last 100): -244.280
Episode 1300 Average Reward (last 100): -237.080
Episode 1400 Average Reward (last 100): -243.200
Episode 1500 Average Reward (last 100): -255.530
Episode 1600 Average Reward (last 100): -239.870
Episode 1700 Average Reward (last 100): -250.850
Episode 1800 Average Reward (last 100): -242.210
Episode 1900 Av

[32m[I 2022-10-01 05:42:40,237][0m Trial 5 finished with value: -236.9 and parameters: {'learning_rate': 0.9607534518456106, 'gamma': 0.08742960211556078, 'epsilon': 0.06866578564897574}. Best is trial 3 with value: -5.47.[0m


Episode 20000 Average Reward (last 100): -236.900

TRIAL #6: eps=0.057628292495876025, gamma=0.5155700847875877
Episode 100 Average Reward (last 100): -368.030
Episode 200 Average Reward (last 100): -708.500
Episode 300 Average Reward (last 100): -617.600
Episode 400 Average Reward (last 100): -390.800
Episode 500 Average Reward (last 100): -277.040
Episode 600 Average Reward (last 100): -236.090
Episode 700 Average Reward (last 100): -251.840
Episode 800 Average Reward (last 100): -242.300
Episode 900 Average Reward (last 100): -235.370
Episode 1000 Average Reward (last 100): -239.510
Episode 1100 Average Reward (last 100): -245.990
Episode 1200 Average Reward (last 100): -238.250
Episode 1300 Average Reward (last 100): -234.110
Episode 1400 Average Reward (last 100): -236.720
Episode 1500 Average Reward (last 100): -240.680
Episode 1600 Average Reward (last 100): -232.580
Episode 1700 Average Reward (last 100): -240.230
Episode 1800 Average Reward (last 100): -240.590
Episode 1900 Av

[32m[I 2022-10-01 05:43:09,910][0m Trial 6 finished with value: -224.8 and parameters: {'learning_rate': 0.4859808092770777, 'gamma': 0.5155700847875877, 'epsilon': 0.057628292495876025}. Best is trial 3 with value: -5.47.[0m


Episode 20000 Average Reward (last 100): -224.800

TRIAL #7: eps=0.05389014551915402, gamma=0.3391136375915553
Episode 100 Average Reward (last 100): -433.760
Episode 200 Average Reward (last 100): -734.780
Episode 300 Average Reward (last 100): -848.450
Episode 400 Average Reward (last 100): -698.690
Episode 500 Average Reward (last 100): -532.370
Episode 600 Average Reward (last 100): -584.300
Episode 700 Average Reward (last 100): -530.390
Episode 800 Average Reward (last 100): -346.250
Episode 900 Average Reward (last 100): -324.650
Episode 1000 Average Reward (last 100): -322.850
Episode 1100 Average Reward (last 100): -294.410
Episode 1200 Average Reward (last 100): -257.510
Episode 1300 Average Reward (last 100): -259.670
Episode 1400 Average Reward (last 100): -300.800
Episode 1500 Average Reward (last 100): -262.340
Episode 1600 Average Reward (last 100): -270.980
Episode 1700 Average Reward (last 100): -237.230
Episode 1800 Average Reward (last 100): -246.750
Episode 1900 Ave

[32m[I 2022-10-01 05:43:37,601][0m Trial 7 finished with value: -195.77 and parameters: {'learning_rate': 0.03013050995348002, 'gamma': 0.3391136375915553, 'epsilon': 0.05389014551915402}. Best is trial 3 with value: -5.47.[0m


Episode 20000 Average Reward (last 100): -195.770

TRIAL #8: eps=0.1571168067495486, gamma=0.16364895689752654
Episode 100 Average Reward (last 100): -434.620
Episode 200 Average Reward (last 100): -483.420
Episode 300 Average Reward (last 100): -369.650
Episode 400 Average Reward (last 100): -361.910
Episode 500 Average Reward (last 100): -312.860
Episode 600 Average Reward (last 100): -289.960
Episode 700 Average Reward (last 100): -288.470
Episode 800 Average Reward (last 100): -293.110
Episode 900 Average Reward (last 100): -293.910
Episode 1000 Average Reward (last 100): -283.310
Episode 1100 Average Reward (last 100): -296.210
Episode 1200 Average Reward (last 100): -290.100
Episode 1300 Average Reward (last 100): -292.070
Episode 1400 Average Reward (last 100): -286.800
Episode 1500 Average Reward (last 100): -290.030
Episode 1600 Average Reward (last 100): -285.210
Episode 1700 Average Reward (last 100): -289.730
Episode 1800 Average Reward (last 100): -285.760
Episode 1900 Ave

[32m[I 2022-10-01 05:44:08,801][0m Trial 8 finished with value: -286.74 and parameters: {'learning_rate': 0.7577433244921836, 'gamma': 0.16364895689752654, 'epsilon': 0.1571168067495486}. Best is trial 3 with value: -5.47.[0m



TRIAL #9: eps=0.04313238345767798, gamma=0.20118630438513252
Episode 100 Average Reward (last 100): -394.400
Episode 200 Average Reward (last 100): -777.260
Episode 300 Average Reward (last 100): -730.820
Episode 400 Average Reward (last 100): -466.310
Episode 500 Average Reward (last 100): -340.670
Episode 600 Average Reward (last 100): -331.400
Episode 700 Average Reward (last 100): -242.840
Episode 800 Average Reward (last 100): -251.300
Episode 900 Average Reward (last 100): -255.800
Episode 1000 Average Reward (last 100): -266.150
Episode 1100 Average Reward (last 100): -245.000
Episode 1200 Average Reward (last 100): -245.720
Episode 1300 Average Reward (last 100): -243.380
Episode 1400 Average Reward (last 100): -240.950
Episode 1500 Average Reward (last 100): -245.990
Episode 1600 Average Reward (last 100): -240.500
Episode 1700 Average Reward (last 100): -229.340
Episode 1800 Average Reward (last 100): -242.840
Episode 1900 Average Reward (last 100): -223.850
Episode 2000 Ave

[32m[I 2022-10-01 05:44:39,091][0m Trial 9 finished with value: -224.3 and parameters: {'learning_rate': 0.13876194778139583, 'gamma': 0.20118630438513252, 'epsilon': 0.04313238345767798}. Best is trial 3 with value: -5.47.[0m



TRIAL #10: eps=0.10992769148416247, gamma=0.7823562041237935
Episode 100 Average Reward (last 100): -514.550
Episode 200 Average Reward (last 100): -571.430
Episode 300 Average Reward (last 100): -613.640
Episode 400 Average Reward (last 100): -404.840
Episode 500 Average Reward (last 100): -321.140
Episode 600 Average Reward (last 100): -275.510
Episode 700 Average Reward (last 100): -289.460
Episode 800 Average Reward (last 100): -287.030
Episode 900 Average Reward (last 100): -278.660
Episode 1000 Average Reward (last 100): -264.890
Episode 1100 Average Reward (last 100): -264.170
Episode 1200 Average Reward (last 100): -269.570
Episode 1300 Average Reward (last 100): -265.380
Episode 1400 Average Reward (last 100): -262.540
Episode 1500 Average Reward (last 100): -273.190
Episode 1600 Average Reward (last 100): -263.000
Episode 1700 Average Reward (last 100): -268.130
Episode 1800 Average Reward (last 100): -265.700
Episode 1900 Average Reward (last 100): -264.620
Episode 2000 Ave

[32m[I 2022-10-01 05:45:04,181][0m Trial 10 finished with value: -195.64 and parameters: {'learning_rate': 0.3341277612575944, 'gamma': 0.7823562041237935, 'epsilon': 0.10992769148416247}. Best is trial 3 with value: -5.47.[0m


Episode 20000 Average Reward (last 100): -195.640

TRIAL #11: eps=0.19491280270154676, gamma=0.670467372440557
Episode 100 Average Reward (last 100): -477.200
Episode 200 Average Reward (last 100): -497.770
Episode 300 Average Reward (last 100): -356.690
Episode 400 Average Reward (last 100): -325.300
Episode 500 Average Reward (last 100): -316.140
Episode 600 Average Reward (last 100): -315.620
Episode 700 Average Reward (last 100): -320.060
Episode 800 Average Reward (last 100): -308.120
Episode 900 Average Reward (last 100): -320.830
Episode 1000 Average Reward (last 100): -312.620
Episode 1100 Average Reward (last 100): -308.920
Episode 1200 Average Reward (last 100): -304.790
Episode 1300 Average Reward (last 100): -308.530
Episode 1400 Average Reward (last 100): -297.160
Episode 1500 Average Reward (last 100): -297.970
Episode 1600 Average Reward (last 100): -307.330
Episode 1700 Average Reward (last 100): -296.620
Episode 1800 Average Reward (last 100): -303.870
Episode 1900 Ave

[32m[I 2022-10-01 05:45:33,779][0m Trial 11 finished with value: -276.67 and parameters: {'learning_rate': 0.6167622153987133, 'gamma': 0.670467372440557, 'epsilon': 0.19491280270154676}. Best is trial 3 with value: -5.47.[0m


Episode 19900 Average Reward (last 100): -299.940
Episode 20000 Average Reward (last 100): -276.670

TRIAL #12: eps=0.1475094009314478, gamma=0.7454728148336303
Episode 100 Average Reward (last 100): -400.290
Episode 200 Average Reward (last 100): -616.880
Episode 300 Average Reward (last 100): -422.030
Episode 400 Average Reward (last 100): -339.610
Episode 500 Average Reward (last 100): -339.300
Episode 600 Average Reward (last 100): -301.360
Episode 700 Average Reward (last 100): -295.940
Episode 800 Average Reward (last 100): -301.180
Episode 900 Average Reward (last 100): -290.990
Episode 1000 Average Reward (last 100): -294.440
Episode 1100 Average Reward (last 100): -282.600
Episode 1200 Average Reward (last 100): -281.270
Episode 1300 Average Reward (last 100): -284.050
Episode 1400 Average Reward (last 100): -282.350
Episode 1500 Average Reward (last 100): -278.290
Episode 1600 Average Reward (last 100): -271.450
Episode 1700 Average Reward (last 100): -279.990
Episode 1800 Av

[32m[I 2022-10-01 05:45:59,862][0m Trial 12 finished with value: -238.45 and parameters: {'learning_rate': 0.3107231987821249, 'gamma': 0.7454728148336303, 'epsilon': 0.1475094009314478}. Best is trial 3 with value: -5.47.[0m



TRIAL #13: eps=0.19521676032844495, gamma=0.7752193595482318
Episode 100 Average Reward (last 100): -470.050
Episode 200 Average Reward (last 100): -659.020
Episode 300 Average Reward (last 100): -427.120
Episode 400 Average Reward (last 100): -364.310
Episode 500 Average Reward (last 100): -286.310
Episode 600 Average Reward (last 100): -301.230
Episode 700 Average Reward (last 100): -319.610
Episode 800 Average Reward (last 100): -306.250
Episode 900 Average Reward (last 100): -278.070
Episode 1000 Average Reward (last 100): -289.870
Episode 1100 Average Reward (last 100): -277.340
Episode 1200 Average Reward (last 100): -258.010
Episode 1300 Average Reward (last 100): -267.900
Episode 1400 Average Reward (last 100): -261.670
Episode 1500 Average Reward (last 100): -216.340
Episode 1600 Average Reward (last 100): -246.230
Episode 1700 Average Reward (last 100): -273.300
Episode 1800 Average Reward (last 100): -272.470
Episode 1900 Average Reward (last 100): -261.680
Episode 2000 Ave

[32m[I 2022-10-01 05:46:23,112][0m Trial 13 finished with value: -254.17 and parameters: {'learning_rate': 0.2670938501996128, 'gamma': 0.7752193595482318, 'epsilon': 0.19521676032844495}. Best is trial 3 with value: -5.47.[0m



TRIAL #14: eps=0.11593709525873803, gamma=0.5709036059057223
Episode 100 Average Reward (last 100): -451.910
Episode 200 Average Reward (last 100): -586.840
Episode 300 Average Reward (last 100): -399.480
Episode 400 Average Reward (last 100): -284.060
Episode 500 Average Reward (last 100): -301.570
Episode 600 Average Reward (last 100): -281.810
Episode 700 Average Reward (last 100): -271.730
Episode 800 Average Reward (last 100): -293.780
Episode 900 Average Reward (last 100): -270.210
Episode 1000 Average Reward (last 100): -275.590
Episode 1100 Average Reward (last 100): -268.850
Episode 1200 Average Reward (last 100): -264.350
Episode 1300 Average Reward (last 100): -270.830
Episode 1400 Average Reward (last 100): -267.980
Episode 1500 Average Reward (last 100): -265.140
Episode 1600 Average Reward (last 100): -270.560
Episode 1700 Average Reward (last 100): -270.750
Episode 1800 Average Reward (last 100): -280.550
Episode 1900 Average Reward (last 100): -267.680
Episode 2000 Ave

[32m[I 2022-10-01 05:46:52,752][0m Trial 14 finished with value: -242.13 and parameters: {'learning_rate': 0.47928497218870214, 'gamma': 0.5709036059057223, 'epsilon': 0.11593709525873803}. Best is trial 3 with value: -5.47.[0m


Episode 20000 Average Reward (last 100): -242.130

TRIAL #15: eps=0.1581824425148939, gamma=0.84832604567943
Episode 100 Average Reward (last 100): -545.420
Episode 200 Average Reward (last 100): -847.380
Episode 300 Average Reward (last 100): -958.820
Episode 400 Average Reward (last 100): -670.880
Episode 500 Average Reward (last 100): -442.360
Episode 600 Average Reward (last 100): -360.680
Episode 700 Average Reward (last 100): -340.890
Episode 800 Average Reward (last 100): -328.870
Episode 900 Average Reward (last 100): -313.390
Episode 1000 Average Reward (last 100): -288.320
Episode 1100 Average Reward (last 100): -276.960
Episode 1200 Average Reward (last 100): -276.530
Episode 1300 Average Reward (last 100): -272.150
Episode 1400 Average Reward (last 100): -270.630
Episode 1500 Average Reward (last 100): -264.320
Episode 1600 Average Reward (last 100): -262.650
Episode 1700 Average Reward (last 100): -267.320
Episode 1800 Average Reward (last 100): -250.180
Episode 1900 Avera

[32m[I 2022-10-01 05:47:02,568][0m Trial 15 finished with value: -4.93 and parameters: {'learning_rate': 0.03526887216220079, 'gamma': 0.84832604567943, 'epsilon': 0.1581824425148939}. Best is trial 15 with value: -4.93.[0m


Episode 19000 Average Reward (last 100): -2.590
Episode 19100 Average Reward (last 100): -4.340
Episode 19200 Average Reward (last 100): -4.150
Episode 19300 Average Reward (last 100): -4.040
Episode 19400 Average Reward (last 100): -2.970
Episode 19500 Average Reward (last 100): -2.320
Episode 19600 Average Reward (last 100): -3.400
Episode 19700 Average Reward (last 100): -6.210
Episode 19800 Average Reward (last 100): -2.970
Episode 19900 Average Reward (last 100): -3.630
Episode 20000 Average Reward (last 100): -4.930

TRIAL #16: eps=0.1594319424288697, gamma=0.8762195844017038
Episode 100 Average Reward (last 100): -438.700
Episode 200 Average Reward (last 100): -949.920
Episode 300 Average Reward (last 100): -941.600
Episode 400 Average Reward (last 100): -674.470
Episode 500 Average Reward (last 100): -619.940
Episode 600 Average Reward (last 100): -533.060
Episode 700 Average Reward (last 100): -456.240
Episode 800 Average Reward (last 100): -413.060
Episode 900 Average Reward 

[32m[I 2022-10-01 05:47:09,216][0m Trial 16 finished with value: -2.2 and parameters: {'learning_rate': 0.007617314337157292, 'gamma': 0.8762195844017038, 'epsilon': 0.1594319424288697}. Best is trial 16 with value: -2.2.[0m



TRIAL #17: eps=0.1400850145348025, gamma=0.39058011321342107
Episode 100 Average Reward (last 100): -433.730
Episode 200 Average Reward (last 100): -585.700
Episode 300 Average Reward (last 100): -416.900
Episode 400 Average Reward (last 100): -312.410
Episode 500 Average Reward (last 100): -302.070
Episode 600 Average Reward (last 100): -292.860
Episode 700 Average Reward (last 100): -287.290
Episode 800 Average Reward (last 100): -303.770
Episode 900 Average Reward (last 100): -278.660
Episode 1000 Average Reward (last 100): -287.300
Episode 1100 Average Reward (last 100): -274.650
Episode 1200 Average Reward (last 100): -265.110
Episode 1300 Average Reward (last 100): -275.430
Episode 1400 Average Reward (last 100): -282.270
Episode 1500 Average Reward (last 100): -262.770
Episode 1600 Average Reward (last 100): -261.100
Episode 1700 Average Reward (last 100): -259.630
Episode 1800 Average Reward (last 100): -261.180
Episode 1900 Average Reward (last 100): -252.890
Episode 2000 Ave

[32m[I 2022-10-01 05:47:37,394][0m Trial 17 finished with value: -248.67 and parameters: {'learning_rate': 0.13114461623714968, 'gamma': 0.39058011321342107, 'epsilon': 0.1400850145348025}. Best is trial 16 with value: -2.2.[0m



TRIAL #18: eps=0.17483556367299188, gamma=0.6389731969911425
Episode 100 Average Reward (last 100): -481.220
Episode 200 Average Reward (last 100): -627.770
Episode 300 Average Reward (last 100): -439.540
Episode 400 Average Reward (last 100): -315.650
Episode 500 Average Reward (last 100): -305.390
Episode 600 Average Reward (last 100): -324.030
Episode 700 Average Reward (last 100): -307.100
Episode 800 Average Reward (last 100): -303.590
Episode 900 Average Reward (last 100): -296.920
Episode 1000 Average Reward (last 100): -293.230
Episode 1100 Average Reward (last 100): -296.940
Episode 1200 Average Reward (last 100): -302.740
Episode 1300 Average Reward (last 100): -297.580
Episode 1400 Average Reward (last 100): -277.620
Episode 1500 Average Reward (last 100): -263.980
Episode 1600 Average Reward (last 100): -261.300
Episode 1700 Average Reward (last 100): -253.800
Episode 1800 Average Reward (last 100): -251.500
Episode 1900 Average Reward (last 100): -277.050
Episode 2000 Ave

[32m[I 2022-10-01 05:48:02,320][0m Trial 18 finished with value: -190.34 and parameters: {'learning_rate': 0.16568977589402378, 'gamma': 0.6389731969911425, 'epsilon': 0.17483556367299188}. Best is trial 16 with value: -2.2.[0m


Episode 20000 Average Reward (last 100): -190.340

TRIAL #19: eps=0.0859457757172045, gamma=0.8997568520596844
Episode 100 Average Reward (last 100): -453.620
Episode 200 Average Reward (last 100): -870.580
Episode 300 Average Reward (last 100): -950.240
Episode 400 Average Reward (last 100): -848.720
Episode 500 Average Reward (last 100): -625.430
Episode 600 Average Reward (last 100): -364.560
Episode 700 Average Reward (last 100): -390.800
Episode 800 Average Reward (last 100): -408.260
Episode 900 Average Reward (last 100): -352.090
Episode 1000 Average Reward (last 100): -308.900
Episode 1100 Average Reward (last 100): -284.150
Episode 1200 Average Reward (last 100): -275.330
Episode 1300 Average Reward (last 100): -262.280
Episode 1400 Average Reward (last 100): -266.240
Episode 1500 Average Reward (last 100): -250.220
Episode 1600 Average Reward (last 100): -251.480
Episode 1700 Average Reward (last 100): -251.120
Episode 1800 Average Reward (last 100): -257.240
Episode 1900 Ave

[32m[I 2022-10-01 05:48:26,090][0m Trial 19 finished with value: -108.83 and parameters: {'learning_rate': 0.02909295418926345, 'gamma': 0.8997568520596844, 'epsilon': 0.0859457757172045}. Best is trial 16 with value: -2.2.[0m



TRIAL #20: eps=0.12834449095225414, gamma=0.9996571995152939
Episode 100 Average Reward (last 100): -490.070
Episode 200 Average Reward (last 100): -1149.320
Episode 300 Average Reward (last 100): -939.210
Episode 400 Average Reward (last 100): -714.800
Episode 500 Average Reward (last 100): -743.690
Episode 600 Average Reward (last 100): -804.980
Episode 700 Average Reward (last 100): -869.350
Episode 800 Average Reward (last 100): -786.330
Episode 900 Average Reward (last 100): -773.360
Episode 1000 Average Reward (last 100): -847.120
Episode 1100 Average Reward (last 100): -680.320
Episode 1200 Average Reward (last 100): -690.720
Episode 1300 Average Reward (last 100): -736.000
Episode 1400 Average Reward (last 100): -736.290
Episode 1500 Average Reward (last 100): -719.450
Episode 1600 Average Reward (last 100): -805.750
Episode 1700 Average Reward (last 100): -601.190
Episode 1800 Average Reward (last 100): -651.910
Episode 1900 Average Reward (last 100): -683.290
Episode 2000 Av

[32m[I 2022-10-01 05:48:49,600][0m Trial 20 finished with value: -620.31 and parameters: {'learning_rate': 0.3985605525630849, 'gamma': 0.9996571995152939, 'epsilon': 0.12834449095225414}. Best is trial 16 with value: -2.2.[0m


#### Execução

In [61]:
ENV_NAME = "Taxi-v3"  
#ENV_NAME = "MountainCarContinuous-v0"  
#ENV_NAME = "LunarLander-v2"  
env = gym.make(ENV_NAME)

#parameters: {'learning_rate': 0.007617314337157292, 'gamma': 0.8762195844017038, 'epsilon': 0.1594319424288697}. Best is trial 16 with value: -2.2.
if __name__ == "__main__":
    r_max_plot = 10

    EPISODES = 100000
    LR = 0.007617314337157292
    GAMMA = 0.8762195844017038
    EPSILON = 0.1594319424288697

    
    # Roda o algoritmo Monte-Carlo para o problema de controle (ou seja, para achar a política ótima)
    rewards, Qtable = run_montecarloOnP(env, EPISODES, LR, GAMMA, EPSILON, render=False)
    print("Últimos resultados: media =", np.mean(rewards[-20:]), ", desvio padrao =", np.std(rewards[-20:]))

    # Mostra um gráfico de episódios x retornos (não descontados)
    # Se quiser salvar, passe o nome do arquivo no 3o parâmetro
    filename = f"results/montecarloOnP-{ENV_NAME.lower()[0:8]}-ep{EPISODES}.png"
    # plot_result(rewards, r_max_plot, None)

    # test_greedy_Q_policy(env, Qtable, 10, True)
    env.close()

Episode 100 Average Reward (last 100): -549.290
Episode 200 Average Reward (last 100): -922.890
Episode 300 Average Reward (last 100): -931.770
Episode 400 Average Reward (last 100): -736.940
Episode 500 Average Reward (last 100): -629.520
Episode 600 Average Reward (last 100): -471.410
Episode 700 Average Reward (last 100): -440.600
Episode 800 Average Reward (last 100): -379.740
Episode 900 Average Reward (last 100): -354.050
Episode 1000 Average Reward (last 100): -307.790
Episode 1100 Average Reward (last 100): -316.920
Episode 1200 Average Reward (last 100): -268.470
Episode 1300 Average Reward (last 100): -230.530
Episode 1400 Average Reward (last 100): -222.450
Episode 1500 Average Reward (last 100): -194.800
Episode 1600 Average Reward (last 100): -196.820
Episode 1700 Average Reward (last 100): -177.170
Episode 1800 Average Reward (last 100): -179.720
Episode 1900 Average Reward (last 100): -145.470
Episode 2000 Average Reward (last 100): -124.170
Episode 2100 Average Reward (

### Contínuo

In [44]:
#ENV_NAME = "Taxi-v3"  
ENV_NAME = "MountainCarContinuous-v0"  
#ENV_NAME = "LunarLander-v2"  
env = gym.make(ENV_NAME)

In [45]:
from numpy.random.mtrand import gamma

import optuna

#from wrappers import DiscreteObservationWrapper


ENV = gym.make("Taxi-v3")


# Esta função faz um treinamento com o Expected-SARSA, usando parâmetros sugeridos pelo Optuna.
# Retorna a média dos retornos dos últimos 100 episódios.
def train_values(trial : optuna.Trial):
    
    # chama os métodos do "trial" (tentativa) para sugerir valores para os parâmetros
    gamma = trial.suggest_uniform('gamma', 0.02, 1.0)
    eps = trial.suggest_uniform('epsilon', 0.01, 0.2)
    #bins1 = trial.suggest_int('bins1', 5, 100)
    #bins2 = trial.suggest_int('bins2', 5, 100)
    
    print(f"\nTRIAL #{trial.number}: eps={eps}, gamma={gamma}")

    # roda o algoritmo e recebe os retornos não-descontados
    #env_wrapper = DiscreteObservationWrapper(ENV, [bins1,bins2])
    (returns, _) = run_montecarloOP(env, 20000, gamma, eps, render=False)
    return sum(returns[-100:])/100

In [None]:
study = optuna.create_study(direction='maximize', 
                            storage='sqlite:///optuna_studies.db', 
                            study_name= 'new_MC_offpolice_cont', 
                            load_if_exists=True)
study.optimize(train_values, n_trials=20) 

In [None]:
if __name__ == "__main__":
    r_max_plot = 10

    EPISODES = 100000
    LR = 0.01
    GAMMA = 0.95
    EPSILON = 0.1

    
    # Roda o algoritmo Monte-Carlo para o problema de controle (ou seja, para achar a política ótima)
    rewards, Qtable = run_montecarloOffP(env, EPISODES, LR, GAMMA, EPSILON, render=False)
    print("Últimos resultados: media =", np.mean(rewards[-20:]), ", desvio padrao =", np.std(rewards[-20:]))

    # Mostra um gráfico de episódios x retornos (não descontados)
    # Se quiser salvar, passe o nome do arquivo no 3o parâmetro
    filename = f"results/montecarlo2-{ENV_NAME.lower()[0:8]}-ep{EPISODES}.png"
    plot_result(rewards, r_max_plot, None)

    test_greedy_Q_policy(env, Qtable, 10, True)
    env.close()

### Exibe e avalia o agente

In [18]:
record_video(ENV_NAME, model, video_length=1000, prefix='monte-car-off-police')
show_videos('videos', prefix='monte-car-off-police')

NameError: name 'model' is not defined

In [None]:
#mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=30)

#print(f"Retorno médio: {mean_reward:.2f} +/- {std_reward:.2f}")