# Monte-Carlo Off-Police

### Instalação de pacotes

In [9]:
from IPython.display import clear_output
!apt-get install ffmpeg freeglut3-dev xvfb  
!pip install gym[all]==00.25.1
!pip install gym[atari,accept-rom-license]==00.25.1
!pip install pyglet
!pip install stable-baselines3[extra]
!pip install optuna
clear_output()

In [10]:
!mkdir log_project

mkdir: cannot create directory ‘log_project’: File exists


### Imports


In [11]:
import gym
import numpy as np
import tensorboard
import matplotlib.pyplot as plt
import time

%load_ext tensorboard

import gym
import numpy as np


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


### Para salvar vídeo

In [12]:
# Set up fake display; otherwise rendering will fail
import os
!pip install stable-baselines3[extra]
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ['DISPLAY'] = ':1'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


A gravação é feita com o wrapper [VecVideoRecorder](https://stable-baselines.readthedocs.io/en/master/guide/vec_envs.html#vecvideorecorder).

In [13]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  :param env_id: (str)
  :param model: (RL model)
  :param video_length: (int)
  :param prefix: (str)
  :param video_folder: (str)
  """
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start the video at step=0 and record the given number of steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  obs = eval_env.reset()
  for _ in range(video_length):
    action, _ = model.predict(obs)
    obs, _, _, _ = eval_env.step(action)

  # Close the video recorder
  eval_env.close()

In [14]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """
  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Filter the video, showing only the only starting with this prefix
  """
  html = []
  for mp4 in Path(video_path).glob("{}*.mp4".format(prefix)):
      video_b64 = base64.b64encode(mp4.read_bytes())
      html.append('''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''.format(mp4, video_b64.decode('ascii')))
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))

In [60]:
# ideias adaptadas de : https://www.anyscale.com/blog/an-introduction-to-reinforcement-learning-with-openai-gym-rllib-and-google
from base64 import b64encode
from IPython.display import HTML
from gym.wrappers.monitoring.video_recorder import VideoRecorder

def render_mp4(videopath: str) -> str:
  """
  Gets a string containing a b4-encoded version of the MP4 video
  at the specified path.
  """
  mp4 = open(videopath, 'rb').read()
  base64_encoded_mp4 = b64encode(mp4).decode()
  html_code = f'<video width=400 controls><source src="data:video/mp4;' \
         f'base64,{base64_encoded_mp4}" type="video/mp4"></video>'
  return HTML(html_code)

# Código

In [15]:
ENV_NAME = "Taxi-v3"  
 
env = gym.make(ENV_NAME)

## Off-policy

In [41]:
def choose_action(Q, state):
    return np.argmax(Q[state])

def choose_actionB(num_actions):
    return np.random.randint(0, num_actions)

# Algoritmo Monte-Carlo de Controle, variante "toda-visita".
# Atenção: os espaços de estados e de ações precisam ser discretos, dados por valores inteiros
def run_montecarloOffP(env, episodes, gamma=0.95, epsilon=0.1, render=False):
    assert isinstance(env.observation_space, gym.spaces.Discrete)
    assert isinstance(env.action_space, gym.spaces.Discrete)
    
    num_actions = env.action_space.n
    
    # inicializa a tabela Q toda com zero,
    # usar o estado como índice das linhas e a ação como índice das colunas
    Q = np.zeros(shape = (env.observation_space.n, num_actions))
    C = np.zeros(shape = (env.observation_space.n, num_actions))

    # para cada episódio, guarda sua soma de recompensas (retorno não-discontado)
    sum_rewards_per_ep = []

    # loop principal
    for i in range(episodes):
        done = False
        sum_rewards, reward = 0, 0
        ep_trajectory = []
        
        state = env.reset()
    
        # PARTE 1: executa um episódio completo
        while done != True:   
            # exibe/renderiza os passos no ambiente, durante 1 episódio a cada mil e também nos últimos 5 episódios 
            if render and (i >= (episodes - 5) or (i+1) % 1000 == 0):
                env.render()
                
            # escolhe a próxima ação -- usa epsilon-greedy
            action = choose_action(Q, state)
        
            # realiza a ação, ou seja, dá um passo no ambiente
            next_state, reward, done, _ = env.step(action)
            
            # adiciona a tripla que representa este passo
            ep_trajectory.append( (state, action, reward) )
            
            sum_rewards += reward
            state = next_state
        
        sum_rewards_per_ep.append(sum_rewards)

        # a cada 100 episódios, imprime informação sobre o progresso 
        if (i+1) % 100 == 0:
            avg_reward = np.mean(sum_rewards_per_ep[-100:])
            print(f"Episode {i+1} Average Reward (last 100): {avg_reward:.3f}")

        # PARTE 2: atualiza Q (e a política, implicitamente)
        Gt = 0
        W = 1
        for (s, a, r) in reversed(ep_trajectory):
            Gt = r + gamma*Gt
            C[s,a] = C[s,a] + W
            delta = W * (Gt - Q[s,a])
            Q[s,a] = Q[s,a] + (1/C[s,a])* delta
            best = choose_action(Q,s)
            if best != choose_actionB(num_actions): break
            W = W*(1/(1/num_actions))

    return sum_rewards_per_ep, Q


## On-Policy

In [34]:
# Esta é a política. Neste caso, escolhe uma ação com base nos valores
# da tabela Q, usando uma estratégia epsilon-greedy.
def pi_policy(Q, state, num_actions, epsilon):
    if np.random.random() < epsilon:
        return np.random.randint(0, num_actions)
    else:
        return np.argmax(Q[state])


# Algoritmo Monte-Carlo de Controle, variante "toda-visita".
# Atenção: os espaços de estados e de ações precisam ser discretos, dados por valores inteiros
def run_montecarloOnP(env, episodes, lr=0.1, gamma=0.95, epsilon=0.1, render=False):
    assert isinstance(env.observation_space, gym.spaces.Discrete)
    assert isinstance(env.action_space, gym.spaces.Discrete)
    
    num_actions = env.action_space.n
    
    # inicializa a tabela Q toda com zero,
    # usar o estado como índice das linhas e a ação como índice das colunas
    Q = np.zeros(shape = (env.observation_space.n, num_actions))

    # para cada episódio, guarda sua soma de recompensas (retorno não-discontado)
    sum_rewards_per_ep = []

    # loop principal
    for i in range(episodes):
        done = False
        sum_rewards, reward = 0, 0
        ep_trajectory = []
        
        state = env.reset()
    
        # PARTE 1: executa um episódio completo
        while done != True:   
            # exibe/renderiza os passos no ambiente, durante 1 episódio a cada mil e também nos últimos 5 episódios 
            if render and (i >= (episodes - 5) or (i+1) % 1000 == 0):
                env.render()
                
            # escolhe a próxima ação -- usa epsilon-greedy
            action = pi_policy(Q, state, num_actions, epsilon)
        
            # realiza a ação, ou seja, dá um passo no ambiente
            next_state, reward, done, _ = env.step(action)
            
            # adiciona a tripla que representa este passo
            ep_trajectory.append( (state, action, reward) )
            
            sum_rewards += reward
            state = next_state
        
        sum_rewards_per_ep.append(sum_rewards)

        # a cada 100 episódios, imprime informação sobre o progresso 
        if (i+1) % 100 == 0:
            avg_reward = np.mean(sum_rewards_per_ep[-100:])
            print(f"Episode {i+1} Average Reward (last 100): {avg_reward:.3f}")

        # PARTE 2: atualiza Q (e a política, implicitamente)
        Gt = 0
        for (s, a, r) in reversed(ep_trajectory):
            Gt = r + gamma*Gt
            delta = Gt - Q[s,a]
            Q[s,a] = Q[s,a] + lr * delta

    return sum_rewards_per_ep, Q

### Execução Off-Policy

### Otimiza Parâmetros

In [18]:
from numpy.random.mtrand import gamma

import optuna



ENV = gym.make("Taxi-v3")


# Esta função faz um treinamento com o Expected-SARSA, usando parâmetros sugeridos pelo Optuna.
# Retorna a média dos retornos dos últimos 100 episódios.
def train_values(trial : optuna.Trial):
    
    # chama os métodos do "trial" (tentativa) para sugerir valores para os parâmetros
    gamma = trial.suggest_uniform('gamma', 0.02, 1.0)
    eps = trial.suggest_uniform('epsilon', 0.01, 0.2)
    #bins1 = trial.suggest_int('bins1', 5, 100)
    #bins2 = trial.suggest_int('bins2', 5, 100)
    
    print(f"\nTRIAL #{trial.number}: eps={eps}, gamma={gamma}")

    # roda o algoritmo e recebe os retornos não-descontados
    #env_wrapper = DiscreteObservationWrapper(ENV, [bins1,bins2])
    (returns, _) = run_montecarloOffP(env, 20000, gamma, eps, render=False)
    return sum(returns[-100:])/100 

In [19]:
study = optuna.create_study(direction='maximize', 
                            storage='sqlite:///optuna_studies.db', 
                            study_name= 'new_MC_offpolice', 
                            load_if_exists=True)
study.optimize(train_values, n_trials=20) 

[32m[I 2022-10-02 20:05:44,145][0m Using an existing study with name 'new_MC_offpolice' instead of creating a new one.[0m
  from ipykernel import kernelapp as app
  app.launch_new_instance()



TRIAL #1: eps=0.16324236563466543, gamma=0.579879263191128
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -289.730
Episode 300 Average Reward (last 100): -307.460
Episode 400 Average Reward (last 100): -468.560
Episode 500 Average Reward (last 100): -576.020
Episode 600 Average Reward (last 100): -827.930
Episode 700 Average Reward (last 100): -827.930
Episode 800 Average Reward (last 100): -954.470
Episode 900 Average Reward (last 100): -864.290
Episode 1000 Average Reward (last 100): -954.560
Episode 1100 Average Reward (last 100): -1080.110
Episode 1200 Average Reward (last 100): -954.560
Episode 1300 Average Reward (last 100): -810.110
Episode 1400 Average Reward (last 100): -809.930
Episode 1500 Average Reward (last 100): -737.390
Episode 1600 Average Reward (last 100): -756.380
Episode 1700 Average Reward (last 100): -559.010
Episode 1800 Average Reward (last 100): -666.650
Episode 1900 Average Reward (last 100): -522.830
Episode 2000 Aver

[32m[I 2022-10-02 20:06:48,326][0m Trial 1 finished with value: -200.0 and parameters: {'gamma': 0.579879263191128, 'epsilon': 0.16324236563466543}. Best is trial 1 with value: -200.0.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #2: eps=0.04986517974023241, gamma=0.35524532169751455
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -289.190
Episode 300 Average Reward (last 100): -360.290
Episode 400 Average Reward (last 100): -432.380
Episode 500 Average Reward (last 100): -630.290
Episode 600 Average Reward (last 100): -703.280
Episode 700 Average Reward (last 100): -916.850
Episode 800 Average Reward (last 100): -666.830
Episode 900 Average Reward (last 100): -882.200
Episode 1000 Average Reward (last 100): -1044.020
Episode 1100 Average Reward (last 100): -898.670
Episode 1200 Average Reward (last 100): -1043.390
Episode 1300 Average Reward (last 100): -899.930
Episode 1400 Average Reward (last 100): -864.470
Episode 1500 Average Reward (last 100): -792.020
Episode 1600 Average Reward (last 100): -666.560
Episode 1700 Average Reward (last 100): -630.290
Episode 1800 Average Reward (last 100): -486.650
Episode 1900 

[32m[I 2022-10-02 20:07:43,611][0m Trial 2 finished with value: -217.64 and parameters: {'gamma': 0.35524532169751455, 'epsilon': 0.04986517974023241}. Best is trial 1 with value: -200.0.[0m


Episode 20000 Average Reward (last 100): -217.640

TRIAL #3: eps=0.19938233226747074, gamma=0.48545617660023727
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -432.740
Episode 400 Average Reward (last 100): -522.740
Episode 500 Average Reward (last 100): -683.480
Episode 600 Average Reward (last 100): -666.020
Episode 700 Average Reward (last 100): -864.290
Episode 800 Average Reward (last 100): -953.930
Episode 900 Average Reward (last 100): -882.290
Episode 1000 Average Reward (last 100): -1062.650
Episode 1100 Average Reward (last 100): -936.380
Episode 1200 Average Reward (last 100): -918.110
Episode 1300 Average Reward (last 100): -900.470
Episode 1400 Average Reward (last 100): -864.560
Episode 1500 Average Reward (last 100): -774.020
Episode 1600 Average Reward (last 100): -683.570
Episode 1700 Average Reward (last 100): -648.830
Episode 1800 Average Reward (last 100): -666.830
Episode 1900 A

[32m[I 2022-10-02 20:08:38,579][0m Trial 3 finished with value: -200.0 and parameters: {'gamma': 0.48545617660023727, 'epsilon': 0.19938233226747074}. Best is trial 1 with value: -200.0.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #4: eps=0.07343566143755513, gamma=0.28972660100270037
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -253.370
Episode 300 Average Reward (last 100): -289.010
Episode 400 Average Reward (last 100): -522.470
Episode 500 Average Reward (last 100): -613.460
Episode 600 Average Reward (last 100): -917.750
Episode 700 Average Reward (last 100): -721.460
Episode 800 Average Reward (last 100): -827.750
Episode 900 Average Reward (last 100): -918.200
Episode 1000 Average Reward (last 100): -1026.560
Episode 1100 Average Reward (last 100): -880.940
Episode 1200 Average Reward (last 100): -738.740
Episode 1300 Average Reward (last 100): -882.560
Episode 1400 Average Reward (last 100): -1043.930
Episode 1500 Average Reward (last 100): -756.920
Episode 1600 Average Reward (last 100): -900.380
Episode 1700 Average Reward (last 100): -685.010
Episode 1800 Average Reward (last 100): -630.650
Episode 1900 

[32m[I 2022-10-02 20:09:33,867][0m Trial 4 finished with value: -215.4 and parameters: {'gamma': 0.28972660100270037, 'epsilon': 0.07343566143755513}. Best is trial 1 with value: -200.0.[0m


Episode 20000 Average Reward (last 100): -215.400

TRIAL #5: eps=0.14072807551338523, gamma=0.05846762765435902
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -218.000
Episode 300 Average Reward (last 100): -236.000
Episode 400 Average Reward (last 100): -649.370
Episode 500 Average Reward (last 100): -486.650
Episode 600 Average Reward (last 100): -666.650
Episode 700 Average Reward (last 100): -935.030
Episode 800 Average Reward (last 100): -936.470
Episode 900 Average Reward (last 100): -1133.750
Episode 1000 Average Reward (last 100): -972.470
Episode 1100 Average Reward (last 100): -988.130
Episode 1200 Average Reward (last 100): -846.740
Episode 1300 Average Reward (last 100): -954.650
Episode 1400 Average Reward (last 100): -685.190
Episode 1500 Average Reward (last 100): -649.190
Episode 1600 Average Reward (last 100): -684.290
Episode 1700 Average Reward (last 100): -523.100
Episode 1800 Average Reward (last 100): -631.100
Episode 1900 A

[32m[I 2022-10-02 20:10:28,541][0m Trial 5 finished with value: -200.0 and parameters: {'gamma': 0.05846762765435902, 'epsilon': 0.14072807551338523}. Best is trial 1 with value: -200.0.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #6: eps=0.12050481854353139, gamma=0.648253946281938
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -271.550
Episode 300 Average Reward (last 100): -306.830
Episode 400 Average Reward (last 100): -485.660
Episode 500 Average Reward (last 100): -736.670
Episode 600 Average Reward (last 100): -703.010
Episode 700 Average Reward (last 100): -880.940
Episode 800 Average Reward (last 100): -899.660
Episode 900 Average Reward (last 100): -881.840
Episode 1000 Average Reward (last 100): -899.480
Episode 1100 Average Reward (last 100): -1133.750
Episode 1200 Average Reward (last 100): -864.020
Episode 1300 Average Reward (last 100): -918.200
Episode 1400 Average Reward (last 100): -738.200
Episode 1500 Average Reward (last 100): -774.380
Episode 1600 Average Reward (last 100): -809.390
Episode 1700 Average Reward (last 100): -702.110
Episode 1800 Average Reward (last 100): -577.010
Episode 1900 Ave

[32m[I 2022-10-02 20:11:22,992][0m Trial 6 finished with value: -189.47 and parameters: {'gamma': 0.648253946281938, 'epsilon': 0.12050481854353139}. Best is trial 6 with value: -189.47.[0m


Episode 20000 Average Reward (last 100): -189.470

TRIAL #7: eps=0.013823582642653015, gamma=0.38913662678825733
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -235.910
Episode 300 Average Reward (last 100): -343.190
Episode 400 Average Reward (last 100): -414.470
Episode 500 Average Reward (last 100): -449.930
Episode 600 Average Reward (last 100): -701.660
Episode 700 Average Reward (last 100): -880.940
Episode 800 Average Reward (last 100): -954.560
Episode 900 Average Reward (last 100): -953.570
Episode 1000 Average Reward (last 100): -972.470
Episode 1100 Average Reward (last 100): -1097.210
Episode 1200 Average Reward (last 100): -990.380
Episode 1300 Average Reward (last 100): -972.200
Episode 1400 Average Reward (last 100): -917.930
Episode 1500 Average Reward (last 100): -828.020
Episode 1600 Average Reward (last 100): -773.750
Episode 1700 Average Reward (last 100): -684.740
Episode 1800 Average Reward (last 100): -379.370
Episode 1900 

[32m[I 2022-10-02 20:12:16,395][0m Trial 7 finished with value: -204.89 and parameters: {'gamma': 0.38913662678825733, 'epsilon': 0.013823582642653015}. Best is trial 6 with value: -189.47.[0m


Episode 20000 Average Reward (last 100): -204.890

TRIAL #8: eps=0.04744630364193296, gamma=0.3773829324617836
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -253.460
Episode 300 Average Reward (last 100): -395.840
Episode 400 Average Reward (last 100): -343.010
Episode 500 Average Reward (last 100): -630.020
Episode 600 Average Reward (last 100): -630.650
Episode 700 Average Reward (last 100): -810.740
Episode 800 Average Reward (last 100): -1026.470
Episode 900 Average Reward (last 100): -1008.830
Episode 1000 Average Reward (last 100): -1025.930
Episode 1100 Average Reward (last 100): -793.370
Episode 1200 Average Reward (last 100): -900.650
Episode 1300 Average Reward (last 100): -846.650
Episode 1400 Average Reward (last 100): -918.380
Episode 1500 Average Reward (last 100): -774.290
Episode 1600 Average Reward (last 100): -953.750
Episode 1700 Average Reward (last 100): -702.740
Episode 1800 Average Reward (last 100): -415.460
Episode 1900 

[32m[I 2022-10-02 20:13:11,445][0m Trial 8 finished with value: -200.0 and parameters: {'gamma': 0.3773829324617836, 'epsilon': 0.04744630364193296}. Best is trial 6 with value: -189.47.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #9: eps=0.038578676730835204, gamma=0.7947988931592417
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -235.370
Episode 300 Average Reward (last 100): -289.460
Episode 400 Average Reward (last 100): -504.470
Episode 500 Average Reward (last 100): -558.470
Episode 600 Average Reward (last 100): -810.110
Episode 700 Average Reward (last 100): -756.830
Episode 800 Average Reward (last 100): -1043.840
Episode 900 Average Reward (last 100): -955.010
Episode 1000 Average Reward (last 100): -918.020
Episode 1100 Average Reward (last 100): -1078.940
Episode 1200 Average Reward (last 100): -917.840
Episode 1300 Average Reward (last 100): -900.830
Episode 1400 Average Reward (last 100): -990.110
Episode 1500 Average Reward (last 100): -684.470
Episode 1600 Average Reward (last 100): -774.470
Episode 1700 Average Reward (last 100): -558.920
Episode 1800 Average Reward (last 100): -613.010
Episode 1900 

[32m[I 2022-10-02 20:14:06,190][0m Trial 9 finished with value: -200.0 and parameters: {'gamma': 0.7947988931592417, 'epsilon': 0.038578676730835204}. Best is trial 6 with value: -189.47.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #10: eps=0.0574394432649885, gamma=0.8593240579712539
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -235.280
Episode 300 Average Reward (last 100): -271.730
Episode 400 Average Reward (last 100): -343.910
Episode 500 Average Reward (last 100): -702.740
Episode 600 Average Reward (last 100): -792.740
Episode 700 Average Reward (last 100): -811.370
Episode 800 Average Reward (last 100): -882.290
Episode 900 Average Reward (last 100): -1026.650
Episode 1000 Average Reward (last 100): -1044.920
Episode 1100 Average Reward (last 100): -1062.290
Episode 1200 Average Reward (last 100): -1081.010
Episode 1300 Average Reward (last 100): -882.560
Episode 1400 Average Reward (last 100): -648.470
Episode 1500 Average Reward (last 100): -613.190
Episode 1600 Average Reward (last 100): -827.840
Episode 1700 Average Reward (last 100): -719.930
Episode 1800 Average Reward (last 100): -667.100
Episode 1900

[32m[I 2022-10-02 20:15:00,991][0m Trial 10 finished with value: -217.64 and parameters: {'gamma': 0.8593240579712539, 'epsilon': 0.0574394432649885}. Best is trial 6 with value: -189.47.[0m


Episode 20000 Average Reward (last 100): -217.640


  app.launch_new_instance()



TRIAL #11: eps=0.11237034617663662, gamma=0.7105245196343952
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -253.910
Episode 300 Average Reward (last 100): -271.100
Episode 400 Average Reward (last 100): -503.840
Episode 500 Average Reward (last 100): -612.020
Episode 600 Average Reward (last 100): -648.380
Episode 700 Average Reward (last 100): -918.110
Episode 800 Average Reward (last 100): -846.290
Episode 900 Average Reward (last 100): -952.310
Episode 1000 Average Reward (last 100): -952.940
Episode 1100 Average Reward (last 100): -810.200
Episode 1200 Average Reward (last 100): -935.840
Episode 1300 Average Reward (last 100): -864.290
Episode 1400 Average Reward (last 100): -971.930
Episode 1500 Average Reward (last 100): -792.920
Episode 1600 Average Reward (last 100): -881.480
Episode 1700 Average Reward (last 100): -739.190
Episode 1800 Average Reward (last 100): -613.190
Episode 1900 Average Reward (last 100): -612.470
Episode 2000 Ave

[32m[I 2022-10-02 20:15:55,710][0m Trial 11 finished with value: -189.42 and parameters: {'gamma': 0.7105245196343952, 'epsilon': 0.11237034617663662}. Best is trial 11 with value: -189.42.[0m


Episode 20000 Average Reward (last 100): -189.420

TRIAL #12: eps=0.111390575076001, gamma=0.642156757541797


  from ipykernel import kernelapp as app


Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -235.820
Episode 300 Average Reward (last 100): -289.190
Episode 400 Average Reward (last 100): -468.470
Episode 500 Average Reward (last 100): -541.280
Episode 600 Average Reward (last 100): -810.110
Episode 700 Average Reward (last 100): -863.840
Episode 800 Average Reward (last 100): -1026.830
Episode 900 Average Reward (last 100): -990.740
Episode 1000 Average Reward (last 100): -1044.290
Episode 1100 Average Reward (last 100): -864.380
Episode 1200 Average Reward (last 100): -918.290
Episode 1300 Average Reward (last 100): -1062.020
Episode 1400 Average Reward (last 100): -774.650
Episode 1500 Average Reward (last 100): -864.290
Episode 1600 Average Reward (last 100): -648.920
Episode 1700 Average Reward (last 100): -595.100
Episode 1800 Average Reward (last 100): -684.560
Episode 1900 Average Reward (last 100): -577.460
Episode 2000 Average Reward (last 100): -415.190
Episode 2100 Average Rewar

[32m[I 2022-10-02 20:16:50,896][0m Trial 12 finished with value: -197.86 and parameters: {'gamma': 0.642156757541797, 'epsilon': 0.111390575076001}. Best is trial 11 with value: -189.42.[0m


Episode 20000 Average Reward (last 100): -197.860

TRIAL #13: eps=0.12027429387135061, gamma=0.725559168002213
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -236.000
Episode 300 Average Reward (last 100): -289.640
Episode 400 Average Reward (last 100): -576.020
Episode 500 Average Reward (last 100): -666.830
Episode 600 Average Reward (last 100): -594.560
Episode 700 Average Reward (last 100): -773.390
Episode 800 Average Reward (last 100): -1026.110
Episode 900 Average Reward (last 100): -864.650
Episode 1000 Average Reward (last 100): -1079.660
Episode 1100 Average Reward (last 100): -1043.840
Episode 1200 Average Reward (last 100): -973.010
Episode 1300 Average Reward (last 100): -738.740
Episode 1400 Average Reward (last 100): -989.750
Episode 1500 Average Reward (last 100): -792.110
Episode 1600 Average Reward (last 100): -685.100
Episode 1700 Average Reward (last 100): -648.560
Episode 1800 Average Reward (last 100): -468.920
Episode 1900 

[32m[I 2022-10-02 20:17:45,940][0m Trial 13 finished with value: -213.2 and parameters: {'gamma': 0.725559168002213, 'epsilon': 0.12027429387135061}. Best is trial 11 with value: -189.42.[0m


Episode 20000 Average Reward (last 100): -213.200

TRIAL #14: eps=0.09638110096368914, gamma=0.9351345454035757
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -236.000
Episode 300 Average Reward (last 100): -342.920
Episode 400 Average Reward (last 100): -487.100
Episode 500 Average Reward (last 100): -703.280
Episode 600 Average Reward (last 100): -827.750
Episode 700 Average Reward (last 100): -739.550
Episode 800 Average Reward (last 100): -720.830
Episode 900 Average Reward (last 100): -936.200
Episode 1000 Average Reward (last 100): -1132.220
Episode 1100 Average Reward (last 100): -936.650
Episode 1200 Average Reward (last 100): -721.550
Episode 1300 Average Reward (last 100): -935.750
Episode 1400 Average Reward (last 100): -789.450
Episode 1500 Average Reward (last 100): -755.660
Episode 1600 Average Reward (last 100): -751.960
Episode 1700 Average Reward (last 100): -737.660
Episode 1800 Average Reward (last 100): -521.060
Episode 1900 A

[32m[I 2022-10-02 20:18:38,765][0m Trial 14 finished with value: -213.71 and parameters: {'gamma': 0.9351345454035757, 'epsilon': 0.09638110096368914}. Best is trial 11 with value: -189.42.[0m


Episode 20000 Average Reward (last 100): -213.710

TRIAL #15: eps=0.15311913820194045, gamma=0.684093429894286
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -271.460
Episode 300 Average Reward (last 100): -342.470
Episode 400 Average Reward (last 100): -469.190
Episode 500 Average Reward (last 100): -630.830
Episode 600 Average Reward (last 100): -648.830
Episode 700 Average Reward (last 100): -810.830
Episode 800 Average Reward (last 100): -936.560
Episode 900 Average Reward (last 100): -1080.380
Episode 1000 Average Reward (last 100): -1098.290
Episode 1100 Average Reward (last 100): -972.740
Episode 1200 Average Reward (last 100): -954.020
Episode 1300 Average Reward (last 100): -990.380
Episode 1400 Average Reward (last 100): -846.290
Episode 1500 Average Reward (last 100): -703.370
Episode 1600 Average Reward (last 100): -720.110
Episode 1700 Average Reward (last 100): -720.470
Episode 1800 Average Reward (last 100): -541.550
Episode 1900 A

[32m[I 2022-10-02 20:19:32,186][0m Trial 15 finished with value: -206.94 and parameters: {'gamma': 0.684093429894286, 'epsilon': 0.15311913820194045}. Best is trial 11 with value: -189.42.[0m


Episode 20000 Average Reward (last 100): -206.940

TRIAL #16: eps=0.09098739603067361, gamma=0.5428868516086074
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -270.830
Episode 300 Average Reward (last 100): -361.100
Episode 400 Average Reward (last 100): -487.190
Episode 500 Average Reward (last 100): -720.470
Episode 600 Average Reward (last 100): -648.560
Episode 700 Average Reward (last 100): -702.020
Episode 800 Average Reward (last 100): -666.380
Episode 900 Average Reward (last 100): -1008.200
Episode 1000 Average Reward (last 100): -1008.560
Episode 1100 Average Reward (last 100): -953.750
Episode 1200 Average Reward (last 100): -1008.560
Episode 1300 Average Reward (last 100): -990.200
Episode 1400 Average Reward (last 100): -793.190
Episode 1500 Average Reward (last 100): -810.470
Episode 1600 Average Reward (last 100): -757.010
Episode 1700 Average Reward (last 100): -702.470
Episode 1800 Average Reward (last 100): -720.740
Episode 1900

[32m[I 2022-10-02 20:20:24,513][0m Trial 16 finished with value: -168.71 and parameters: {'gamma': 0.5428868516086074, 'epsilon': 0.09098739603067361}. Best is trial 16 with value: -168.71.[0m


Episode 20000 Average Reward (last 100): -168.710

TRIAL #17: eps=0.08883968208202954, gamma=0.22559754408842642
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -235.100
Episode 300 Average Reward (last 100): -307.460
Episode 400 Average Reward (last 100): -504.290
Episode 500 Average Reward (last 100): -577.370
Episode 600 Average Reward (last 100): -809.120
Episode 700 Average Reward (last 100): -683.750
Episode 800 Average Reward (last 100): -880.580
Episode 900 Average Reward (last 100): -900.110
Episode 1000 Average Reward (last 100): -1096.580
Episode 1100 Average Reward (last 100): -846.560
Episode 1200 Average Reward (last 100): -1007.660
Episode 1300 Average Reward (last 100): -937.280
Episode 1400 Average Reward (last 100): -972.380
Episode 1500 Average Reward (last 100): -773.480
Episode 1600 Average Reward (last 100): -918.380
Episode 1700 Average Reward (last 100): -720.470
Episode 1800 Average Reward (last 100): -415.730
Episode 1900

[32m[I 2022-10-02 20:21:19,127][0m Trial 17 finished with value: -200.0 and parameters: {'gamma': 0.22559754408842642, 'epsilon': 0.08883968208202954}. Best is trial 16 with value: -168.71.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #18: eps=0.08138622541072824, gamma=0.503668321628398
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -218.000
Episode 300 Average Reward (last 100): -325.010
Episode 400 Average Reward (last 100): -432.830
Episode 500 Average Reward (last 100): -630.110
Episode 600 Average Reward (last 100): -863.300
Episode 700 Average Reward (last 100): -918.110
Episode 800 Average Reward (last 100): -882.740
Episode 900 Average Reward (last 100): -792.560
Episode 1000 Average Reward (last 100): -954.830
Episode 1100 Average Reward (last 100): -917.660
Episode 1200 Average Reward (last 100): -971.930
Episode 1300 Average Reward (last 100): -882.740
Episode 1400 Average Reward (last 100): -934.400
Episode 1500 Average Reward (last 100): -791.660
Episode 1600 Average Reward (last 100): -738.740
Episode 1700 Average Reward (last 100): -612.560
Episode 1800 Average Reward (last 100): -485.570
Episode 1900 Ave

[32m[I 2022-10-02 20:22:13,836][0m Trial 18 finished with value: -234.74 and parameters: {'gamma': 0.503668321628398, 'epsilon': 0.08138622541072824}. Best is trial 16 with value: -168.71.[0m


Episode 20000 Average Reward (last 100): -234.740

TRIAL #19: eps=0.18091385227838208, gamma=0.823214864869089
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -235.730
Episode 300 Average Reward (last 100): -289.460
Episode 400 Average Reward (last 100): -540.200
Episode 500 Average Reward (last 100): -647.930
Episode 600 Average Reward (last 100): -594.470
Episode 700 Average Reward (last 100): -863.750
Episode 800 Average Reward (last 100): -810.740
Episode 900 Average Reward (last 100): -756.920
Episode 1000 Average Reward (last 100): -972.830
Episode 1100 Average Reward (last 100): -1043.390
Episode 1200 Average Reward (last 100): -1008.110
Episode 1300 Average Reward (last 100): -1170.200
Episode 1400 Average Reward (last 100): -828.650
Episode 1500 Average Reward (last 100): -971.840
Episode 1600 Average Reward (last 100): -918.110
Episode 1700 Average Reward (last 100): -612.650
Episode 1800 Average Reward (last 100): -648.110
Episode 1900 

[32m[I 2022-10-02 20:23:08,847][0m Trial 19 finished with value: -197.9 and parameters: {'gamma': 0.823214864869089, 'epsilon': 0.18091385227838208}. Best is trial 16 with value: -168.71.[0m


Episode 20000 Average Reward (last 100): -197.900

TRIAL #20: eps=0.13697833152636996, gamma=0.5548420226907784
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -253.370
Episode 300 Average Reward (last 100): -396.830
Episode 400 Average Reward (last 100): -433.100
Episode 500 Average Reward (last 100): -701.390
Episode 600 Average Reward (last 100): -773.660
Episode 700 Average Reward (last 100): -667.100
Episode 800 Average Reward (last 100): -828.650
Episode 900 Average Reward (last 100): -918.110
Episode 1000 Average Reward (last 100): -1008.020
Episode 1100 Average Reward (last 100): -900.650
Episode 1200 Average Reward (last 100): -1043.750
Episode 1300 Average Reward (last 100): -881.300
Episode 1400 Average Reward (last 100): -918.020
Episode 1500 Average Reward (last 100): -828.380
Episode 1600 Average Reward (last 100): -594.920
Episode 1700 Average Reward (last 100): -738.110
Episode 1800 Average Reward (last 100): -684.200
Episode 1900 

[32m[I 2022-10-02 20:24:08,244][0m Trial 20 finished with value: -200.0 and parameters: {'gamma': 0.5548420226907784, 'epsilon': 0.13697833152636996}. Best is trial 16 with value: -168.71.[0m


Episode 20000 Average Reward (last 100): -200.000


### Execução

In [43]:

env = gym.make("Taxi-v3")

In [46]:
if __name__ == "__main__":
    r_max_plot = 10

    EPISODES = 100000
    LR = 0.01
    GAMMA = 0.830525147061507
    EPSILON = 0.05919712699520377

    
    # Roda o algoritmo Monte-Carlo para o problema de controle (ou seja, para achar a política ótima)
    rewards, Qtable = run_montecarloOffP(env, EPISODES, GAMMA, EPSILON, render=False)
    print("Últimos resultados: media =", np.mean(rewards[-20:]), ", desvio padrao =", np.std(rewards[-20:]))

    # Mostra um gráfico de episódios x retornos (não descontados)
    # Se quiser salvar, passe o nome do arquivo no 3o parâmetro
    filename = f"results/montecarloOffP-{ENV_NAME.lower()[0:8]}-ep{EPISODES}.png"
    plot_result(rewards, r_max_plot,100, 'offPolicyD')

    # test_greedy_Q_policy(env, Qtable, 10, True)
    env.close()

Episode 100 Average Reward (last 100): -217.820
Episode 200 Average Reward (last 100): -307.010
Episode 300 Average Reward (last 100): -342.110
Episode 400 Average Reward (last 100): -572.510
Episode 500 Average Reward (last 100): -571.250
Episode 600 Average Reward (last 100): -481.610
Episode 700 Average Reward (last 100): -603.560
Episode 800 Average Reward (last 100): -462.440
Episode 900 Average Reward (last 100): -533.900
Episode 1000 Average Reward (last 100): -484.130
Episode 1100 Average Reward (last 100): -396.020
Episode 1200 Average Reward (last 100): -395.750
Episode 1300 Average Reward (last 100): -393.230
Episode 1400 Average Reward (last 100): -640.190
Episode 1500 Average Reward (last 100): -482.960
Episode 1600 Average Reward (last 100): -445.790
Episode 1700 Average Reward (last 100): -411.410
Episode 1800 Average Reward (last 100): -252.830
Episode 1900 Average Reward (last 100): -235.910
Episode 2000 Average Reward (last 100): -341.930
Episode 2100 Average Reward (

### Execução On-Policy

#### Otimiza Parâmetros

In [21]:
from numpy.random.mtrand import gamma

import optuna



ENV = gym.make("Taxi-v3")


# Esta função faz um treinamento com o Expected-SARSA, usando parâmetros sugeridos pelo Optuna.
# Retorna a média dos retornos dos últimos 100 episódios.
def train_values(trial : optuna.Trial):
    
    # chama os métodos do "trial" (tentativa) para sugerir valores para os parâmetros
    lr = trial.suggest_uniform('learning_rate', 0.001, 1.0)
    gamma = trial.suggest_uniform('gamma', 0.02, 1.0)
    eps = trial.suggest_uniform('epsilon', 0.01, 0.2)
   
    
    print(f"\nTRIAL #{trial.number}: eps={eps}, gamma={gamma}")

    # roda o algoritmo e recebe os retornos não-descontados
    
    (returns, _) = run_montecarloOnP(ENV, 20000, lr, gamma, eps, render=False)
    return sum(returns[-100:])/100 

In [22]:
study = optuna.create_study(direction='maximize', 
                            storage='sqlite:///optuna_studies.db', 
                            study_name= 'new_MC_onpolice', 
                            load_if_exists=True)
study.optimize(train_values, n_trials=20) 

[32m[I 2022-10-02 20:28:52,242][0m A new study created in RDB with name: new_MC_onpolice[0m
  from ipykernel import kernelapp as app
  app.launch_new_instance()



TRIAL #0: eps=0.1609010081094748, gamma=0.4221866878016252
Episode 100 Average Reward (last 100): -438.800
Episode 200 Average Reward (last 100): -552.890
Episode 300 Average Reward (last 100): -332.100
Episode 400 Average Reward (last 100): -326.810
Episode 500 Average Reward (last 100): -322.940
Episode 600 Average Reward (last 100): -297.880
Episode 700 Average Reward (last 100): -304.430
Episode 800 Average Reward (last 100): -284.980
Episode 900 Average Reward (last 100): -305.570
Episode 1000 Average Reward (last 100): -296.570
Episode 1100 Average Reward (last 100): -283.950
Episode 1200 Average Reward (last 100): -282.900
Episode 1300 Average Reward (last 100): -297.010
Episode 1400 Average Reward (last 100): -279.750
Episode 1500 Average Reward (last 100): -282.970
Episode 1600 Average Reward (last 100): -296.480
Episode 1700 Average Reward (last 100): -290.780
Episode 1800 Average Reward (last 100): -282.830
Episode 1900 Average Reward (last 100): -286.990
Episode 2000 Avera

[32m[I 2022-10-02 20:29:53,774][0m Trial 0 finished with value: -280.63 and parameters: {'learning_rate': 0.9513257195608331, 'gamma': 0.4221866878016252, 'epsilon': 0.1609010081094748}. Best is trial 0 with value: -280.63.[0m


Episode 20000 Average Reward (last 100): -280.630

TRIAL #1: eps=0.09637271526233701, gamma=0.5743797421678166
Episode 100 Average Reward (last 100): -468.290
Episode 200 Average Reward (last 100): -634.070
Episode 300 Average Reward (last 100): -541.820
Episode 400 Average Reward (last 100): -330.410
Episode 500 Average Reward (last 100): -351.020
Episode 600 Average Reward (last 100): -260.030
Episode 700 Average Reward (last 100): -260.660
Episode 800 Average Reward (last 100): -261.830
Episode 900 Average Reward (last 100): -265.070
Episode 1000 Average Reward (last 100): -261.110
Episode 1100 Average Reward (last 100): -271.820
Episode 1200 Average Reward (last 100): -255.170
Episode 1300 Average Reward (last 100): -261.200
Episode 1400 Average Reward (last 100): -262.280
Episode 1500 Average Reward (last 100): -255.440
Episode 1600 Average Reward (last 100): -260.210
Episode 1700 Average Reward (last 100): -257.510
Episode 1800 Average Reward (last 100): -264.080
Episode 1900 Ave

[32m[I 2022-10-02 20:30:52,283][0m Trial 1 finished with value: -204.39 and parameters: {'learning_rate': 0.24544278326946914, 'gamma': 0.5743797421678166, 'epsilon': 0.09637271526233701}. Best is trial 1 with value: -204.39.[0m


Episode 20000 Average Reward (last 100): -204.390

TRIAL #2: eps=0.06088305356144375, gamma=0.47552343828885446
Episode 100 Average Reward (last 100): -415.820
Episode 200 Average Reward (last 100): -635.730
Episode 300 Average Reward (last 100): -578.360
Episode 400 Average Reward (last 100): -365.240
Episode 500 Average Reward (last 100): -283.830
Episode 600 Average Reward (last 100): -290.630
Episode 700 Average Reward (last 100): -258.140
Episode 800 Average Reward (last 100): -241.760
Episode 900 Average Reward (last 100): -237.080
Episode 1000 Average Reward (last 100): -257.510
Episode 1100 Average Reward (last 100): -252.830
Episode 1200 Average Reward (last 100): -259.670
Episode 1300 Average Reward (last 100): -244.370
Episode 1400 Average Reward (last 100): -234.470
Episode 1500 Average Reward (last 100): -238.790
Episode 1600 Average Reward (last 100): -259.850
Episode 1700 Average Reward (last 100): -242.930
Episode 1800 Average Reward (last 100): -231.770
Episode 1900 Av

[32m[I 2022-10-02 20:31:53,412][0m Trial 2 finished with value: -230.23 and parameters: {'learning_rate': 0.898794800018092, 'gamma': 0.47552343828885446, 'epsilon': 0.06088305356144375}. Best is trial 1 with value: -204.39.[0m


Episode 20000 Average Reward (last 100): -230.230

TRIAL #3: eps=0.191847293308802, gamma=0.3370714049856799
Episode 100 Average Reward (last 100): -478.460
Episode 200 Average Reward (last 100): -530.190
Episode 300 Average Reward (last 100): -376.620
Episode 400 Average Reward (last 100): -323.840
Episode 500 Average Reward (last 100): -335.180
Episode 600 Average Reward (last 100): -324.600
Episode 700 Average Reward (last 100): -328.000
Episode 800 Average Reward (last 100): -320.380
Episode 900 Average Reward (last 100): -316.720
Episode 1000 Average Reward (last 100): -313.440
Episode 1100 Average Reward (last 100): -298.850
Episode 1200 Average Reward (last 100): -313.070
Episode 1300 Average Reward (last 100): -295.830
Episode 1400 Average Reward (last 100): -300.390
Episode 1500 Average Reward (last 100): -303.450
Episode 1600 Average Reward (last 100): -306.280
Episode 1700 Average Reward (last 100): -303.350
Episode 1800 Average Reward (last 100): -289.610
Episode 1900 Avera

[32m[I 2022-10-02 20:32:55,277][0m Trial 3 finished with value: -298.74 and parameters: {'learning_rate': 0.6667861844904458, 'gamma': 0.3370714049856799, 'epsilon': 0.191847293308802}. Best is trial 1 with value: -204.39.[0m


Episode 20000 Average Reward (last 100): -298.740

TRIAL #4: eps=0.09334531198114047, gamma=0.5151976610936222
Episode 100 Average Reward (last 100): -365.950
Episode 200 Average Reward (last 100): -652.340
Episode 300 Average Reward (last 100): -394.990
Episode 400 Average Reward (last 100): -311.870
Episode 500 Average Reward (last 100): -284.600
Episode 600 Average Reward (last 100): -271.370
Episode 700 Average Reward (last 100): -273.710
Episode 800 Average Reward (last 100): -269.390
Episode 900 Average Reward (last 100): -283.700
Episode 1000 Average Reward (last 100): -268.120
Episode 1100 Average Reward (last 100): -254.630
Episode 1200 Average Reward (last 100): -251.960
Episode 1300 Average Reward (last 100): -250.490
Episode 1400 Average Reward (last 100): -236.140
Episode 1500 Average Reward (last 100): -232.310
Episode 1600 Average Reward (last 100): -250.540
Episode 1700 Average Reward (last 100): -269.120
Episode 1800 Average Reward (last 100): -247.960
Episode 1900 Ave

[32m[I 2022-10-02 20:33:55,407][0m Trial 4 finished with value: -252.98 and parameters: {'learning_rate': 0.6419422415888947, 'gamma': 0.5151976610936222, 'epsilon': 0.09334531198114047}. Best is trial 1 with value: -204.39.[0m


Episode 20000 Average Reward (last 100): -252.980

TRIAL #5: eps=0.1373661237654082, gamma=0.5848308543431441
Episode 100 Average Reward (last 100): -447.590
Episode 200 Average Reward (last 100): -778.930
Episode 300 Average Reward (last 100): -609.300
Episode 400 Average Reward (last 100): -637.490
Episode 500 Average Reward (last 100): -524.350
Episode 600 Average Reward (last 100): -385.250
Episode 700 Average Reward (last 100): -365.010
Episode 800 Average Reward (last 100): -315.710
Episode 900 Average Reward (last 100): -288.370
Episode 1000 Average Reward (last 100): -274.900
Episode 1100 Average Reward (last 100): -281.730
Episode 1200 Average Reward (last 100): -274.120
Episode 1300 Average Reward (last 100): -249.270
Episode 1400 Average Reward (last 100): -232.550
Episode 1500 Average Reward (last 100): -219.850
Episode 1600 Average Reward (last 100): -224.250
Episode 1700 Average Reward (last 100): -233.890
Episode 1800 Average Reward (last 100): -222.760
Episode 1900 Aver

[32m[I 2022-10-02 20:34:31,426][0m Trial 5 finished with value: -128.69 and parameters: {'learning_rate': 0.02736028937691178, 'gamma': 0.5848308543431441, 'epsilon': 0.1373661237654082}. Best is trial 5 with value: -128.69.[0m


Episode 20000 Average Reward (last 100): -128.690

TRIAL #6: eps=0.1397186840472309, gamma=0.8931777169859648
Episode 100 Average Reward (last 100): -444.800
Episode 200 Average Reward (last 100): -649.460
Episode 300 Average Reward (last 100): -477.830
Episode 400 Average Reward (last 100): -388.470
Episode 500 Average Reward (last 100): -327.880
Episode 600 Average Reward (last 100): -300.970
Episode 700 Average Reward (last 100): -288.090
Episode 800 Average Reward (last 100): -263.140
Episode 900 Average Reward (last 100): -310.340
Episode 1000 Average Reward (last 100): -283.390
Episode 1100 Average Reward (last 100): -279.930
Episode 1200 Average Reward (last 100): -281.590
Episode 1300 Average Reward (last 100): -272.470
Episode 1400 Average Reward (last 100): -277.780
Episode 1500 Average Reward (last 100): -263.180
Episode 1600 Average Reward (last 100): -268.630
Episode 1700 Average Reward (last 100): -257.060
Episode 1800 Average Reward (last 100): -270.110
Episode 1900 Aver

[32m[I 2022-10-02 20:35:25,727][0m Trial 6 finished with value: -240.21 and parameters: {'learning_rate': 0.4930824795281853, 'gamma': 0.8931777169859648, 'epsilon': 0.1397186840472309}. Best is trial 5 with value: -128.69.[0m


Episode 20000 Average Reward (last 100): -240.210

TRIAL #7: eps=0.0289892882598783, gamma=0.33911276583863076
Episode 100 Average Reward (last 100): -383.690
Episode 200 Average Reward (last 100): -622.010
Episode 300 Average Reward (last 100): -910.820
Episode 400 Average Reward (last 100): -633.080
Episode 500 Average Reward (last 100): -486.020
Episode 600 Average Reward (last 100): -300.980
Episode 700 Average Reward (last 100): -254.450
Episode 800 Average Reward (last 100): -274.950
Episode 900 Average Reward (last 100): -249.950
Episode 1000 Average Reward (last 100): -223.940
Episode 1100 Average Reward (last 100): -218.000
Episode 1200 Average Reward (last 100): -219.350
Episode 1300 Average Reward (last 100): -231.500
Episode 1400 Average Reward (last 100): -216.200
Episode 1500 Average Reward (last 100): -228.620
Episode 1600 Average Reward (last 100): -230.780
Episode 1700 Average Reward (last 100): -216.020
Episode 1800 Average Reward (last 100): -237.530
Episode 1900 Ave

[32m[I 2022-10-02 20:36:27,085][0m Trial 7 finished with value: -216.11 and parameters: {'learning_rate': 0.7690514432109695, 'gamma': 0.33911276583863076, 'epsilon': 0.0289892882598783}. Best is trial 5 with value: -128.69.[0m


Episode 20000 Average Reward (last 100): -216.110

TRIAL #8: eps=0.06440645377393996, gamma=0.7149448987940256
Episode 100 Average Reward (last 100): -363.710
Episode 200 Average Reward (last 100): -766.450
Episode 300 Average Reward (last 100): -811.190
Episode 400 Average Reward (last 100): -566.930
Episode 500 Average Reward (last 100): -437.780
Episode 600 Average Reward (last 100): -300.800
Episode 700 Average Reward (last 100): -296.840
Episode 800 Average Reward (last 100): -296.210
Episode 900 Average Reward (last 100): -253.550
Episode 1000 Average Reward (last 100): -240.590
Episode 1100 Average Reward (last 100): -266.510
Episode 1200 Average Reward (last 100): -238.250
Episode 1300 Average Reward (last 100): -239.960
Episode 1400 Average Reward (last 100): -243.020
Episode 1500 Average Reward (last 100): -247.070
Episode 1600 Average Reward (last 100): -241.490
Episode 1700 Average Reward (last 100): -238.610
Episode 1800 Average Reward (last 100): -240.860
Episode 1900 Ave

[32m[I 2022-10-02 20:37:24,490][0m Trial 8 finished with value: -145.92 and parameters: {'learning_rate': 0.16802333506762002, 'gamma': 0.7149448987940256, 'epsilon': 0.06440645377393996}. Best is trial 5 with value: -128.69.[0m



TRIAL #9: eps=0.05676126177200968, gamma=0.5956289485701647
Episode 100 Average Reward (last 100): -319.340
Episode 200 Average Reward (last 100): -645.390
Episode 300 Average Reward (last 100): -735.860
Episode 400 Average Reward (last 100): -501.500
Episode 500 Average Reward (last 100): -298.640
Episode 600 Average Reward (last 100): -270.380
Episode 700 Average Reward (last 100): -245.810
Episode 800 Average Reward (last 100): -232.490
Episode 900 Average Reward (last 100): -249.320
Episode 1000 Average Reward (last 100): -242.660
Episode 1100 Average Reward (last 100): -249.320
Episode 1200 Average Reward (last 100): -238.700
Episode 1300 Average Reward (last 100): -254.540
Episode 1400 Average Reward (last 100): -251.840
Episode 1500 Average Reward (last 100): -237.530
Episode 1600 Average Reward (last 100): -231.500
Episode 1700 Average Reward (last 100): -238.970
Episode 1800 Average Reward (last 100): -227.990
Episode 1900 Average Reward (last 100): -235.010
Episode 2000 Aver

[32m[I 2022-10-02 20:38:25,693][0m Trial 9 finished with value: -223.51 and parameters: {'learning_rate': 0.9744747051461735, 'gamma': 0.5956289485701647, 'epsilon': 0.05676126177200968}. Best is trial 5 with value: -128.69.[0m


Episode 20000 Average Reward (last 100): -223.510

TRIAL #10: eps=0.1360495516067039, gamma=0.07512905630841732
Episode 100 Average Reward (last 100): -434.540
Episode 200 Average Reward (last 100): -517.690
Episode 300 Average Reward (last 100): -489.640
Episode 400 Average Reward (last 100): -468.560
Episode 500 Average Reward (last 100): -401.320
Episode 600 Average Reward (last 100): -443.450
Episode 700 Average Reward (last 100): -394.780
Episode 800 Average Reward (last 100): -390.100
Episode 900 Average Reward (last 100): -329.580
Episode 1000 Average Reward (last 100): -328.510
Episode 1100 Average Reward (last 100): -303.340
Episode 1200 Average Reward (last 100): -313.760
Episode 1300 Average Reward (last 100): -285.220
Episode 1400 Average Reward (last 100): -293.530
Episode 1500 Average Reward (last 100): -296.230
Episode 1600 Average Reward (last 100): -260.680
Episode 1700 Average Reward (last 100): -252.050
Episode 1800 Average Reward (last 100): -294.700
Episode 1900 Av

[32m[I 2022-10-02 20:39:23,881][0m Trial 10 finished with value: -256.36 and parameters: {'learning_rate': 0.004088997703464603, 'gamma': 0.07512905630841732, 'epsilon': 0.1360495516067039}. Best is trial 5 with value: -128.69.[0m


Episode 20000 Average Reward (last 100): -256.360

TRIAL #11: eps=0.11964104486402141, gamma=0.8221643358580425
Episode 100 Average Reward (last 100): -457.670
Episode 200 Average Reward (last 100): -834.050
Episode 300 Average Reward (last 100): -915.860
Episode 400 Average Reward (last 100): -791.480
Episode 500 Average Reward (last 100): -549.510
Episode 600 Average Reward (last 100): -390.620
Episode 700 Average Reward (last 100): -397.640
Episode 800 Average Reward (last 100): -332.300
Episode 900 Average Reward (last 100): -318.800
Episode 1000 Average Reward (last 100): -289.190
Episode 1100 Average Reward (last 100): -289.370
Episode 1200 Average Reward (last 100): -276.590
Episode 1300 Average Reward (last 100): -276.390
Episode 1400 Average Reward (last 100): -268.520
Episode 1500 Average Reward (last 100): -272.720
Episode 1600 Average Reward (last 100): -270.920
Episode 1700 Average Reward (last 100): -270.740
Episode 1800 Average Reward (last 100): -274.070
Episode 1900 Av

[32m[I 2022-10-02 20:39:47,067][0m Trial 11 finished with value: -1.33 and parameters: {'learning_rate': 0.0365119453422717, 'gamma': 0.8221643358580425, 'epsilon': 0.11964104486402141}. Best is trial 11 with value: -1.33.[0m


Episode 20000 Average Reward (last 100): -1.330

TRIAL #12: eps=0.13782991120842766, gamma=0.9862929977115259
Episode 100 Average Reward (last 100): -584.840
Episode 200 Average Reward (last 100): -1028.190
Episode 300 Average Reward (last 100): -880.510
Episode 400 Average Reward (last 100): -777.800
Episode 500 Average Reward (last 100): -522.560
Episode 600 Average Reward (last 100): -408.530
Episode 700 Average Reward (last 100): -383.060
Episode 800 Average Reward (last 100): -362.000
Episode 900 Average Reward (last 100): -305.750
Episode 1000 Average Reward (last 100): -323.480
Episode 1100 Average Reward (last 100): -308.000
Episode 1200 Average Reward (last 100): -314.840
Episode 1300 Average Reward (last 100): -320.560
Episode 1400 Average Reward (last 100): -372.890
Episode 1500 Average Reward (last 100): -378.980
Episode 1600 Average Reward (last 100): -330.860
Episode 1700 Average Reward (last 100): -356.830
Episode 1800 Average Reward (last 100): -320.460
Episode 1900 Ave

[32m[I 2022-10-02 20:40:14,254][0m Trial 12 finished with value: -4.4 and parameters: {'learning_rate': 0.057949408385926174, 'gamma': 0.9862929977115259, 'epsilon': 0.13782991120842766}. Best is trial 11 with value: -1.33.[0m


Episode 19700 Average Reward (last 100): -5.150
Episode 19800 Average Reward (last 100): -5.880
Episode 19900 Average Reward (last 100): -7.680
Episode 20000 Average Reward (last 100): -4.400

TRIAL #13: eps=0.11453588909733023, gamma=0.9914509744479565
Episode 100 Average Reward (last 100): -552.530
Episode 200 Average Reward (last 100): -930.170
Episode 300 Average Reward (last 100): -950.760
Episode 400 Average Reward (last 100): -752.510
Episode 500 Average Reward (last 100): -632.910
Episode 600 Average Reward (last 100): -666.110
Episode 700 Average Reward (last 100): -617.690
Episode 800 Average Reward (last 100): -642.640
Episode 900 Average Reward (last 100): -701.660
Episode 1000 Average Reward (last 100): -601.670
Episode 1100 Average Reward (last 100): -679.520
Episode 1200 Average Reward (last 100): -635.430
Episode 1300 Average Reward (last 100): -568.820
Episode 1400 Average Reward (last 100): -689.290
Episode 1500 Average Reward (last 100): -592.040
Episode 1600 Average

[32m[I 2022-10-02 20:41:02,036][0m Trial 13 finished with value: -496.62 and parameters: {'learning_rate': 0.3487500874198077, 'gamma': 0.9914509744479565, 'epsilon': 0.11453588909733023}. Best is trial 11 with value: -1.33.[0m


Episode 20000 Average Reward (last 100): -496.620

TRIAL #14: eps=0.178303554210002, gamma=0.806235923635559
Episode 100 Average Reward (last 100): -432.110
Episode 200 Average Reward (last 100): -675.440
Episode 300 Average Reward (last 100): -524.460
Episode 400 Average Reward (last 100): -378.570
Episode 500 Average Reward (last 100): -337.440
Episode 600 Average Reward (last 100): -286.100
Episode 700 Average Reward (last 100): -308.830
Episode 800 Average Reward (last 100): -283.780
Episode 900 Average Reward (last 100): -296.120
Episode 1000 Average Reward (last 100): -295.250
Episode 1100 Average Reward (last 100): -279.900
Episode 1200 Average Reward (last 100): -280.960
Episode 1300 Average Reward (last 100): -278.000
Episode 1400 Average Reward (last 100): -286.590
Episode 1500 Average Reward (last 100): -290.020
Episode 1600 Average Reward (last 100): -283.780
Episode 1700 Average Reward (last 100): -264.780
Episode 1800 Average Reward (last 100): -252.720
Episode 1900 Avera

[32m[I 2022-10-02 20:41:38,035][0m Trial 14 finished with value: -141.57 and parameters: {'learning_rate': 0.1814071696617999, 'gamma': 0.806235923635559, 'epsilon': 0.178303554210002}. Best is trial 11 with value: -1.33.[0m


Episode 20000 Average Reward (last 100): -141.570

TRIAL #15: eps=0.11809325593858834, gamma=0.9836935146890794
Episode 100 Average Reward (last 100): -500.360
Episode 200 Average Reward (last 100): -909.300
Episode 300 Average Reward (last 100): -812.980
Episode 400 Average Reward (last 100): -748.550
Episode 500 Average Reward (last 100): -631.430
Episode 600 Average Reward (last 100): -556.400
Episode 700 Average Reward (last 100): -698.690
Episode 800 Average Reward (last 100): -584.300
Episode 900 Average Reward (last 100): -532.910
Episode 1000 Average Reward (last 100): -707.240
Episode 1100 Average Reward (last 100): -633.620
Episode 1200 Average Reward (last 100): -582.280
Episode 1300 Average Reward (last 100): -575.840
Episode 1400 Average Reward (last 100): -574.940
Episode 1500 Average Reward (last 100): -538.670
Episode 1600 Average Reward (last 100): -599.380
Episode 1700 Average Reward (last 100): -512.750
Episode 1800 Average Reward (last 100): -533.650
Episode 1900 Av

[32m[I 2022-10-02 20:42:28,282][0m Trial 15 finished with value: -355.97 and parameters: {'learning_rate': 0.3915190380212933, 'gamma': 0.9836935146890794, 'epsilon': 0.11809325593858834}. Best is trial 11 with value: -1.33.[0m



TRIAL #16: eps=0.17120312699897092, gamma=0.7581367161554913
Episode 100 Average Reward (last 100): -475.280
Episode 200 Average Reward (last 100): -767.130
Episode 300 Average Reward (last 100): -643.290
Episode 400 Average Reward (last 100): -477.050
Episode 500 Average Reward (last 100): -361.190
Episode 600 Average Reward (last 100): -316.960
Episode 700 Average Reward (last 100): -325.880
Episode 800 Average Reward (last 100): -327.680
Episode 900 Average Reward (last 100): -320.310
Episode 1000 Average Reward (last 100): -296.450
Episode 1100 Average Reward (last 100): -293.690
Episode 1200 Average Reward (last 100): -299.010
Episode 1300 Average Reward (last 100): -268.970
Episode 1400 Average Reward (last 100): -273.410
Episode 1500 Average Reward (last 100): -282.000
Episode 1600 Average Reward (last 100): -257.420
Episode 1700 Average Reward (last 100): -274.320
Episode 1800 Average Reward (last 100): -259.880
Episode 1900 Average Reward (last 100): -270.340
Episode 2000 Ave

[32m[I 2022-10-02 20:43:00,973][0m Trial 16 finished with value: -104.28 and parameters: {'learning_rate': 0.08696755029089814, 'gamma': 0.7581367161554913, 'epsilon': 0.17120312699897092}. Best is trial 11 with value: -1.33.[0m


Episode 20000 Average Reward (last 100): -104.280

TRIAL #17: eps=0.08355501033039156, gamma=0.8895446110709906
Episode 100 Average Reward (last 100): -386.960
Episode 200 Average Reward (last 100): -812.360
Episode 300 Average Reward (last 100): -772.510
Episode 400 Average Reward (last 100): -515.740
Episode 500 Average Reward (last 100): -320.080
Episode 600 Average Reward (last 100): -306.270
Episode 700 Average Reward (last 100): -271.010
Episode 800 Average Reward (last 100): -239.530
Episode 900 Average Reward (last 100): -273.310
Episode 1000 Average Reward (last 100): -241.260
Episode 1100 Average Reward (last 100): -253.790
Episode 1200 Average Reward (last 100): -259.230
Episode 1300 Average Reward (last 100): -265.890
Episode 1400 Average Reward (last 100): -259.060
Episode 1500 Average Reward (last 100): -271.560
Episode 1600 Average Reward (last 100): -245.510
Episode 1700 Average Reward (last 100): -281.360
Episode 1800 Average Reward (last 100): -275.700
Episode 1900 Av

[32m[I 2022-10-02 20:43:52,576][0m Trial 17 finished with value: -103.79 and parameters: {'learning_rate': 0.3008824605141798, 'gamma': 0.8895446110709906, 'epsilon': 0.08355501033039156}. Best is trial 11 with value: -1.33.[0m


Episode 20000 Average Reward (last 100): -103.790

TRIAL #18: eps=0.15253182902235088, gamma=0.7120895256250555
Episode 100 Average Reward (last 100): -479.990
Episode 200 Average Reward (last 100): -652.520
Episode 300 Average Reward (last 100): -551.540
Episode 400 Average Reward (last 100): -420.410
Episode 500 Average Reward (last 100): -331.220
Episode 600 Average Reward (last 100): -330.470
Episode 700 Average Reward (last 100): -316.390
Episode 800 Average Reward (last 100): -297.770
Episode 900 Average Reward (last 100): -298.900
Episode 1000 Average Reward (last 100): -269.730
Episode 1100 Average Reward (last 100): -285.090
Episode 1200 Average Reward (last 100): -290.820
Episode 1300 Average Reward (last 100): -276.710
Episode 1400 Average Reward (last 100): -269.920
Episode 1500 Average Reward (last 100): -289.070
Episode 1600 Average Reward (last 100): -264.370
Episode 1700 Average Reward (last 100): -262.160
Episode 1800 Average Reward (last 100): -268.760
Episode 1900 Av

[32m[I 2022-10-02 20:44:38,392][0m Trial 18 finished with value: -170.79 and parameters: {'learning_rate': 0.1288231270602922, 'gamma': 0.7120895256250555, 'epsilon': 0.15253182902235088}. Best is trial 11 with value: -1.33.[0m



TRIAL #19: eps=0.12209385340171718, gamma=0.8641507048437724
Episode 100 Average Reward (last 100): -437.420
Episode 200 Average Reward (last 100): -567.470
Episode 300 Average Reward (last 100): -461.540
Episode 400 Average Reward (last 100): -363.800
Episode 500 Average Reward (last 100): -343.790
Episode 600 Average Reward (last 100): -289.820
Episode 700 Average Reward (last 100): -297.740
Episode 800 Average Reward (last 100): -281.630
Episode 900 Average Reward (last 100): -273.980
Episode 1000 Average Reward (last 100): -275.150
Episode 1100 Average Reward (last 100): -282.260
Episode 1200 Average Reward (last 100): -279.470
Episode 1300 Average Reward (last 100): -279.200
Episode 1400 Average Reward (last 100): -273.260
Episode 1500 Average Reward (last 100): -275.330
Episode 1600 Average Reward (last 100): -278.430
Episode 1700 Average Reward (last 100): -272.260
Episode 1800 Average Reward (last 100): -265.430
Episode 1900 Average Reward (last 100): -269.840
Episode 2000 Ave

[32m[I 2022-10-02 20:45:33,410][0m Trial 19 finished with value: -239.22 and parameters: {'learning_rate': 0.5027795200258759, 'gamma': 0.8641507048437724, 'epsilon': 0.12209385340171718}. Best is trial 11 with value: -1.33.[0m


Episode 20000 Average Reward (last 100): -239.220


#### Execução

In [47]:
ENV_NAME = "Taxi-v3"  
#ENV_NAME = "MountainCarContinuous-v0"  
#ENV_NAME = "LunarLander-v2"  
env = gym.make(ENV_NAME)

#parameters: {'learning_rate': 0.007617314337157292, 'gamma': 0.8762195844017038, 'epsilon': 0.1594319424288697}. Best is trial 16 with value: -2.2.
if __name__ == "__main__":
    r_max_plot = 10

    EPISODES = 100000
    LR = 0.007617314337157292
    GAMMA = 0.8762195844017038
    EPSILON = 0.1594319424288697

    
    # Roda o algoritmo Monte-Carlo para o problema de controle (ou seja, para achar a política ótima)
    rewards, Qtable = run_montecarloOnP(env, EPISODES, LR, GAMMA, EPSILON, render=False)
    print("Últimos resultados: media =", np.mean(rewards[-20:]), ", desvio padrao =", np.std(rewards[-20:]))

    # Mostra um gráfico de episódios x retornos (não descontados)
    # Se quiser salvar, passe o nome do arquivo no 3o parâmetro
    filename = f"results/montecarloOnP-{ENV_NAME.lower()[0:8]}-ep{EPISODES}.png"
    plot_result(rewards, r_max_plot,100, 'onPolicyD')

    # test_greedy_Q_policy(env, Qtable, 10, True)
    env.close()

Episode 100 Average Reward (last 100): -490.880
Episode 200 Average Reward (last 100): -933.430
Episode 300 Average Reward (last 100): -989.570
Episode 400 Average Reward (last 100): -806.360
Episode 500 Average Reward (last 100): -558.750
Episode 600 Average Reward (last 100): -451.030
Episode 700 Average Reward (last 100): -470.900
Episode 800 Average Reward (last 100): -391.900
Episode 900 Average Reward (last 100): -437.080
Episode 1000 Average Reward (last 100): -417.480
Episode 1100 Average Reward (last 100): -308.200
Episode 1200 Average Reward (last 100): -316.640
Episode 1300 Average Reward (last 100): -321.960
Episode 1400 Average Reward (last 100): -262.400
Episode 1500 Average Reward (last 100): -311.510
Episode 1600 Average Reward (last 100): -251.250
Episode 1700 Average Reward (last 100): -226.730
Episode 1800 Average Reward (last 100): -220.300
Episode 1900 Average Reward (last 100): -220.680
Episode 2000 Average Reward (last 100): -210.440
Episode 2100 Average Reward (

## Contínuo

In [51]:

ENV_NAME = "MountainCar-v0"  

env = gym.make(ENV_NAME)

In [25]:

class GeneralDiscretizer:
    def __init__(self, env, bins_per_dimension):
        self.bins_per_dim = bins_per_dimension.copy()
        self.intervals_per_dim = []
        self.total_bins = 1
        for i, bins in enumerate(bins_per_dimension):
            self.intervals_per_dim.append(
                np.linspace(env.observation_space.low[i], env.observation_space.high[i], bins+1) )
            self.total_bins *= bins

    def to_single_bin(self, state):
        bin_vector = [(np.digitize(x=state[i], bins=intervals) - 1)
                      for i, intervals in enumerate(self.intervals_per_dim)]
        # print(bin_vector)
        return self._bin_vector_to_single_bin(bin_vector, len(bin_vector)-1)

    def _bin_vector_to_single_bin(self, vector, index):
        if index < 0:
            return 0
        return vector[index] + self.bins_per_dim[index] * self._bin_vector_to_single_bin(vector, index-1)

    def get_total_bins(self):
        return self.total_bins


class DiscreteObservationWrapper(gym.ObservationWrapper):
    '''Classe para converter espaços contínuos em espaços discretos.
    Esta classe converte ambientes de observações (estados) contínuos em ambientes de estados
    discretos. Especificamente, ele converte representações dadas na forma de array de valores float
    em um único inteiro $\geq$ não-negativo (>=0).
    
    Precisa passar para o construtor uma lista que informa em quantos "bins" vai ser discretizada 
    cada dimensão (ou seja, cada valor float) do espaço de estados original.
    '''
    
    def __init__(self, env, BINS_PER_DIMENSION):
        super().__init__(env)
        # cria um GeneralDiscretizer para converter um array de valores float em um único inteiro >= 0
        # precisa dizer em quantos "bins" vai ser discretizada cada dimensão
        self.discretizer = GeneralDiscretizer(env, BINS_PER_DIMENSION)
        self.observation_space = gym.spaces.Discrete(self.discretizer.get_total_bins())

    def observation(self, obs):
        return self.discretizer.to_single_bin(obs)


In [54]:
from numpy.random.mtrand import gamma

import optuna



# Esta função faz um treinamento com o Expected-SARSA, usando parâmetros sugeridos pelo Optuna.
# Retorna a média dos retornos dos últimos 100 episódios.
def train_valuesOf(trial : optuna.Trial):
    
    # chama os métodos do "trial" (tentativa) para sugerir valores para os parâmetros
    gamma = trial.suggest_uniform('gamma', 0.02, 1.0)
    eps = trial.suggest_uniform('epsilon', 0.01, 0.2)
    bins1 = trial.suggest_int('bins1', 5.0, 80.0)
    bins2 = trial.suggest_int('bins2', 10.0, 90.0)
    
    
    print(f"\nTRIAL #{trial.number}: eps={eps}, gamma={gamma}, bins1={bins1},bins2={bins2}")

    # roda o algoritmo e recebe os retornos não-descontados
    env_wrapper = DiscreteObservationWrapper(env, [bins1,bins2])
    (returns, _) = run_montecarloOffP(env_wrapper, 20000, gamma,eps, render=False)
    return sum(returns[-100:])/100

In [55]:
def train_valuesOn(trial : optuna.Trial):
    
    # chama os métodos do "trial" (tentativa) para sugerir valores para os parâmetros
    gamma = trial.suggest_uniform('gamma', 0.02, 1.0)
    eps = trial.suggest_uniform('epsilon', 0.01, 0.2)
    bins1 = trial.suggest_int('bins1', 5.0, 80.0)
    bins2 = trial.suggest_int('bins2', 10.0, 90.0)
   
    
    print(f"\nTRIAL #{trial.number}: eps={eps}, gamma={gamma}, bins1={bins1},bins2={bins2}")

    # roda o algoritmo e recebe os retornos não-descontados
    env_wrapper = DiscreteObservationWrapper(env, [bins1,bins2])
    (returns, _) = run_montecarloOffP(env_wrapper, 20000, gamma, eps, render=False)
    return sum(returns[-100:])/100

In [52]:
study = optuna.create_study(direction='maximize', 
                            storage='sqlite:///optuna_studies.db', 
                            study_name= 'new_MC_offpolice_cont_bins_on-Policy', 
                            load_if_exists=True)


study2 = optuna.create_study(direction='maximize', 
                            storage='sqlite:///optuna_studies.db', 
                            study_name= 'new_MC_offpolice_cont_bins_off-policy', 
                            load_if_exists=True)


study.optimize(train_valuesOn, n_trials=30) 


[32m[I 2022-10-03 00:09:40,314][0m Using an existing study with name 'new_MC_offpolice_cont_bins_on-Policy' instead of creating a new one.[0m
[32m[I 2022-10-03 00:09:40,371][0m Using an existing study with name 'new_MC_offpolice_cont_bins_off-policy' instead of creating a new one.[0m
  after removing the cwd from sys.path.
  """



TRIAL #24: eps=0.01243910657028749, gamma=0.17709806307166054, bins1=25,bins2=34
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200.000
Episode 1900 Average Reward (last 100): -200.

[32m[I 2022-10-03 00:14:30,119][0m Trial 24 finished with value: -200.0 and parameters: {'gamma': 0.17709806307166054, 'epsilon': 0.01243910657028749, 'bins1': 25, 'bins2': 34}. Best is trial 7 with value: -199.26.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #25: eps=0.134119152413946, gamma=0.6469940384249309, bins1=65,bins2=72
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200.00

[32m[I 2022-10-03 00:19:20,000][0m Trial 25 finished with value: -200.0 and parameters: {'gamma': 0.6469940384249309, 'epsilon': 0.134119152413946, 'bins1': 65, 'bins2': 72}. Best is trial 7 with value: -199.26.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #26: eps=0.03948561946817283, gamma=0.3740441736231128, bins1=22,bins2=34
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200.

[32m[I 2022-10-03 00:24:10,336][0m Trial 26 finished with value: -200.0 and parameters: {'gamma': 0.3740441736231128, 'epsilon': 0.03948561946817283, 'bins1': 22, 'bins2': 34}. Best is trial 7 with value: -199.26.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #27: eps=0.025547977846573755, gamma=0.1861666364956458, bins1=24,bins2=33
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200

[32m[I 2022-10-03 00:28:58,476][0m Trial 27 finished with value: -200.0 and parameters: {'gamma': 0.1861666364956458, 'epsilon': 0.025547977846573755, 'bins1': 24, 'bins2': 33}. Best is trial 7 with value: -199.26.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #28: eps=0.1307092287591637, gamma=0.7002560993049122, bins1=69,bins2=75
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200.0

[32m[I 2022-10-03 00:33:52,385][0m Trial 28 finished with value: -200.0 and parameters: {'gamma': 0.7002560993049122, 'epsilon': 0.1307092287591637, 'bins1': 69, 'bins2': 75}. Best is trial 7 with value: -199.26.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #29: eps=0.04464299441402467, gamma=0.34596849754486786, bins1=19,bins2=32
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200

[32m[I 2022-10-03 00:38:41,486][0m Trial 29 finished with value: -200.0 and parameters: {'gamma': 0.34596849754486786, 'epsilon': 0.04464299441402467, 'bins1': 19, 'bins2': 32}. Best is trial 7 with value: -199.26.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #30: eps=0.024423553364560625, gamma=0.1296318650254198, bins1=32,bins2=49
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200

[32m[I 2022-10-03 00:43:31,382][0m Trial 30 finished with value: -200.0 and parameters: {'gamma': 0.1296318650254198, 'epsilon': 0.024423553364560625, 'bins1': 32, 'bins2': 49}. Best is trial 7 with value: -199.26.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #31: eps=0.11762582582054458, gamma=0.7401606286756673, bins1=77,bins2=76
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200.

[32m[I 2022-10-03 00:48:20,207][0m Trial 31 finished with value: -200.0 and parameters: {'gamma': 0.7401606286756673, 'epsilon': 0.11762582582054458, 'bins1': 77, 'bins2': 76}. Best is trial 7 with value: -199.26.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #32: eps=0.057407355947329625, gamma=0.2802213685906601, bins1=17,bins2=25
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200

[32m[I 2022-10-03 00:53:10,135][0m Trial 32 finished with value: -200.0 and parameters: {'gamma': 0.2802213685906601, 'epsilon': 0.057407355947329625, 'bins1': 17, 'bins2': 25}. Best is trial 7 with value: -199.26.[0m


Episode 20000 Average Reward (last 100): -200.000

TRIAL #33: eps=0.0840884898242836, gamma=0.0928297260596422, bins1=36,bins2=54
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200.0

[33m[W 2022-10-03 00:56:05,733][0m Trial 33 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-49-dfb51fb42675>", line 14, in train_valuesOn
    (returns, _) = run_montecarloOffP(env_wrapper, 20000, gamma, eps, render=False)
  File "<ipython-input-41-2304de231cae>", line 41, in run_montecarloOffP
    next_state, reward, done, _ = env.step(action)
  File "/usr/local/lib/python3.7/dist-packages/gym/core.py", line 324, in step
    return self.observation(observation), reward, done, info
  File "<ipython-input-25-24e6f88ce49e>", line 44, in observation
    return self.discretizer.to_single_bin(obs)
  File "<ipython-input-25-24e6f88ce49e>", line 13, in to_single_bin
    for i, intervals in enumerate(self.intervals_per_dim)]
  File "<ipython-input-25-24e6f88ce49e>", line 13, in <listcomp>
 

KeyboardInterrupt: ignored

In [None]:
study2.optimize(train_valuesOf, n_trials=30)

In [61]:
if __name__ == "__main__":
    r_max_plot = 10

    EPISODES = 100000
    GAMMA = 0.830525147061507
    EPSILON = 0.05919712699520377

    print("Ambiente Contínuo")
    # Roda o algoritmo Monte-Carlo para o problema de controle (ou seja, para achar a política ótima)
    print("Monte-Carlo On-Policy")
    env_wrapper = DiscreteObservationWrapper(env, [70,70])
    rewardsOn, QtableOn = run_montecarloOnP(env_wrapper, EPISODES, GAMMA, EPSILON, render=False)
    print("Últimos resultados: media =", np.mean(rewards[-20:]), ", desvio padrao =", np.std(rewards[-20:]))
    filename = f"results/montecarloOnPolicyCont-{ENV_NAME.lower()[0:8]}-ep{EPISODES}.png"
    plot_result(rewards, r_max_plot,100, 'onPolicyC')

    #test_greedy_Q_policy(env, QtableOn, 10, True)


    

Ambiente Contínuo
Monte-Carlo On-Policy
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200.000
Episode 1900 Average Reward (last 100): -200.000
Episode 2000 Average Reward (last 100)

In [59]:
    env_wrapper = DiscreteObservationWrapper(env, [70,70])
    print("Monte-Carlo Off-Policy")
    env_wrapper = DiscreteObservationWrapper(env, [70,70])
    rewardsOff, QtableOff = run_montecarloOffP(env_wrapper, EPISODES, GAMMA, EPSILON, render=False)
    print("Últimos resultados: media =", np.mean(rewards[-20:]), ", desvio padrao =", np.std(rewards[-20:]))

    # Mostra um gráfico de episódios x retornos (não descontados)
    # Se quiser salvar, passe o nome do arquivo no 3o parâmetro
    filename = f"results/montecarloOffPolicyCont-{ENV_NAME.lower()[0:8]}-ep{EPISODES}.png"
    plot_result(rewards, r_max_plot,100, 'offPolicyC')

    #test_greedy_Q_policy(env, QtableOff, 10, True)
    env.close()

Monte-Carlo Off-Policy
Episode 100 Average Reward (last 100): -200.000
Episode 200 Average Reward (last 100): -200.000
Episode 300 Average Reward (last 100): -200.000
Episode 400 Average Reward (last 100): -200.000
Episode 500 Average Reward (last 100): -200.000
Episode 600 Average Reward (last 100): -200.000
Episode 700 Average Reward (last 100): -200.000
Episode 800 Average Reward (last 100): -200.000
Episode 900 Average Reward (last 100): -200.000
Episode 1000 Average Reward (last 100): -200.000
Episode 1100 Average Reward (last 100): -200.000
Episode 1200 Average Reward (last 100): -200.000
Episode 1300 Average Reward (last 100): -200.000
Episode 1400 Average Reward (last 100): -200.000
Episode 1500 Average Reward (last 100): -200.000
Episode 1600 Average Reward (last 100): -200.000
Episode 1700 Average Reward (last 100): -200.000
Episode 1800 Average Reward (last 100): -200.000
Episode 1900 Average Reward (last 100): -200.000
Episode 2000 Average Reward (last 100): -200.000
Episod

Continuo Ambiente 2 ()

In [101]:
!pip install gym[box2d]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [112]:
# ENV_NAME = "CarRacing-v1"  

# env = gym.make(ENV_NAME)
env = gym.make("Pendulum-v1")


In [113]:
def train_valuesOnCar(trial : optuna.Trial):
    
    # chama os métodos do "trial" (tentativa) para sugerir valores para os parâmetros
    gamma = trial.suggest_uniform('gamma', 0.02, 1.0)
    eps = trial.suggest_uniform('epsilon', 0.01, 0.2)
    # bins1 = trial.suggest_int('bins1', -1, 1)
    bins2 = trial.suggest_int('bins2', 5.0, 30.0)
    bins3 = trial.suggest_int('bins3', 5.0, 10.0)
   
    
    print(f"\nTRIAL #{trial.number}: eps={eps}, gamma={gamma},bins2={bins2},bins3={bins3}")

    # roda o algoritmo e recebe os retornos não-descontados
    env_wrapper = DiscreteObservationWrapper(env, [bins2,bins3])
    (returns, _) = run_montecarloOffP(env_wrapper, 2000, gamma, eps, render=False)
    return sum(returns[-100:])/100

In [114]:

study = optuna.create_study(direction='maximize', 
                            storage='sqlite:///optuna_studies.db', 
                            study_name= 'new_MC_offpolice_cont_bins_off-policy', 
                            load_if_exists=True)

study.optimize(train_valuesOnCar, n_trials=10) 

[32m[I 2022-10-03 03:20:11,642][0m Using an existing study with name 'new_MC_offpolice_cont_bins_off-policy' instead of creating a new one.[0m
  after removing the cwd from sys.path.
  """
[33m[W 2022-10-03 03:20:11,796][0m Trial 10 failed because of the following error: AssertionError()[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-113-530e0fbf8961>", line 15, in train_valuesOnCar
    (returns, _) = run_montecarloOffP(env_wrapper, 2000, gamma, eps, render=False)
  File "<ipython-input-41-2304de231cae>", line 11, in run_montecarloOffP
    assert isinstance(env.action_space, gym.spaces.Discrete)
AssertionError



TRIAL #10: eps=0.07110540318920813, gamma=0.318805085247301,bins2=6,bins3=8


AssertionError: ignored

In [124]:
if __name__ == "__main__":
    r_max_plot = 10

    EPISODES = 100000
    LR = 0.01
    GAMMA = 0.830525147061507
    EPSILON = 0.05919712699520377
 
    #env_wrapper = DiscreteObservationWrapper(env, [1,80,90])

    env = gym.make("FrozenLake-v1")

    
    # Roda o algoritmo Monte-Carlo para o problema de controle (ou seja, para achar a política ótima)
    rewards, Qtable = run_montecarloOffP(env, EPISODES, GAMMA, EPSILON, render=False)
    print("Últimos resultados: media =", np.mean(rewards[-20:]), ", desvio padrao =", np.std(rewards[-20:]))

    # Mostra um gráfico de episódios x retornos (não descontados)
    # Se quiser salvar, passe o nome do arquivo no 3o parâmetro
    filename = f"results/montecarloOffPCar-{ENV_NAME.lower()[0:8]}-ep{EPISODES}.png"
    plot_result(rewards, r_max_plot,100, 'offPolicyC-CarRacing')

    # test_greedy_Q_policy(env, Qtable, 10, True)
    env.close()

AssertionError: ignored

In [121]:
rewards, Qtable = run_montecarloOnP(env, EPISODES, GAMMA, EPSILON, render=False)
print("Últimos resultados: media =", np.mean(rewards[-20:]), ", desvio padrao =", np.std(rewards[-20:]))

filename = f"results/montecarloOffPCar-{ENV_NAME.lower()[0:8]}-ep{EPISODES}.png"
plot_result(rewards, r_max_plot,100, 'onPolicyC-CarRacing')

   
env.close()

Episode 100 Average Reward (last 100): 0.000
Episode 200 Average Reward (last 100): 0.000
Episode 300 Average Reward (last 100): 0.000
Episode 400 Average Reward (last 100): 0.010
Episode 500 Average Reward (last 100): 0.040
Episode 600 Average Reward (last 100): 0.040
Episode 700 Average Reward (last 100): 0.070
Episode 800 Average Reward (last 100): 0.090
Episode 900 Average Reward (last 100): 0.090
Episode 1000 Average Reward (last 100): 0.040
Episode 1100 Average Reward (last 100): 0.020
Episode 1200 Average Reward (last 100): 0.060
Episode 1300 Average Reward (last 100): 0.000
Episode 1400 Average Reward (last 100): 0.040
Episode 1500 Average Reward (last 100): 0.000
Episode 1600 Average Reward (last 100): 0.050
Episode 1700 Average Reward (last 100): 0.060
Episode 1800 Average Reward (last 100): 0.050
Episode 1900 Average Reward (last 100): 0.050
Episode 2000 Average Reward (last 100): 0.010
Episode 2100 Average Reward (last 100): 0.040
Episode 2200 Average Reward (last 100): 0.0

# Plot Result

In [150]:
def test_greedy_Q_policy(env, Q, num_episodes=100, render=False, render_wait=0.01, video=None):
    """
    Avalia a política gulosa (greedy) definida implicitamente por uma Q-table.
    Ou seja, executa, em todo estado s, a ação "a = argmax Q(s,a)".
    - env: o ambiente
    - Q: a Q-table (tabela Q) que será usada
    - num_episodes: quantidade de episódios a serem executados
    - render: defina como True se deseja chamar `env.render()` a cada passo
    - render_wait: intervalo de tempo entre as chamadas a `env.render()`
    - video 
    
    Retorna:
    - um par contendo o valor escalar do retorno médio por episódio e 
       a lista de retornos de todos os episódios
    """
    episode_returns = []
    total_steps = 0
    for i in range(num_episodes):
        print(f"Episode {i+1}")
        obs = env.reset()
        if render:
            env.render()
            time.sleep(render_wait)
        if video is not None:
            video.capture_frame()
        done = False
        episode_returns.append(0.0)
        while not done:
            action = np.argmax(Q[obs])
            obs, reward, done, _ = env.step(action)
            if render:
                env.render()
                time.sleep(render_wait)
            if video is not None:
                video.capture_frame()
            total_steps += 1
            episode_returns[-1] += reward
        print("- retorno:", episode_returns[-1])
    mean_return = round(np.mean(episode_returns), 1)
    print("Retorno médio (por episódio):", mean_return, end="")
    print(", episódios:", len(episode_returns), end="")
    print(", total de passos:", total_steps)
    show_state(env,total_steps)
    if video is not None:
        video.close()
    return mean_return, episode_returns

In [40]:
def smooth(data, window):
  data = np.array(data)
  n = len(data)
  y = np.zeros(n)
  for i in range(n):
    start = max(0, i-window+1)
    y[i] = data[start:(i+1)].mean()
  return y

def plot_result(returns, ymax_suggested=None, window=100, filename=None):
    '''Exibe um gráfico "retornos x recompensas", fazendo a média a cada 100 retornos, para suavizar.     
    Se o parâmetro filename for fornecido, salva o gráfico em arquivo ao invés de exibir.
    
    Parâmetros:
    - returns: lista de retornos a cada episódio
    - ymax_suggested (opcional): valor máximo de retorno (eixo y), se tiver um valor máximo conhecido previamente
    - filename: indique um nome de arquivo, se quiser salvar a imagem do gráfico; senão, o gráfico será apenas exibido
    '''
    plt.figure(figsize=(14,8))
    smoothed_returns = smooth(returns, window)
    xvalues = np.arange(1, len(returns)+1)
    plt.plot(xvalues, smoothed_returns)
    plt.xlabel('Episódios')
    plt.ylabel('Retorno')
    if ymax_suggested is not None:
        ymax = np.max([ymax_suggested, np.max(smoothed_returns)])
        plt.ylim(top=ymax)
    plt.title(f"Retorno médio a cada {window} episódios")
    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
        print("Arquivo salvo:", filename)
    plt.close()


# Exibe o ambiente

In [62]:
from gym.wrappers.monitoring.video_recorder import VideoRecorder

In [152]:
ENV_NAME = 'MountainCar-v0'

In [153]:
env = gym.make(ENV_NAME)

In [148]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

In [149]:
def show_state(env, step=0, info=""):
    plt.figure(3)
    plt.clf()
    plt.imshow(env.render(mode='rgb_array'))
    plt.title("%s | Step: %d %s" % (env._spec.id,step, info))
    plt.axis('off')

    display.clear_output(wait=True)
    display.display(plt.gcf())

In [154]:
#record_video(ENV_NAME, model, video_length=1000, prefix='monte-car-off-police')

videoOf = VideoRecorder(env, "monte-car-off-police.mp4")
#videoOn = VideoRecorder(env, "monte-car-on-police.mp4")
#test_greedy_Q_policy(env, QtableOn, 10, True,videoOn)
#render_mp4('video_offpolicy')

test_greedy_Q_policy(env, QtableOn, 10, True,videoOf)
render_mp4('monte-car-off-police.mp4')
#show_videos('videos', prefix='monte-car-off-police')

ValueError: ignored