In [1]:
import gym
import sys
import numpy as np
import math
from collections import defaultdict, deque

In [2]:
# Build the Taxi-v3 environment instance that the training cell passes to QLearning.
env = gym.make('Taxi-v3')

In [3]:
def escolher_acao(Q_s, epsilon, num_acoes):
    """Pick an action with an epsilon-greedy rule.

    Args:
        Q_s: action-value estimates for the current state, one entry per action.
        epsilon: probability of taking a uniformly random action.
        num_acoes: number of available actions.

    Returns:
        The index of the chosen action: a uniform random draw from
        ``range(num_acoes)`` with probability ``epsilon``, otherwise
        ``np.argmax(Q_s)``.
    """
    # One uniform draw in [0, 1) decides between exploring and exploiting.
    explore = np.random.random() < epsilon

    if not explore:
        # Exploit: take the action with the highest current estimate.
        return np.argmax(Q_s)

    # Explore: every action is equally likely.
    return np.random.choice(np.arange(num_acoes))
    
def atualizar_Q(Q_sa, Q_sa_next, reward, alpha, gamma, gamma_uprate, max_gamma):
    """Return the temporal-difference update of a single action-value estimate.

    Computes ``Q_sa + alpha * (reward + gamma * Q_sa_next - Q_sa)``.

    Args:
        Q_sa: current estimate for the (state, action) pair being updated.
        Q_sa_next: bootstrap value for the next state (0 for a terminal step).
        reward: reward observed for the transition.
        alpha: step size.
        gamma: discount factor.
        gamma_uprate: accepted but not used by this function.
        max_gamma: accepted but not used by this function.

    Returns:
        The updated estimate as a single scalar.
    """
    td_target = reward + (gamma * Q_sa_next)
    td_error = td_target - Q_sa
    return Q_sa + (alpha * td_error)

def obter_expected_Q(Q_s, epsilon, num_acoes):
    """Expected action value of a state under the epsilon-greedy policy.

    Args:
        Q_s: action-value estimates for the state, one entry per action.
        epsilon: exploration probability of the policy.
        num_acoes: number of available actions.

    Returns:
        The sum over actions of (policy probability * action value), where
        every action gets ``epsilon / num_acoes`` and the greedy action
        (first argmax of ``Q_s``) gets ``1 - epsilon + epsilon / num_acoes``.
    """
    explore_share = epsilon / num_acoes

    # Start from the uniform exploration mass on every action...
    probs = np.full(num_acoes, explore_share, dtype=float)
    # ...then give the greedy action the remaining probability on top of it.
    greedy_action = np.argmax(Q_s)
    probs[greedy_action] = 1 - epsilon + explore_share

    weighted_values = probs * Q_s
    return sum(weighted_values)

In [4]:
def QLearning(env, num_episodes, gamma, alpha, epsilon = 0.0, gamma_uprate= 0.0001, max_gamma = 1.0, eps_decay = 0.999, eps_min = 0.005):
    """Train a tabular Q-learning agent on ``env`` and return its action-value table.

    Each step selects an action with ``escolher_acao`` (epsilon-greedy), then
    updates ``Q[state][action]`` with ``atualizar_Q`` using the maximum
    action value of the next state as the bootstrap target (0 on the final
    step of an episode).

    Args:
        env: Gym-style environment exposing ``action_space.n``, ``reset()``
            and ``step(action)``. Both the 4-tuple ``step`` / bare-observation
            ``reset`` API and the 5-tuple ``step`` / ``(obs, info)`` ``reset``
            API are accepted. States must be hashable.
        num_episodes: number of training episodes to run (at most).
        gamma: discount factor.
        alpha: step size.
        epsilon: initial exploration rate; multiplied by ``eps_decay`` at the
            start of every episode and floored at ``eps_min``.
        gamma_uprate: forwarded unchanged to ``atualizar_Q``.
        max_gamma: forwarded unchanged to ``atualizar_Q``.
        eps_decay: per-episode multiplicative decay applied to ``epsilon``.
        eps_min: lower bound for ``epsilon``.

    Returns:
        A ``defaultdict`` mapping each visited state to a NumPy array with one
        action-value estimate per action.
    """
    # initialize best average reward
    best_avg_reward = -math.inf
    # rolling window with the returns of the 100 most recent episodes
    samp_rewards = deque(maxlen=100)

    num_acoes = env.action_space.n
    Q = defaultdict(lambda: np.zeros(num_acoes))

    # Episodes are numbered 1..num_episodes so that exactly num_episodes run
    # and the progress line reads "k/num_episodes" after the k-th episode.
    for i_episode in range(1, num_episodes + 1):

        reset_result = env.reset()
        # Newer Gym versions return (observation, info_dict) from reset();
        # older ones return the observation alone. The dict check keeps
        # genuine tuple-valued observations from being mistaken for that pair.
        if (isinstance(reset_result, tuple) and len(reset_result) == 2
                and isinstance(reset_result[1], dict)):
            state = reset_result[0]
        else:
            state = reset_result

        epsilon = max(epsilon * eps_decay, eps_min)

        samp_reward = 0
        while True:
            action = escolher_acao(Q[state], epsilon, num_acoes)

            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API: the episode ends on termination or truncation.
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            samp_reward += reward

            # Q-learning target: best estimated value of the next state, or
            # 0 when the episode has ended and there is nothing to bootstrap.
            estimative = 0 if done else np.max(Q[next_state])
            #estimative  = obter_expected_Q(Q[next_state], epsilon, num_acoes)

            # atualizar_Q returns only the new estimate, so it is assigned
            # directly (unpacking it into two names raises TypeError).
            Q[state][action] = atualizar_Q(Q[state][action], estimative, reward, alpha, gamma, gamma_uprate, max_gamma)

            if done:
                samp_rewards.append(samp_reward)
                break

            state = next_state

        if i_episode >= 100:
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        # monitor progress
        print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="", flush=True)
        # check if task is solved (according to OpenAI Gym)
        if best_avg_reward >= 9.7:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes: print('\n')

    return Q

In [6]:
# Train on the Taxi environment. With epsilon and eps_min both 0.0 the
# exploration rate stays at 0.0 for the whole run, so every action is the
# argmax of the current estimates (purely greedy, no random exploration).
Q = QLearning(
    env,
    num_episodes=12000,
    gamma=0.40,
    alpha=1.,
    epsilon=0.0,
    eps_min=0.0,
)

Episode 12000/12000 || Best average reward 8.863

