In [2]:
import gym 
import numpy as np
import math
from collections import defaultdict, deque

In [3]:
# Build the Taxi-v3 environment; the cells below inspect its action space and train on it.
env = gym.make('Taxi-v3')

In [4]:
# Sanity check: draw one sample from the environment's action space
# (the captured output below shows it comes back as an integer).
env.action_space.sample()

3

In [5]:
# Size of the action space; Q_learning reads this same attribute to size each
# state's array of action values.
env.action_space.n

6

In [11]:
def take_action(Q_s, len_action_space, eps):
    """Choose an action epsilon-greedily from one state's action values.

    Parameters
    ----------
    Q_s : array-like
        Action-value estimates for the current state, one entry per action.
    len_action_space : int
        Number of available actions; random picks are drawn from
        ``0 .. len_action_space - 1``.
    eps : float
        Probability of taking a uniformly random action instead of the
        greedy one.

    Returns
    -------
    Index of the selected action (a NumPy integer).
    """
    # One uniform draw in [0, 1) decides between exploring and exploiting.
    draw = np.random.random()
    explore = draw < eps

    if not explore:
        # Exploit: index of the largest estimate (first one on ties).
        return np.argmax(Q_s)

    # Explore: every action is equally likely.
    candidates = np.arange(len_action_space)
    return np.random.choice(candidates)
    
    
def Update_Q(Q_sa, Q_sa_next, reward, alpha, gamma):
    """Return the temporal-difference update of a single action value.

    Parameters
    ----------
    Q_sa : float
        Current estimate for the (state, action) pair being updated.
    Q_sa_next : float
        Estimate used to bootstrap from the next state.
    reward : float
        Reward observed on the transition.
    alpha : float
        Step size applied to the TD error.
    gamma : float
        Discount factor applied to ``Q_sa_next``.

    Returns
    -------
    float
        ``Q_sa`` moved towards ``reward + gamma * Q_sa_next`` by ``alpha``.
    """
    td_target = reward + (gamma*Q_sa_next)
    td_error = td_target - Q_sa
    return Q_sa + (alpha*td_error)

In [14]:
def Q_learning(env, episodes, gamma, alpha, eps):
    """Train a tabular Q-learning agent with epsilon-greedy action selection.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()``, ``step(action)`` and
        ``action_space.n``. Both the classic Gym shapes
        (``reset() -> obs`` and ``step() -> (obs, reward, done, info)``) and
        the newer ones (``reset() -> (obs, info)`` and
        ``step() -> (obs, reward, terminated, truncated, info)``) are
        accepted. Observations are used as dictionary keys, so they must be
        hashable.
    episodes : int
        Number of episodes to run.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    alpha : float
        Step size of each Q update.
    eps : float
        Exploration probability forwarded to ``take_action``.

    Returns
    -------
    Q : defaultdict
        Maps each visited state to an array of action-value estimates.
    policy : dict
        Maps each state in ``Q`` to the index (``int``) of its greedy action.
    """
    best_avg_reward = -np.inf
    # Total reward of each of the most recent (up to 100) finished episodes.
    samp_rewards = deque(maxlen=100)

    len_action_space = env.action_space.n
    Q = defaultdict(lambda: np.zeros(len_action_space))

    for e in range(1, episodes+1):

        reset_out = env.reset()
        # Newer Gym versions return (observation, info). Requiring a dict in
        # the second slot avoids mistaking a tuple-valued observation from
        # the classic API for that pair.
        if (isinstance(reset_out, tuple) and len(reset_out) == 2
                and isinstance(reset_out[1], dict)):
            state = reset_out[0]
        else:
            state = reset_out

        samp_reward = 0

        while True:

            action = take_action(Q[state], len_action_space, eps)

            step_out = env.step(action)
            if len(step_out) == 5:
                next_state, reward, terminated, truncated, _ = step_out
            else:
                # Classic API: a single `done` flag, treated as terminal.
                next_state, reward, terminated, _ = step_out
                truncated = False
            samp_reward += reward

            # Look the next state up unconditionally so that it is registered
            # in the table even when it is terminal.
            next_values = Q[next_state]
            # A terminal state has no future return to bootstrap from.
            greedy_Next_Q_S = 0 if terminated else np.max(next_values)

            td_error = alpha*((reward + (gamma*greedy_Next_Q_S)) - Q[state][action])
            Q[state][action] += td_error

            state = next_state

            if terminated or truncated:
                break

        # Episode-level bookkeeping: runs once per finished episode, so the
        # running average includes the episode that just ended.
        samp_rewards.append(samp_reward)

        if e >= 100:
            avg_reward = np.mean(samp_rewards)
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward

        print("\rEpisode {}/{} || Best average reward {}".format(e, episodes, best_avg_reward), end="", flush=True)

    # Greedy policy: the best action index per state, not the value array.
    policy = {state: int(np.argmax(action_values)) for state, action_values in Q.items()}
    return Q, policy

In [16]:
# Train for 12000 episodes. With eps=0.0, take_action never takes its random
# branch (np.random.random() < 0.0 is always False), so every action is the
# argmax of the current estimates. With alpha=1. each update replaces the old
# estimate with reward + gamma * (best next-state value).
Q_learning(env, episodes=12000, gamma=.4, alpha=1., eps=0.0)

Episode 12000/12000 || Best average reward 8.55669696969697

(defaultdict(<function __main__.Q_learning.<locals>.<lambda>()>,
             {267: array([ -1.6655744 ,  -1.66098688,  -1.663936  ,  -1.663936  ,
                     -10.        , -10.        ]),
              367: array([ -1.66622976,  -1.66439475,  -1.6655744 ,  -1.6655744 ,
                     -10.        , -10.        ]),
              467: array([ -1.6664919 ,  -1.6657579 ,  -1.66622976,  -1.66622976,
                     -10.        , -10.        ]),
              167: array([ -1.6655744,  -1.6524672,  -1.65984  ,  -1.65984  , -10.       ,
                     -10.       ]),
              287: array([ -1.663936 ,  -1.6524672,  -1.65984  ,  -1.65984  , -10.       ,
                     -10.       ]),
              387: array([ -1.6655744 ,  -1.66098688,  -1.663936  ,  -1.663936  ,
                     -10.        , -10.        ]),
              487: array([ -1.66622976,  -1.66439475,  -1.6655744 ,  -1.66622976,
                     -10.        , -10.        ]),
              18