### Import

In [3]:
import gym
import pygame
import random
from io import StringIO ## for Python 3
from gym import Env, spaces, utils
#from gym.envs.toy_text.utils import categorical_sample
import numpy as np
import time
from IPython.display import clear_output

### Init game env

In [4]:
# Create the Taxi environment that every later cell shares through `env`.
env_name = "Taxi-v3"
env = gym.make(env_name)

# `.n` is the size of each space (the recorded run shows 500 states, 6 actions).
# The Q-table itself is allocated in the hyperparameter cell below, so it is
# not created a second time here.
print('Observation:', env.observation_space.n)
print('Action:', env.action_space.n)

Observation: 500
Action: 6


### Init Q learning params

In [5]:
# --- Training schedule -------------------------------------------------------
num_episode = 1000
max_step = 100

# --- Update rule -------------------------------------------------------------
# learning_rate: how strongly a newly learned value replaces the Q-value
# already stored for the same (state, action) pair.
learning_rate = 0.1
discount_rate = 0.99 # gamma

# New Q-value:
#   Q_new(s, a)     = (1 - lr) * Q_old(s, a) + lr * Q_learned(s, a)
#   Q_learned(s, a) = reward + gamma * max over a' of Q(s', a')

# --- Exploration -------------------------------------------------------------
# Draw r uniformly in [0, 1]: exploit (greedy action) if r > exploration_rate,
# otherwise explore with a random action.
exploration_rate = 1
max_exploration = 1
min_exploration = 0.01
exploration_decay_rate = 0.001

# NOTE: QAgent keeps its own copies of these hyperparameters; the globals
# defined in this cell are not read by it.

# --- Q-table: one row per state, one column per action, all zeros ------------
q_table = np.zeros((env.observation_space.n, env.action_space.n))

print ('Q Table:', q_table)

Q Table: [[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


### Agent

In [6]:
class QAgent():
    """Tabular Q-learning agent with an epsilon-greedy action policy.

    The agent stores one Q-value per (state, action) pair in ``q_table`` and
    reads the table dimensions from ``env.observation_space.n`` and
    ``env.action_space.n``.
    """

    def __init__(self, env, learning_rate=0.1, discount_rate=0.99,
                 max_exploration=1, min_exploration=0.01,
                 exploration_decay_rate=0.001):
        """Create an agent with a zero-initialised Q-table.

        Args:
            env: Environment exposing ``observation_space.n``,
                ``action_space.n`` and ``action_space.sample()``.
            learning_rate: Weight given to the newly learned value in each
                Q-table update.
            discount_rate: Discount (gamma) applied to the best Q-value of
                the next state.
            max_exploration: Exploration rate at episode 0; also the
                initial ``exploration_rate``.
            min_exploration: Value the exploration rate decays towards.
            exploration_decay_rate: Exponential decay constant per episode.
        """
        self.action_space = env.action_space
        self.q_table = np.zeros((env.observation_space.n, env.action_space.n))
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        # Start fully exploratory; decay_exploration_rate() lowers this later.
        self.exploration_rate = max_exploration
        self.max_exploration = max_exploration
        self.min_exploration = min_exploration
        self.exploration_decay_rate = exploration_decay_rate

    def get_action(self, state):
        """Pick an action for ``state`` with the epsilon-greedy rule.

        A uniform draw in [0, 1] above ``exploration_rate`` selects the
        action with the highest stored Q-value; otherwise a random action is
        sampled from the action space.
        """
        greedy = random.uniform(0, 1)
        if greedy > self.exploration_rate:
            action = np.argmax(self.q_table[state, :])
        else:
            action = self.action_space.sample()
        return action

    def update_q_table(self, state, action, reward, new_state):
        """Blend the stored Q-value of (state, action) with the new estimate.

        The new estimate is ``reward + discount_rate * max(Q[new_state])``;
        it is mixed with the old value using ``learning_rate``.

        Returns:
            Always 1.
        """
        learned_value = reward + (self.discount_rate * np.max(self.q_table[new_state, :]))
        self.q_table[state, action] = (1 - self.learning_rate) * self.q_table[state, action] + \
            (self.learning_rate * learned_value)
        return 1

    def decay_exploration_rate(self, episode):
        """Set ``exploration_rate`` from an exponential decay in ``episode``.

        The rate goes from ``max_exploration`` at episode 0 towards
        ``min_exploration`` as ``episode`` grows.

        Returns:
            Always 1.
        """
        exploration_span = self.max_exploration - self.min_exploration
        self.exploration_rate = self.min_exploration + exploration_span * np.exp(-self.exploration_decay_rate * episode)
        return 1

### Game training

In [7]:
# Train a fresh agent. The class defined in this notebook is `QAgent`;
# `Agent` does not exist on a fresh kernel.
agent = QAgent(env)
rewards = []  # total reward collected in each episode
num_episode = 2000
max_step = 100
for episode in range(num_episode):
    # The code relies on reset() returning the state and step() returning a
    # 4-tuple (state, reward, done, info).
    state = env.reset()
    reward_episode = 0
    print("Episode: "+str(episode), end="\r")
    for step in range(max_step):
        action = agent.get_action(state)
        new_state, reward, done, info = env.step(action)
        agent.update_q_table(state, action, reward, new_state)
        state = new_state
        reward_episode += reward
        if done:
            break
    # Lower the exploration rate once per episode, not once per step.
    agent.decay_exploration_rate(episode)
    rewards.append(reward_episode)


Episode: 1999

### Game score output

In [8]:
# Report the mean episode reward for each consecutive block of 1000 episodes.
# Slicing by the number of recorded rewards (rather than np.split with
# num_episode / 1000) avoids a float section count and still works when the
# episode count is not a multiple of 1000; the last block may be shorter.
episodes_per_chunk = 1000
rewards_array = np.array(rewards)
reward_per_thousand_episode = [
    rewards_array[start:start + episodes_per_chunk]
    for start in range(0, len(rewards_array), episodes_per_chunk)
]
count = 0
print('*** Average rewards per 1000 episodes ***')
for r in reward_per_thousand_episode:
    count += len(r)  # label each block with the index of its last episode
    print(count, ': ', str(r.mean()))


print('*** Q Table ***')
print(agent.q_table)

*** Average rewards per 1000 episodes ***
1000 :  -249.15699999999993
2000 :  -35.93700000000005
*** Q Table ***
[[ 0.          0.          0.          0.          0.          0.        ]
 [-3.03170889 -3.52778272 -3.49410222 -3.06755257  8.29507166 -9.77760675]
 [-1.53780801  0.2198823  -2.14647181  0.26883988 14.01316291 -7.25496642]
 ...
 [-1.28957559  5.66442788 -1.51871089 -1.43433587 -3.64407635 -6.84049641]
 [-3.20816218 -2.44082678 -3.27083891 -3.22766279 -9.13277614 -8.15572745]
 [-0.3439      0.4767614   0.92892953 16.12821544 -1.899208   -1.73526596]]


### AI playing the game

In [168]:
def _render_text(env):
    """Return a colourised text picture of a grid environment.

    Reads ``env.desc`` (grid of byte characters), ``env.s`` (flat index of the
    current cell), ``env.ncol`` and ``env.lastaction``. The current cell is
    highlighted in red, 'H' tiles in yellow, 'F'/'S' tiles in blue and 'G'
    tiles in green. The first line names the last action, or is empty when
    there is none.

    NOTE(review): the tile letters and the four-entry action list look like
    they target a FrozenLake-style grid rather than the six-action Taxi env
    created in this notebook - confirm before calling it with `env`.
    """
    tile_colors = {'H': "yellow", 'F': "blue", 'S': "blue", 'G': "green"}

    grid = env.desc.tolist()
    current_row, current_col = divmod(env.s, env.ncol)
    grid = [[cell.decode("utf-8") for cell in cells] for cells in grid]
    grid[current_row][current_col] = utils.colorize(
        grid[current_row][current_col], "red", highlight=True
    )

    for cells in grid:
        for idx, cell in enumerate(cells):
            # Only bare tile letters are looked up; a cell that colorize()
            # already rewrote is left as it is.
            if cell in tile_colors:
                cells[idx] = utils.colorize(cell, tile_colors[cell], highlight=True)

    if env.lastaction is not None:
        header = f"  ({['Left', 'Down', 'Right', 'Up'][env.lastaction]})\n"
    else:
        header = "\n"
    return header + "\n".join("".join(cells) for cells in grid) + "\n"
       
# Let the trained agent play and redraw the board in place after every step.
for episode in range(1):
    state = env.reset()
    # Banner printed above every frame; it must be defined before the prints
    # below use it.
    game = "*** Game {0} ***\n".format(episode + 1)
    time.sleep(1)

    episode_reward = 0
    for step in range(max_step):
        clear_output(wait=True)
        # Use the same 'ansi' mode as the final frame so render() hands back
        # the board as text for print().
        # NOTE(review): assumes render('human') returns nothing printable in
        # this gym version - confirm.
        print(game, env.render('ansi'), end='\n')
        # Show the passenger and destination indices decoded from the state.
        taxi_row, taxi_col, pass_idx, dest_idx = env.decode(env.s)
        print(pass_idx, dest_idx)
        time.sleep(0.3)

        # get_action() still explores with the agent's current
        # exploration_rate, so some moves may be random.
        action = agent.get_action(state)
        new_state, reward, done, info = env.step(action)
        state = new_state
        episode_reward += reward
        if done:
            clear_output(wait=True)
            print(game, env.render('ansi'), end='\n')
            # Keep the final board visible for a moment.
            time.sleep(3)
            clear_output(wait=True)
            break

*** Game 5 ***
 +---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)



### Evaluate

In [1]:
# Evaluate the trained agent: average episode length and penalty count.
# Requires `env` and `agent` from the cells above (run the notebook top to
# bottom on a fresh kernel).
total_steps, total_penalties = 0, 0
episodes = 100
max_steps = 100
for episode in range(episodes):
    state = env.reset()
    nb_steps, penalties, reward = 0, 0, 0
    for step in range(max_steps):
        # NOTE: get_action() still explores with the agent's current
        # exploration_rate, so some evaluation moves may be random.
        action = agent.get_action(state)
        state, reward, done, info = env.step(action)
        nb_steps += 1
        # A reward of -10 is counted as one penalty (presumably the env's
        # illegal pick-up / drop-off reward - verify against the env docs).
        if reward == -10:
            penalties += 1
        if done:
            break
    else:
        # The loop ran out of steps without `done`: count the timeout as a
        # penalty. (A `step >= max_steps` test inside the loop can never be
        # true, because `step` stops at max_steps - 1.)
        penalties += 1
    total_penalties += penalties
    # `step` is zero-based, so add the actual number of steps taken.
    total_steps += nb_steps

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_steps / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

NameError: name 'env' is not defined