In [18]:
# Install cmake, gym (with the atari extra) and scipy via pip.
# NOTE(review): versions are unpinned. The cells below unpack four values from
# env.step(...) and call env.render(mode='ansi') - confirm that the gym release
# installed here still provides that API.
!pip install cmake 'gym[atari]' scipy



In [0]:
import numpy as np
import gym
import random

from time import sleep
from IPython.display import clear_output

In [0]:
def random_actions(frames, start_state=328):
  """Run one episode with randomly sampled actions and record every step.

  Uses the notebook-level ``env``. The environment is put into
  ``start_state`` by assigning ``env.s``; actions are then drawn with
  ``env.action_space.sample()`` until ``env.step`` reports ``done``.

  Args:
      frames: list that is appended to in place. Each step adds a dict with
          the keys 'frame' (the result of ``env.render(mode='ansi')``),
          'state', 'action' and 'reward'.
      start_state: value written to ``env.s`` before the first step.
          Defaults to 328, the state used for the illustration.

  Returns:
      None. The number of timesteps and the number of steps whose reward
      was -10 (counted as penalties) are printed.
  """
  env.s = start_state  # set environment to the requested starting state

  epochs = 0
  penalties = 0
  done = False

  while not done:
      action = env.action_space.sample()
      state, reward, done, info = env.step(action)

      # A reward of -10 is what this notebook counts as a penalty.
      if reward == -10:
          penalties += 1

      # Put each rendered frame into dict for animation
      frames.append({
          'frame': env.render(mode='ansi'),
          'state': state,
          'action': action,
          'reward': reward,
      })

      epochs += 1

  print("Timesteps taken: {}".format(epochs))
  print("Penalties incurred: {}".format(penalties))

def print_frames(frames, max_frames=30, delay=.5):
  """Replay recorded frames in the cell output, one frame at a time.

  Before each frame the previous output is cleared with
  ``clear_output(wait=True)``; then the rendered frame and its timestep,
  state, action and reward are printed.

  Args:
      frames: sequence of dicts with the keys 'frame', 'state', 'action'
          and 'reward'.
      max_frames: stop after this many frames have been shown, printing
          'Force break!'. The limit is checked after a frame is printed.
          Pass None to replay every frame. Defaults to 30.
      delay: seconds to sleep after each frame. Defaults to 0.5.
  """
  for i, frame in enumerate(frames):
      clear_output(wait=True)
      print(frame['frame'])
      print(f"Timestep: {i + 1}")
      print(f"State: {frame['state']}")
      print(f"Action: {frame['action']}")
      print(f"Reward: {frame['reward']}")
      # Cap the replay so a very long episode does not animate forever.
      if max_frames is not None and i + 1 >= max_frames:
        print('Force break!')
        break
      sleep(delay)

def training_agent(alpha=0.1, gamma=0.6, epsilon=0.1, episodes=100000):
  """Fill the notebook-level ``q_table`` by interacting with ``env``.

  Each episode starts from ``env.reset()`` and runs until ``env.step``
  reports ``done``. With probability ``epsilon`` an action is sampled from
  ``env.action_space``; otherwise the action with the largest value in
  ``q_table[state]`` is taken. After every step the table entry is updated:

      q[s, a] = (1 - alpha) * q[s, a] + alpha * (reward + gamma * max(q[s']))

  ``q_table`` is modified in place; nothing is returned. Progress is
  printed every 100 episodes, replacing the previous cell output.

  To time a run, use the line magic at the call site, e.g.
  ``%time training_agent()``. A ``%%time`` cell magic is only valid as the
  first line of a cell, so it cannot live inside this function body.

  Args:
      alpha: weight given to the new estimate in the update above.
      gamma: factor applied to the best value of the next state.
      epsilon: probability of sampling a random action.
      episodes: number of training episodes to run.
  """
  for i in range(1, episodes + 1):
      state = env.reset()
      done = False

      while not done:
          if random.uniform(0, 1) < epsilon:
              action = env.action_space.sample() # Explore action space
          else:
              action = np.argmax(q_table[state]) # Exploit learned values

          next_state, reward, done, info = env.step(action)

          old_value = q_table[state, action]
          next_max = np.max(q_table[next_state])

          new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
          q_table[state, action] = new_value

          state = next_state

      # Replace the previous progress line instead of flooding the output.
      if i % 100 == 0:
          clear_output(wait=True)
          print(f"Episode: {i}")

  print("Training finished.\n")

def evaluate_agents():
  """Play 100 greedy episodes with the current ``q_table`` and print averages.

  Every episode starts from ``env.reset()`` and always takes the action with
  the largest value in ``q_table[state]`` until ``env.step`` reports ``done``.
  Steps whose reward equals -10 are counted as penalties. The average number
  of timesteps and penalties per episode is printed; nothing is returned.
  """
  def _greedy_episode():
      # Play a single episode and report (steps taken, penalties incurred).
      state = env.reset()
      steps, bad_moves = 0, 0
      finished = False
      while not finished:
          state, reward, finished, _info = env.step(np.argmax(q_table[state]))
          if reward == -10:
              bad_moves += 1
          steps += 1
      return steps, bad_moves

  episodes = 100
  outcomes = [_greedy_episode() for _ in range(episodes)]
  total_epochs = sum(steps for steps, _ in outcomes)
  total_penalties = sum(bad_moves for _, bad_moves in outcomes)

  print(f"Results after {episodes} episodes:")
  print(f"Average timesteps per episode: {total_epochs / episodes}")
  print(f"Average penalties per episode: {total_penalties / episodes}")

def evaluate_agent(frames):
    """Play one greedy episode with the current ``q_table`` and record it.

    Starting from ``env.reset()``, the action with the largest value in
    ``q_table[state]`` is taken until ``env.step`` reports ``done``. After
    each step a dict with the keys 'frame' (``env.render(mode='ansi')``),
    'state', 'action' and 'reward' is appended to ``frames`` in place.
    """
    current = env.reset()

    # The episode always takes at least one step, then stops once done.
    while True:
        chosen = np.argmax(q_table[current])
        current, step_reward, finished, _info = env.step(chosen)

        snapshot = {
            'frame': env.render(mode='ansi'),
            'state': current,
            'action': chosen,
            'reward': step_reward,
        }
        frames.append(snapshot)

        if finished:
            break

In [4]:
# Create the Taxi-v3 environment and render its current state.
# NOTE(review): `.env` takes the object wrapped by what gym.make returns -
# presumably to bypass the default wrapper (e.g. a step limit); confirm
# against the installed gym version.
env = gym.make("Taxi-v3").env
env.render()

+---------+
|[34;1m[43mR[0m[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [5]:
# Reset the environment and render the state it starts from.
env.reset()
env.render()

+---------+
|[35mR[0m: |[43m [0m: :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [6]:
# Encode one specific configuration into a single state number, then force
# the environment into that state and render it.
state = env.encode(3, 1, 2, 0) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

env.s = state
env.render()

State: 328
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



In [7]:
# env.P[state] is a dictionary with the structure
# {action: [(probability, nextstate, reward, done)]}.
# The keys 0-5 correspond to the actions (south, north, east, west, pickup, dropoff).

env.P[328] 

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [42]:
# Collect the steps of one random-action episode; random_actions appends to
# the list in place and prints its own summary.
frames = [] # for animation
random_actions(frames)

Timesteps taken: 461
Penalties incurred: 158


In [43]:
# Replay the frames recorded from the random-action episode.
print_frames(frames)

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)

Timestep: 30
State: 328
Action: 3
Reward: -1
Force break!


In [26]:
# Start from an all-zero table with one row per state and one column per
# action; training_agent() reads and updates this global in place.
q_table = np.zeros([env.observation_space.n, env.action_space.n])
training_agent()

Episode: 100000
Training finished.



In [27]:
# Learned values for state 328 (the illustration state), one entry per action.
q_table[328]

array([ -2.41479814,  -2.27325184,  -2.40153538,  -2.36140151,
       -11.20432524, -11.03716745])

In [28]:
# Largest learned value among the actions available in state 328.
np.max(q_table[328])

-2.273251840000004

In [29]:
# Evaluate the trained table by always taking the highest-valued action.
evaluate_agents()

Results after 100 episodes:
Average timesteps per episode: 12.65
Average penalties per episode: 0.0


In [0]:
# Record one episode played with the trained table; evaluate_agent appends
# to the list in place.
frames = []
evaluate_agent(frames)

In [33]:
# Replay the frames recorded from the trained agent's episode.
print_frames(frames)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 12
State: 0
Action: 5
Reward: 20
