In [1]:
!pip install cmake 'gym[atari]' scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Set up the environment

In [29]:
import gym
from IPython.display import clear_output
from time import sleep
import random
from IPython.display import clear_output
import numpy as np
import pandas as pd
# Create the Taxi-v3 environment. `.env` takes the object wrapped by whatever
# gym.make() returned (presumably to drop the episode step limit — TODO confirm
# against the installed gym version).
env = gym.make("Taxi-v3").env
# Draw the current grid in the cell output.
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+



In [31]:
# Reset the environment to a new, random state and draw it.
env.reset()
env.render()

# Report the action and observation spaces, one labelled line each.
for template, space in (
    ("Action Space {}", env.action_space),
    ("State Space {}", env.observation_space),
):
    print(template.format(space))

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


# State encoding and setting

In [32]:
# Encode a hand-picked configuration into its integer state id.
# Arguments: (taxi row, taxi column, passenger index, destination index).
state = env.encode(4, 0, 3, 2)
# Bare expression so the notebook displays the encoded id.
state

414

In [33]:
state = env.encode(4, 0, 3, 2) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

# Force the environment into the encoded state, then draw it to check the layout.
env.s = state
env.render()

State: 414
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[43mY[0m[0m| : |[34;1mB[0m: |
+---------+



# Plotting frames


In [35]:
def print_frames(frames, delay=1):
    """Replay recorded frames one at a time in the notebook output.

    Args:
        frames: Iterable of dicts with the keys 'frame' (rendered text),
            'state', 'action' and 'reward'.
        delay: Seconds to pause after each frame. Defaults to 1, the pause
            that was previously hard-coded.

    Returns:
        None. The frames are printed to the cell output.
    """
    for step, frame in enumerate(frames, start=1):
        # wait=True postpones the clear until new output is ready to replace it.
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {step}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)

# Train the model, then tune and decay the hyperparameters

In [96]:
%%time
"""Training the agent"""
# Initialize the q table
# Hyperparameters
# alpha = 0.1
# gamma = 0.6
# epsilon = 0.1
def train(env, itr, alpha, gamma, eps, record_frames=True):
    """Train a tabular Q-learning agent with an epsilon-greedy policy.

    At the start of every 5000th episode the hyperparameters are decayed:
    alpha and gamma by 1%, eps by 60%. Progress is printed every 100 episodes.

    Args:
        env: Environment exposing `observation_space.n`, `action_space.n`,
            `action_space.sample()`, `reset()` returning a state index,
            `step(action)` returning `(next_state, reward, done, info)` and
            `render(mode='ansi')`.
        itr: Number of training episodes to run.
        alpha: Initial learning rate.
        gamma: Initial discount factor.
        eps: Initial probability of taking a random action.
        record_frames: When True (default) a rendered frame is stored for
            every step. Pass False to skip rendering and storage; the
            returned frame list is then empty.

    Returns:
        Tuple `(q_table, frames)`: the learned table of shape
        (number of states, number of actions) and the list of recorded frame
        dicts with keys 'frame', 'state', 'action' and 'reward'.
    """
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    frames = []

    # The upper bound of range() is exclusive, so itr + 1 yields exactly
    # `itr` episodes numbered 1..itr.
    for i in range(1, itr + 1):
        state = env.reset()

        if i % 5000 == 0:
            alpha = alpha * (1 - 0.01)
            gamma = gamma * (1 - 0.01)
            eps = eps * (1 - 0.6)

        done = False
        while not done:
            if random.uniform(0, 1) < eps:
                action = env.action_space.sample()  # Explore action space
            else:
                action = np.argmax(q_table[state])  # Exploit learned values

            next_state, reward, done, info = env.step(action)

            # Blend the old estimate with the bootstrapped target.
            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])
            q_table[state, action] = (
                (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            )

            state = next_state
            if record_frames:
                # 'state' is the state reached after taking `action`.
                frames.append({
                    'frame': env.render(mode='ansi'),
                    'state': state,
                    'action': action,
                    'reward': reward
                })

        if i % 100 == 0:
            clear_output(wait=True)
            print(f"Episode: {i}")

    print("Training finished.\n")
    return q_table, frames

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10.3 µs


# Evaluation

In [97]:
"""Evaluate agent's performance after Q-learning"""
def test(env,episodes,q_table):
  framest=[]
  total_epochs, total_penalties = 0, 0
  for _ in range(episodes):
      state = env.reset()
      epochs, penalties, reward = 0, 0, 0
      done = False
      while not done and epochs<1000:
          action = np.argmax(q_table[state])
          state, reward, done, info = env.step(action)
          if reward == -10:
              penalties += 1
          epochs += 1
      total_penalties += penalties
      total_epochs += epochs
      framest.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }) 
  print(f"Results after {episodes} episodes:")
  print(f"Average timesteps per episode: {total_epochs / episodes}")
  print(f"Average penalties per episode: {total_penalties / episodes}")
  return total_epochs / episodes,total_penalties / episodes

In [98]:
# Train with itr=10000, alpha=0.8, gamma=0.9, eps=0.8.
# Q receives the learned Q-table, f the recorded frames.
Q,f=train(env,10000,0.8,0.9,0.8)

Episode: 100
Episode: 200
Episode: 300
Episode: 400
Episode: 500
Episode: 600
Episode: 700
Episode: 800
Episode: 900
Episode: 1000
Episode: 1100
Episode: 1200
Episode: 1300
Episode: 1400
Episode: 1500
Episode: 1600
Episode: 1700
Episode: 1800
Episode: 1900
Episode: 2000
Episode: 2100
Episode: 2200
Episode: 2300
Episode: 2400
Episode: 2500
Episode: 2600
Episode: 2700
Episode: 2800
Episode: 2900
Episode: 3000
Episode: 3100
Episode: 3200
Episode: 3300
Episode: 3400
Episode: 3500
Episode: 3600
Episode: 3700
Episode: 3800
Episode: 3900
Episode: 4000
Episode: 4100
Episode: 4200
Episode: 4300
Episode: 4400
Episode: 4500
Episode: 4600
Episode: 4700
Episode: 4800
Episode: 4900
Episode: 5000
Episode: 5100
Episode: 5200
Episode: 5300
Episode: 5400
Episode: 5500
Episode: 5600
Episode: 5700
Episode: 5800
Episode: 5900
Episode: 6000
Episode: 6100
Episode: 6200
Episode: 6300
Episode: 6400
Episode: 6500
Episode: 6600
Episode: 6700
Episode: 6800
Episode: 6900
Episode: 7000
Episode: 7100
Episode: 7200
E

# **Testing the learned Q-table**

In [120]:
# Evaluate the greedy policy from Q over 1000 episodes; the cell shows the
# returned (average timesteps, average penalties) tuple.
test(env,1000,Q)

Results after 1000 episodes:
Average timesteps per episode: 13.24
Average penalties per episode: 0.0


(13.24, 0.0)

# Grid search

In [102]:
# Candidate values for each hyperparameter searched by grid().
alphas = [0.7, 0.6,0.5]
gammas = [0.8, 0.6, 0.7]
epsilons = [0.6,0.7,0.8]
# grid() appends one result dict per combination to this list.
results_ = []
def grid(env, alphas, gammas, epsilon):
    """Train and evaluate one agent per (alpha, gamma, eps) combination.

    For every combination a fresh Q-table is trained for 10000 iterations and
    then evaluated over 1000 episodes. One dict with the keys 'timesteps',
    'penalties' and 'parms' is appended to the module-level `results_` list.

    Args:
        env: Environment passed through to `train` and `test`.
        alphas: Iterable of learning rates to try.
        gammas: Iterable of discount factors to try.
        epsilon: Iterable of exploration probabilities to try.

    Returns:
        None. Results are accumulated in `results_`.
    """
    n_episodes = 1000
    for alpha in alphas:
        for gamma in gammas:
            # Iterate the `epsilon` argument, not the global `epsilons` list.
            for eps in epsilon:
                print(f'alpha: {alpha}, gamma: {gamma},epsilon:{eps}')
                agent, _ = train(env, 10000, alpha, gamma, eps)
                # Score the table that was just trained; evaluating the global
                # Q would give every combination the same result.
                timesteps, penalties = test(env, n_episodes, agent)
                results_.append({'timesteps': timesteps, 'penalties': penalties, "parms": (alpha, gamma, eps)})
   

In [103]:
# Run the search over the candidate lists; results accumulate in results_.
grid(env,alphas,gammas,epsilons)

alpha: 0.7, gamma: 0.8,epsilon:0.6
Episode: 100
Episode: 200
Episode: 300
Episode: 400
Episode: 500
Episode: 600
Episode: 700
Episode: 800
Episode: 900
Episode: 1000
Episode: 1100
Episode: 1200
Episode: 1300
Episode: 1400
Episode: 1500
Episode: 1600
Episode: 1700
Episode: 1800
Episode: 1900
Episode: 2000
Episode: 2100
Episode: 2200
Episode: 2300
Episode: 2400
Episode: 2500
Episode: 2600
Episode: 2700
Episode: 2800
Episode: 2900
Episode: 3000
Episode: 3100
Episode: 3200
Episode: 3300
Episode: 3400
Episode: 3500
Episode: 3600
Episode: 3700
Episode: 3800
Episode: 3900
Episode: 4000
Episode: 4100
Episode: 4200
Episode: 4300
Episode: 4400
Episode: 4500
Episode: 4600
Episode: 4700
Episode: 4800
Episode: 4900
Episode: 5000
Episode: 5100
Episode: 5200
Episode: 5300
Episode: 5400
Episode: 5500
Episode: 5600
Episode: 5700
Episode: 5800
Episode: 5900
Episode: 6000
Episode: 6100
Episode: 6200
Episode: 6300
Episode: 6400
Episode: 6500
Episode: 6600
Episode: 6700
Episode: 6800
Episode: 6900
Episode:

In [105]:
# Show the collected results in the order the combinations were run.
results_

[{'parms': (0.7, 0.8, 0.6), 'penalties': 0.0, 'timesteps': 13.057},
 {'parms': (0.7, 0.8, 0.7), 'penalties': 0.0, 'timesteps': 13.115},
 {'parms': (0.7, 0.8, 0.8), 'penalties': 0.0, 'timesteps': 13.104},
 {'parms': (0.7, 0.6, 0.6), 'penalties': 0.0, 'timesteps': 13.134},
 {'parms': (0.7, 0.6, 0.7), 'penalties': 0.0, 'timesteps': 13.024},
 {'parms': (0.7, 0.6, 0.8), 'penalties': 0.0, 'timesteps': 13.051},
 {'parms': (0.7, 0.7, 0.6), 'penalties': 0.0, 'timesteps': 13.115},
 {'parms': (0.7, 0.7, 0.7), 'penalties': 0.0, 'timesteps': 13.061},
 {'parms': (0.7, 0.7, 0.8), 'penalties': 0.0, 'timesteps': 13.049},
 {'parms': (0.6, 0.8, 0.6), 'penalties': 0.0, 'timesteps': 13.153},
 {'parms': (0.6, 0.8, 0.7), 'penalties': 0.0, 'timesteps': 12.942},
 {'parms': (0.6, 0.8, 0.8), 'penalties': 0.0, 'timesteps': 13.033},
 {'parms': (0.6, 0.6, 0.6), 'penalties': 0.0, 'timesteps': 13.169},
 {'parms': (0.6, 0.6, 0.7), 'penalties': 0.0, 'timesteps': 12.89},
 {'parms': (0.6, 0.6, 0.8), 'penalties': 0.0, 'ti

# Sorting for the best timesteps

In [110]:
# Order the results by average timesteps per episode, fewest first.
newlist = sorted(results_, key=lambda d: d['timesteps']) 

In [112]:
# Show the ranked results; the first entry has the fewest average timesteps.
newlist

[{'parms': (0.6, 0.6, 0.7), 'penalties': 0.0, 'timesteps': 12.89},
 {'parms': (0.5, 0.7, 0.8), 'penalties': 0.0, 'timesteps': 12.907},
 {'parms': (0.5, 0.6, 0.6), 'penalties': 0.0, 'timesteps': 12.939},
 {'parms': (0.6, 0.8, 0.7), 'penalties': 0.0, 'timesteps': 12.942},
 {'parms': (0.6, 0.6, 0.8), 'penalties': 0.0, 'timesteps': 12.97},
 {'parms': (0.7, 0.6, 0.7), 'penalties': 0.0, 'timesteps': 13.024},
 {'parms': (0.6, 0.8, 0.8), 'penalties': 0.0, 'timesteps': 13.033},
 {'parms': (0.7, 0.7, 0.8), 'penalties': 0.0, 'timesteps': 13.049},
 {'parms': (0.7, 0.6, 0.8), 'penalties': 0.0, 'timesteps': 13.051},
 {'parms': (0.7, 0.8, 0.6), 'penalties': 0.0, 'timesteps': 13.057},
 {'parms': (0.7, 0.7, 0.7), 'penalties': 0.0, 'timesteps': 13.061},
 {'parms': (0.5, 0.6, 0.7), 'penalties': 0.0, 'timesteps': 13.067},
 {'parms': (0.5, 0.6, 0.8), 'penalties': 0.0, 'timesteps': 13.083},
 {'parms': (0.6, 0.7, 0.7), 'penalties': 0.0, 'timesteps': 13.093},
 {'parms': (0.5, 0.7, 0.6), 'penalties': 0.0, 'tim

# Using the best parameters

In [113]:
# Retrain with the top-ranked combination from the sorted results
# (alpha=0.6, gamma=0.6, eps=0.7), then evaluate over 1000 episodes.
Q,f=train(env,10000,0.6,0.6,0.7)
test(env,1000,Q)

Episode: 100
Episode: 200
Episode: 300
Episode: 400
Episode: 500
Episode: 600
Episode: 700
Episode: 800
Episode: 900
Episode: 1000
Episode: 1100
Episode: 1200
Episode: 1300
Episode: 1400
Episode: 1500
Episode: 1600
Episode: 1700
Episode: 1800
Episode: 1900
Episode: 2000
Episode: 2100
Episode: 2200
Episode: 2300
Episode: 2400
Episode: 2500
Episode: 2600
Episode: 2700
Episode: 2800
Episode: 2900
Episode: 3000
Episode: 3100
Episode: 3200
Episode: 3300
Episode: 3400
Episode: 3500
Episode: 3600
Episode: 3700
Episode: 3800
Episode: 3900
Episode: 4000
Episode: 4100
Episode: 4200
Episode: 4300
Episode: 4400
Episode: 4500
Episode: 4600
Episode: 4700
Episode: 4800
Episode: 4900
Episode: 5000
Episode: 5100
Episode: 5200
Episode: 5300
Episode: 5400
Episode: 5500
Episode: 5600
Episode: 5700
Episode: 5800
Episode: 5900
Episode: 6000
Episode: 6100
Episode: 6200
Episode: 6300
Episode: 6400
Episode: 6500
Episode: 6600
Episode: 6700
Episode: 6800
Episode: 6900
Episode: 7000
Episode: 7100
Episode: 7200
E

(13.042, 0.0)