In [1]:
!pip install gym
!pip install pygame


Collecting gym
  Downloading gym-0.23.1.tar.gz (626 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Collecting importlib-metadata>=4.10.0
  Downloading importlib_metadata-4.10.0-py3-none-any.whl (17 kB)
Collecting gym-notices>=0.0.4
  Downloading gym_notices-0.0.6-py3-none-any.whl (2.7 kB)
Building wheels for collected packages: gym
  Building wheel for gym (PEP 517): started
  Building wheel for gym (PEP 517): finished with status 'done'
  Created wheel for gym: filename=gym-0.23.1-py3-none-any.whl size=701375 sha256=12cf28ac4c1fa1efc8b775b5ca45cf61961dede82a1ae281265ccbf53801dc22
  Stored in directory: c:\users\asus\appdata\local\pip\cache\wheels\4e\be\7e\92a54668db96883e38ce60a9249dc55de7cd6eee49e7311940
Successfully bu

In [4]:
import gym
import numpy as np
import random
from IPython.display import clear_output


In [9]:
# Create the Taxi-v3 environment, reset it to a starting state, and draw that state.
env = gym.make('Taxi-v3')
env.reset()
env.render()

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |B: |
+---------+



In [10]:
# Dimensions of the Q-table, read from the environment's discrete spaces.
action_size = env.action_space.n
state_size = env.observation_space.n

print('Action Space: ', action_size)
print('State Size: ', state_size)

Action Space:  6
State Size:  500


In [11]:
# Q-table: one row per state, one column per action, every entry starting at zero.
q_table = np.zeros(shape=(state_size, action_size))

# Display the freshly initialised table.
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [12]:
# Hyperparameters

# Episode budget:
total_ep = 1500       # number of episodes used to train the agent
total_test_ep = 100   # number of episodes used to test the agent
max_steps = 100       # maximum number of steps per episode

# Q-learning update:
lr = 0.81     # learning rate
gamma = 0.96  # discount rate

# Exploration (epsilon-greedy):
# the first episodes should explore a lot; exploration shrinks as training goes on.
max_epsilon = 1.0   # upper bound of the decay schedule
min_epsilon = 0.01  # lower bound of the decay schedule
decay_rate = 0.01   # decay rate of epsilon
epsilon = 0.9       # exploration rate used before the first decay step

![alt text](https://cdn-media-1.freecodecamp.org/images/0*voKUaGu68-cDuncy.)

In [13]:
# Implementing the Q-learning algorithm:
# in every episode the agent acts epsilon-greedily, the Q-table is updated after each
# step, and epsilon is decayed at the end of the episode so later episodes explore less.
# Note: this cell updates q_table and epsilon in place.

for episode in range(total_ep):

  # Reset the environment to get the starting state of this episode.
  state = env.reset()
  done = False

  # Take at most max_steps steps per episode.
  for step in range(max_steps):

    # Choose an action a in the current state s.
    # First draw a random number in [0, 1].
    exp_exp_tradeoff = random.uniform(0, 1)

    # Number greater than epsilon --> exploitation
    # (take the action with the biggest Q value for the current state).
    if exp_exp_tradeoff > epsilon:
      action = np.argmax(q_table[state, :])

    # Otherwise --> exploration (random action).
    else:
      action = env.action_space.sample()

    # Take the action (a) and observe the outcome state (s') and the reward (r).
    new_state, reward, done, info = env.step(action)

    # Update Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
    td_target = reward + gamma * np.max(q_table[new_state, :])
    q_table[state, action] += lr * (td_target - q_table[state, action])

    # Move on to the new state.
    state = new_state

    # Stop the episode as soon as the environment reports it is done.
    if done:
      break

  # Reduce epsilon (we need less and less exploration).
  # episode + 1 is the number of episodes completed so far; the loop variable itself
  # is left untouched instead of being reassigned inside the loop.
  epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode + 1))

After 1500 training episodes, we have a Q-table that we can use as a cheat sheet: in each state, pick the action with the highest Q-value.
 

In [14]:
# Using the Q-table: evaluate the trained agent.
# The agent always takes the highest-valued action and the Q-table is not updated here.

rewards = []

for episode in range(total_test_ep):
  state = env.reset()
  done = False
  total_rewards = 0
  print('=========================')
  print('EPISODE: ', episode)

  for step in range(max_steps):

    env.render()

    # Take the action based on the Q-table:
    action = np.argmax(q_table[state, :])

    new_state, reward, done, info = env.step(action)

    total_rewards += reward

    # If the episode finishes, stop stepping.
    if done:
      break

    state = new_state

  # Record the score of every episode, including those cut off at max_steps,
  # so the average below really covers all total_test_ep episodes.
  rewards.append(total_rewards)
  print('Score: ', total_rewards)

env.close()
print('Score Over Time: {}'.format(sum(rewards)/total_test_ep))

EPISODE:  0
+---------+
|[34;1mR[0m: | : :G|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+

+---------+
|[34;1mR[0m:[43m [0m| : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[42mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
|[42m_[0m: | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
|[42m_[0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| :[42m_[0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : | : : |
| : :[42m_[0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|

  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (West)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | :[42m_[0m|
|Y| : |B: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : :[42m_[0m|
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | : :[42m_[0m|
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35m[42mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
Score:  12
EPISODE:  68
+---------+
|[35mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

+---------+
|[35mR[0m: | : :G|
| : | : : |
|[43m [0m: 