## Imports

In [1]:
import numpy as np
import random
import gym
from IPython.display import clear_output

## Creating the environment

In [2]:
# import the environment of Taxi-v3
environment = gym.make("Taxi-v3").env
# displaying the environment
environment.render()

+---------+
|[35mR[0m: | : :[43mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



## No.of states & actions

In [3]:
print(environment.observation_space.n)

print("No.of actions:\t",environment.action_space.n)

500
No.of actions:	 6


## Setting learning rate, discount factor and epsilon

In [4]:
alpha = 0.1
gamma = 0.6
epsilon = 0.1

## Initial Q table

In [5]:
# in q table rows represent states column represents actions
q_table = np.zeros((500,6))
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

## Train the Q table

In [6]:
num_of_episodes = 100000

for episode in range(0, num_of_episodes): 
    # Reset the environment, to get new situation in every episode (iteration)
    state = environment.reset()

    # Initialize variables    
    reward = 0
    # in each episode it will come to a terminate state to track that
    terminated = False
    
    while not terminated:
        # Take learned path or explore new actions based on the epsilon        
        if random.uniform(0, 1) < epsilon:
            action = environment.action_space.sample()
        else:
            # state represent rows, so when row is given argmax will give the index of highest value column in that row
            action = np.argmax(q_table[state])
            # when action is given it will give these things
            next_state, reward, terminated, info = environment.step(action)

            # calculate q value for that action
            q_value = q_table[state, action]
            
            # np.argmax gives the index of the maximum but np.max gives the maximum value
            max_value = np.max(q_table[next_state])
            # Q learning equation
            new_q_value = (1 - alpha) * q_value + alpha * (reward + gamma * max_value)

            # Update Q-table with the new q value
            q_table[state, action] = new_q_value
            state = next_state 

print(q_table)

[[ 0.          0.          0.          0.          0.          0.        ]
 [-2.32296135 -2.3239767  -2.32222995 -2.3239767  -2.27325184 -2.8816    ]
 [-1.58100479 -1.56146688 -1.57100807 -1.56146688 -0.7504     -1.96      ]
 ...
 [-1.02949658 -0.78884157 -1.02949658 -1.02504367 -1.96       -1.96      ]
 [-2.03112124 -2.04034732 -2.03112124 -2.03246275 -2.8816     -2.8816    ]
 [-0.196      -0.196      -0.196      -0.07       -1.         -1.        ]]


## Test the goodness of the trained table

In [7]:
total_penalties = 0 
num_of_episodes = 100 

for _ in range(num_of_episodes):
    # Reset the environment, to get new situation in every episode (iteration)
    state = environment.reset()
    penalties = 0
    reward = 0
    terminated = False

    while not terminated:
        # state represent rows, so when row is given argmax will give the index of highest value column in that row
        action = np.argmax(q_table[state])
        # when action is given it will give these things
        state, reward, terminated, info = environment.step(action)

        # rewards < -10 were considered as penalties
        if reward <= -10:
            penalties += 1
    
    total_penalties += penalties

print(total_penalties / num_of_episodes) 

0.0
