In [None]:
import numpy as np
from tqdm import tqdm
import gym
import math
import random
import matplotlib.pyplot as plt

In [None]:
# Build the Taxi-v3 environment and cache the pieces later cells read.
env = gym.make('Taxi-v3')
# NOTE(review): despite the names, these hold whatever env.action_space /
# env.observation_space return (space objects), not integer counts.
num_actions, num_obs = env.action_space, env.observation_space
# Landmark coordinates, converted to lists, in the order the environment stores them.
R, G, B, Y = (list(landmark) for landmark in env.unwrapped.locs)
env.reset()

In [None]:
def get_passenger_position(passenger_id):
    """Map a landmark index to its coordinates.

    Indices 0, 1, 2 and 3 select the module-level ``R``, ``G``, ``B`` and ``Y``
    coordinates respectively. Any other index yields ``None``.
    """
    landmark_by_index = {0: R, 1: G, 2: B, 3: Y}
    return landmark_by_index.get(passenger_id)
    
    
def get_state(state):
    """Decode an encoded environment state into a 4-element NumPy array.

    The array holds, in order, the four values produced by
    ``env.unwrapped.decode(state)``: row, column, passenger index and
    destination index.
    """
    taxi_row, taxi_col, passenger_idx, destination_idx = env.unwrapped.decode(state)
    return np.asarray([taxi_row, taxi_col, passenger_idx, destination_idx])

# Sanity check: show the four landmark coordinates read from the environment.
print(R,G,B,Y)

In [None]:
# Inspect the freshly reset environment, then try one primitive action.
# Read the encoded state from the unwrapped env (as the other cells do);
# wrappers are not guaranteed to forward the `.s` attribute.
curr_state = get_state(env.unwrapped.s)
pass_pose = get_passenger_position(curr_state[2])
dest_pose = get_passenger_position(curr_state[3])
print("Taxi at",curr_state[0:2])
# Each label is now paired with its own value (they were swapped before).
print("Passenger at",pass_pose)
print("Destination",dest_pose)
# Action 4 is "pickup" in Taxi-v3's action table; this step mutates the env.
next_state, reward, done, _, _ = env.step(4)
row, col, pass_id, dest_id = env.unwrapped.decode(next_state)
# Passenger index of the state observed *before* the step above.
print(curr_state[2])

In [None]:
# One independent 500 x 6 table per landmark option (states x primitive actions).
Q_r, Q_b, Q_g, Q_y = (np.zeros((500, 6)) for _ in range(4))
# Top-level table: 10 columns, i.e. the 6 primitive actions plus the 4 options.
Q = np.zeros((500, 10))

In [None]:
# Training length: number of episodes (EPS) and the per-episode step cap.
EPS, MAX_STEPS = 10_000, 100

# Q-learning step size and discount factor.
ALPHA, GAMMA = 0.1, 0.99

# Exploration schedule: starting rate, lower bound, and exponential decay constant.
exploration_rate = 1.0
MIN_EXP, EXP_DECAY = 0.01, 0.01

In [None]:
# Greedy action selectors, one per option-specific Q-table.
def choose_action_red(state,Q_r):
    """Return the index of the largest entry in row ``state`` of ``Q_r``."""
    return np.argmax(Q_r[state])

def choose_action_blue(state,Q_b):
    """Return the index of the largest entry in row ``state`` of ``Q_b``."""
    row_values = Q_b[state]
    return np.argmax(row_values)

def choose_action_green(state,Q_g):
    """Return the index of the largest entry in row ``state`` of ``Q_g``."""
    return np.argmax(Q_g[state])

def choose_action_yellow(state,Q_y):
    """Return the index of the largest entry in row ``state`` of ``Q_y``."""
    row_values = Q_y[state]
    return np.argmax(row_values)

def choose_action(q, state, epsilon=None):
    """Epsilon-greedy selection over every column of ``q``.

    Args:
        q: 2-D table indexed as ``q[state]`` to obtain one value per action.
        state: Row index into ``q``.
        epsilon: Probability of taking a random action. When ``None`` (the
            default) the module-level ``exploration_rate`` is used.

    Returns:
        An action index in ``[0, len(q[state]))``.

    Random actions are drawn uniformly from all columns of the table. The
    previous hard-coded bounds (0..7 in one branch, 0..6 in the other) could
    never select the last columns of a 10-column table.
    """
    action_values = q[state]
    n_actions = len(action_values)

    # An all-zero row carries no preference yet, so act uniformly at random.
    if not np.any(action_values):
        return np.random.randint(n_actions)

    if epsilon is None:
        epsilon = exploration_rate

    # Explore with probability epsilon, otherwise exploit the greedy action.
    if np.random.rand() < epsilon:
        return np.random.randint(n_actions)
    return np.argmax(action_values)

    

In [None]:
# Options where policy is greedy wrt the corresponding Q Value function
def Red(Q_r,state):
    """Option "drive to R": greedy action plus a termination flag.

    Args:
        Q_r: Q-table used by this option's greedy policy.
        state: Encoded environment state (decoded here via ``get_state``).

    Returns:
        ``(optact, optdone)`` where ``optact`` is the greedy action for
        ``state`` and ``optdone`` is True when the taxi's (row, col) in
        ``state`` equals the landmark ``R``.
    """
    optact = choose_action_red(state, Q_r)
    taxi_position = get_state(state)[0:2]
    # Compare by value: `is` between a NumPy slice and a list is always False,
    # which meant the option could never report termination.
    optdone = bool(np.array_equal(taxi_position, R))
    return optact, optdone

def Green(Q_g,state):
    """Option "drive to G": greedy action plus a termination flag.

    Args:
        Q_g: Q-table used by this option's greedy policy.
        state: Encoded environment state (decoded here via ``get_state``).

    Returns:
        ``(optact, optdone)`` where ``optact`` is the greedy action for
        ``state`` and ``optdone`` is True when the taxi's (row, col) in
        ``state`` equals the landmark ``G``.
    """
    optact = choose_action_green(state, Q_g)
    taxi_position = get_state(state)[0:2]
    # Compare by value: `is` between a NumPy slice and a list is always False,
    # which meant the option could never report termination.
    optdone = bool(np.array_equal(taxi_position, G))
    return optact, optdone
    
def Yellow(Q_y,state):
    """Option "drive to Y": greedy action plus a termination flag.

    Args:
        Q_y: Q-table used by this option's greedy policy.
        state: Encoded environment state (decoded here via ``get_state``).

    Returns:
        ``(optact, optdone)`` where ``optact`` is the greedy action for
        ``state`` and ``optdone`` is True when the taxi's (row, col) in
        ``state`` equals the landmark ``Y``.
    """
    # Use this option's own selector (previously called the green one).
    optact = choose_action_yellow(state, Q_y)
    taxi_position = get_state(state)[0:2]
    # Compare by value: `is` between a NumPy slice and a list is always False,
    # which meant the option could never report termination.
    optdone = bool(np.array_equal(taxi_position, Y))
    return optact, optdone

def Blue(Q_b,state):
    """Option "drive to B": greedy action plus a termination flag.

    Args:
        Q_b: Q-table used by this option's greedy policy.
        state: Encoded environment state (decoded here via ``get_state``).

    Returns:
        ``(optact, optdone)`` where ``optact`` is the greedy action for
        ``state`` and ``optdone`` is True when the taxi's (row, col) in
        ``state`` equals the landmark ``B``.
    """
    # Use this option's own selector (previously called the green one).
    optact = choose_action_blue(state, Q_b)
    taxi_position = get_state(state)[0:2]
    # Compare by value: `is` between a NumPy slice and a list is always False,
    # which meant the option could never report termination.
    optdone = bool(np.array_equal(taxi_position, B))
    return optact, optdone

In [None]:
# Train the top-level Q-table over primitive actions (columns 0-5) and
# options (columns 6-9), learning each option's own Q-table along the way.

# Top-level action index -> (option policy function, that option's Q-table).
option_table = {
    6: (Red, Q_r),
    7: (Green, Q_g),
    8: (Blue, Q_b),
    9: (Yellow, Q_y),
}

# Iterate over episodes
for episode in tqdm(range(EPS)):
    state, _ = env.reset()
    done = False
    total_reward = 0
    steps = 0  # counts primitive environment steps, including those inside options
    while not done and steps < MAX_STEPS:
        action = choose_action(Q,state)

        if action < 6:
            # Primitive action: ordinary one-step Q-learning update.
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            steps += 1
            # Single statement on purpose: a line starting with `+ ALPHA * (...)`
            # is parsed as a separate expression and leaves the table unchanged.
            td_target = reward + GAMMA * np.max(Q[next_state, :])
            Q[state, action] += ALPHA * (td_target - Q[state, action])
            total_reward += reward
            state = next_state

        elif action in option_table:
            option_policy, q = option_table[action]
            current_state = state   # state in which the option was initiated
            reward_bar = 0.0        # discounted return accumulated during the option
            count = 0               # number of primitive steps taken by the option
            optdone = False

            # Run the option until it terminates, the episode ends, or the step
            # budget is exhausted. Without the last two guards this loop could
            # spin forever or keep stepping a finished environment.
            while not optdone and not done and steps < MAX_STEPS:
                optact, optdone = option_policy(q, state)
                next_state, reward, terminated, truncated, _ = env.step(optact)
                done = terminated or truncated
                steps += 1

                # Intra-option one-step Q-learning update on the option's table.
                opt_target = reward + GAMMA * np.max(q[next_state, :])
                q[state, optact] += ALPHA * (opt_target - q[state, optact])

                # Accumulate sum_k GAMMA**k * r_k so earlier rewards are
                # discounted less than later ones.
                reward_bar += (GAMMA ** count) * reward
                count += 1
                total_reward += reward
                state = next_state

            # SMDP Q-learning update for the option as a whole. The outer loop
            # guard guarantees at least one primitive step was taken (count >= 1).
            smdp_target = reward_bar + (GAMMA ** count) * np.max(Q[state, :])
            Q[current_state, action] += ALPHA * (smdp_target - Q[current_state, action])

    # Decay the exploration rate
    exploration_rate = MIN_EXP + (1 - MIN_EXP) * np.exp(-EXP_DECAY * episode)

    # Print the total reward for each episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Print the final Q-table
print("Final Q-table:")
print(Q)