# Creating Taxi Environment from OpenAI gym

In [None]:
import gym
import numpy as np
# Create the Taxi-v3 environment; `env` is reused by every later cell.
env = gym.make("Taxi-v3")
# Begin an episode. reset() returns the starting state, which is printed below
# and later used directly as a row index into the Q-table.
state = env.reset()
print(state)
# Draw the current grid.
env.render()

121
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



# Possible States and Actions

In [None]:
# Sizes of the discrete observation and action spaces; these become the
# two dimensions of the Q-table built further down.
n_states, n_actions = env.observation_space.n, env.action_space.n

# Choosing Actions Randomly

In [None]:
# Baseline: act uniformly at random until a step yields a reward of 20,
# which is the value this loop treats as "solved".
# NOTE(review): `done` is assigned but never checked, so the loop keeps
# stepping even after the environment reports the episode as finished.
# Confirm this is intended for the gym version in use.
state = env.reset()
counter, g, reward = 0, 0, None
while True:
    state, reward, done, info = env.step(env.action_space.sample())
    counter += 1   # number of steps taken so far
    g += reward    # running sum of rewards
    if reward == 20:
        break
print("Solved in {} Steps with a total reward of {}".format(counter,g))


Solved in 4807 Steps with a total reward of -18844


# Learning the Optimal Q-Values

In [None]:
# Tabular Q-learning over `episodes` episodes.
# - Actions are always the greedy argmax of the current Q row (no random exploration).
# - The update bootstraps from max(Q[next_state]) with no discount factor applied.
alpha = 0.618      # step size used in the Q update
episodes = 1000    # number of training episodes
G = 0              # per-episode return; reset at the start of every episode

# Q-table: one row per state, one column per action, initialised to zero.
Q = np.zeros((n_states, n_actions))
# Intended to hold a history of the Q-values for all states; it starts with a
# zero-length third axis and nothing in this cell writes to it.
Q_hist = np.zeros((n_states, n_actions, 0))
rewardTracker = []  # return of every finished episode, in order

for episode in range(1, episodes + 1):
    state = env.reset()
    G, reward, done = 0, 0, False
    while not done:
        action = np.argmax(Q[state])
        state2, reward, done, info = env.step(action)
        # Move Q[state, action] a fraction `alpha` of the way towards the target.
        td_target = reward + np.max(Q[state2])
        Q[state, action] += alpha * (td_target - Q[state, action])
        G += reward
        state = state2
    rewardTracker.append(G)

    # Progress report every 100 episodes: latest return and running mean.
    if episode % 100 == 0:
        print('Episode {} Reward: {}   Total Average Reward: {} '.format(episode,G, sum(rewardTracker)/len(rewardTracker)))

Episode 100 Reward: -282   Total Average Reward: -217.05 
Episode 200 Reward: -92   Total Average Reward: -139.52 
Episode 300 Reward: 10   Total Average Reward: -93.81333333333333 
Episode 400 Reward: 10   Total Average Reward: -68.9 
Episode 500 Reward: 4   Total Average Reward: -53.828 
Episode 600 Reward: 7   Total Average Reward: -43.526666666666664 
Episode 700 Reward: 1   Total Average Reward: -36.28142857142857 
Episode 800 Reward: 13   Total Average Reward: -30.76375 
Episode 900 Reward: 7   Total Average Reward: -26.467777777777776 
Episode 1000 Reward: 2   Total Average Reward: -23.071 


# Implementing the Optimal Q-Values

In [None]:
# Play one episode with the learned table, drawing the grid after each step.
state, done = env.reset(), False

while not done:
    # Greedy policy: pick the action whose Q-value is largest for this state.
    action = np.argmax(Q[state])
    state, reward, done, info = env.step(action)
    env.render()

+---------+
|R: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[42mY[0m| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[42m_[0m| : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
|[42m_[0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| :[42m_[0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : | : : |
| : :[42m_[0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|Y| 