In [254]:
# imports
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam,SGD
from collections import deque
import random
from matplotlib import pyplot as plt
import copy
import numpy as np
import gym
from torchsummary import summary
import warnings
warnings.filterwarnings('ignore')

# set seed for torch library
torch.manual_seed(33)

<torch._C.Generator at 0x7fe74291e110>

In [255]:
# make the taxi environment and verify
env = gym.make("Taxi-v3")
env

<TimeLimit<TaxiEnv<Taxi-v3>>>

In [256]:
# view observation space and action space
obs_space = env.observation_space
print(f"Observation space: {obs_space}")
act_space = env.action_space
print(f"Action space: {act_space}")

Observation space: Discrete(500)
Action space: Discrete(6)


In [257]:
# test random environment, action, and step
obs_test = env.reset()
act_test = env.action_space.sample()
nst,rt,dt,_=env.step(act_test)
print(f"Initial observation: {obs_test}\nAction: {act_test}\nNew observation: {nst,rt,dt}")
exp_test = [obs_test,nst,rt,dt]
print(f"Exp: {exp_test}")

Initial observation: 101
Action: 1
New observation: (1, -1, False)
Exp: [101, 1, -1, False]


In [258]:
# create test q-table with initial values as 0 to preview and test q action
Q_test = np.zeros((500,6)) # environment has 500 states and 6 actions
# Q_test[state][action]
state_test = 2
Q_test[state_test] = [1,2,10,22,5,3]
a_test = np.argmax(Q_test[state_test])
print(f"Max Q for state {state_test}: {np.max(Q_test[state_test])}")
print(f"Corresponding action: {a_test}")
print(f"Q:\n{Q_test}\nShape of Q: {Q_test.shape}")
print(f"Q[2,3] = {Q_test[2,4]}")
print(Q_test[2])

Max Q for state 2: 22.0
Corresponding action: 3
Q:
[[ 0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.]
 [ 1.  2. 10. 22.  5.  3.]
 ...
 [ 0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.]]
Shape of Q: (500, 6)
Q[2,3] = 5.0
[ 1.  2. 10. 22.  5.  3.]


In [259]:
# test random action
for i in range(10):
    test_a = env.action_space.sample()
    print(test_a)

5
3
3
3
1
3
3
3
1
4


In [260]:
# create actual Q table
Q = np.zeros((500,6)) # environment has 500 states and 6 actions
print(f"Q:\n{Q}\nQ shape: {Q.shape}")

Q:
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Q shape: (500, 6)


In [261]:
# explore / exploit

def action(state,epsilon=0.5):
    
    if random.uniform(0,1) < epsilon: # choose random action if random number is less than epsilon
        a1 = env.action_space.sample()
        return a1
    
    else: # choose action with max Q value if random number is greater than epsilon
        if np.max(Q[state]) > 0: # choose action with max Q value if max Q for state is greater than 0
            a2 = np.argmax(Q[state]) # action corresponding to max Q for that state
            return a2
        else: # otherwise random action
            a3 = env.action_space.sample()
            return a3

In [265]:
# Q-learning

def q_learning(episodes,lr,gamma):
    #print(state,action)
    for i in range(episodes):
        # if i%1000 == 0:
        #     print(f"Episode {i}:")
        obs = env.reset()
        d = False
        while d == False:
            act = action(obs)
            ns,r,d,_=env.step(act)
            exp = [obs,ns,r,d]
            #print(f"exp: {exp}")
            Q[obs,act] = Q[obs,act] + lr * (exp[2] + gamma * np.max(Q[ns,:]) - Q[obs,act]) # bellman eqn
            obs = ns
        if i%1000 == 0:
            print(f"\tend of episode {i}")
    print(f"\nEND OF ALL EPISODES\n\nFinal Q-values:\n{Q}")

In [266]:
# initialize parameters
lr = 1
loops = 100000

# run episodes
print("Starting learning:")
q_learning(loops,lr,0.5)

Starting learning:
	end of episode 0
	end of episode 1000
	end of episode 2000
	end of episode 3000
	end of episode 4000
	end of episode 5000
	end of episode 6000
	end of episode 7000
	end of episode 8000
	end of episode 9000
	end of episode 10000
	end of episode 11000
	end of episode 12000
	end of episode 13000
	end of episode 14000
	end of episode 15000
	end of episode 16000
	end of episode 17000
	end of episode 18000
	end of episode 19000
	end of episode 20000
	end of episode 21000
	end of episode 22000
	end of episode 23000
	end of episode 24000
	end of episode 25000
	end of episode 26000
	end of episode 27000
	end of episode 28000
	end of episode 29000
	end of episode 30000
	end of episode 31000
	end of episode 32000
	end of episode 33000
	end of episode 34000
	end of episode 35000
	end of episode 36000
	end of episode 37000
	end of episode 38000
	end of episode 39000
	end of episode 40000
	end of episode 41000
	end of episode 42000
	end of episode 43000
	end of episode 44000
	end

In [289]:
# test the Q-table values

epochs = 0
penalties, reward = 0, 0
frames = []

obs = env.reset()
env.s = obs
d = False
while d == False:
    act = np.argmax(Q[obs])
    ns,r,d,_=env.step(act)
    exp = [obs,ns,r,d]
    if reward == -10:
        penalities += 1
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': ns,
        'action': act,
        'reward': r
        }
    )
    epochs += 1
    obs = ns
   
print(f"Timesteps taken: {epochs}")
print(f"Penalties incurred: {penalties}")

Timesteps taken: 13
Penalties incurred: 0


In [290]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.1)
        
print_frames(frames)

+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 13
State: 85
Action: 5
Reward: 20


In [295]:
# run n episodes
trials = 100
frames100 = []

for i in range(trials):
    
    epochs = 0
    penalties, reward = 0, 0

    obs = env.reset()
    env.s = obs
    d = False
    while d == False:
        act = np.argmax(Q[obs])
        ns,r,d,_=env.step(act)
        exp = [obs,ns,r,d]
        if reward == -10:
            penalities += 1
        frames100.append({
            'frame': env.render(mode='ansi'),
            'state': ns,
            'action': act,
            'reward': r
            }
        )
        epochs += 1
        obs = ns

print_frames(frames100)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

Timestep: 1308
State: 410
Action: 5
Reward: 20


In [297]:
print(f"Total penalties for {trials} episodes: {penalties}")

Total penalties for 100 episodes: 0
