In [1]:
# Packages needed for this assignment
import random
import gym
import numpy as np
import time
import matplotlib.pyplot as plt
from IPython.display import clear_output # Used to clear the output of a Jupyter cell.

In [2]:
def test_policy(agent, env, wait=0.1, max_steps=40, render=True):
    state = env.reset()
    step = 0
    total_reward = 0
    done = False
    while not done and step < max_steps:
        action = agent.act(state)
        state, reward, done, info = env.step(action)
        total_reward += reward
        step += 1
        
        if render:
            clear_output(wait=True)
            env.render()
            # Show some information
            print("Time step:", step)
            print("Reward:", reward)
            print("Total reward:", total_reward)
            time.sleep(wait)
    env.close()
    return total_reward

In [3]:
def render_greedy_policy(Q, grid_shape=(5, 5)):
    """Print an illustration of the greedy policy with respect to ``Q``.

    For every state (row of ``Q``) the action with the largest value is
    looked up and drawn as one letter: 0 -> 'L', 1 -> 'D', 2 -> 'R',
    3 -> 'U' (presumably left/down/right/up -- confirm against the
    environment's action encoding). Any other action index is drawn as '?'
    instead of being silently shown as 'L'.

    Args:
        Q: 2-D array indexed as ``Q[state, action]``.
        grid_shape: Shape the per-state letters are arranged into before
            printing. Defaults to the 5x5 grid used originally; pass another
            shape for state spaces of a different size.

    Raises:
        ValueError: Propagated from ``reshape`` when the number of states
            does not fit ``grid_shape``.
    """
    symbols = ('L', 'D', 'R', 'U')
    n_states = Q.shape[0]
    greedy = np.full(n_states, '?')
    for s in range(n_states):
        a = int(np.argmax(Q[s, :]))
        if a < len(symbols):
            greedy[s] = symbols[a]

    print(greedy.reshape(grid_shape))

In [4]:
def train_q(agent, env, n_episodes, max_steps=500000):
    """Train ``agent`` on ``env`` for up to ``n_episodes`` episodes.

    In every step the agent picks an action with ``agent.act``, the
    environment is advanced with ``env.step`` and the agent is updated
    through ``agent.learn(state, action, reward, state_next)``.

    Args:
        agent: Object exposing ``act(state)`` and
            ``learn(state, action, reward, state_next)``.
        env: Environment exposing ``reset()`` and ``step(action)``
            (returning a 4-tuple ``state, reward, done, info``).
        n_episodes: Number of episodes to run.
        max_steps: Budget on the total number of environment steps summed
            over all episodes. Training stops as soon as the running step
            count exceeds this value.

    Returns:
        A tuple ``(total_rewards, steps)`` of 1-D arrays with one entry per
        completed episode: the summed reward of that episode and the
        cumulative number of steps taken once it finished. When the step
        budget is exhausted mid-episode, only the episodes completed so far
        are returned (the unfinished episode is dropped).
    """
    step = 0
    steps = np.zeros(n_episodes)  # Cumulative step count after each episode
    total_rewards = np.zeros(n_episodes)
    for i in range(n_episodes):
        state = env.reset()
        rewards = 0
        done = False
        while not done:
            action = agent.act(state)
            state_next, reward, done, info = env.step(action)
            agent.learn(state, action, reward, state_next)
            state = state_next
            step += 1
            rewards += reward

            if step > max_steps:
                # Budget exhausted: return in the same (total_rewards, steps)
                # order as the normal exit, limited to the i finished episodes.
                return total_rewards[:i], steps[:i]

        steps[i] = step
        total_rewards[i] = rewards
    return total_rewards, steps

In [5]:
class QAgent():
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Attributes:
        n_states: Number of rows of the Q-table.
        n_actions: Number of columns of the Q-table.
        alpha: Step size applied to the TD error in ``learn``.
        gamma: Discount factor applied to the next state's best value.
        epsilon: Probability of selecting the uniformly drawn action in ``act``.
        Q: ``(n_states, n_actions)`` array of action values, initialised to zero.
    """

    def __init__(self, n_states, n_actions, gamma, alpha, epsilon):
        self.n_states, self.n_actions = n_states, n_actions
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.Q = np.zeros((n_states, n_actions))

    def act(self, state):
        """Return an action for ``state`` following the epsilon-greedy policy."""
        greedy_action = np.argmax(self.Q[state, :])
        random_action = np.random.choice(self.n_actions)
        # Pick the greedy candidate with probability 1 - epsilon, otherwise
        # the uniformly drawn one.
        candidates = [greedy_action, random_action]
        weights = [1 - self.epsilon, self.epsilon]
        picked = np.random.choice(candidates, 1, p=weights)
        return picked[0]

    def learn(self, s, a, r, s_next):
        """Apply the Q-learning update for the transition ``(s, a, r, s_next)``."""
        td_target = r + (self.gamma * np.max(self.Q[s_next, :]))
        td_error = td_target - self.Q[s, a]
        self.Q[s, a] = self.Q[s, a] + self.alpha * td_error

In [6]:
# Create the Taxi-v3 environment and reset it to obtain an initial state.
env = gym.make('Taxi-v3')
state = env.reset()

In [7]:
# Build a Q-learning agent sized to the environment's observation and action
# spaces, then train it for 15000 episodes. The value returned by train_q is
# the cell's last expression, so the notebook displays it as the cell output.
agentQ = QAgent(env.observation_space.n, env.action_space.n, gamma=1, alpha=0.1, epsilon=0.1)
train_q(agentQ, env, n_episodes=15000)

(array([-551., -578., -506., ...,    6.,    5.,    2.]),
 array([2.00000e+02, 4.00000e+02, 6.00000e+02, ..., 2.93413e+05,
        2.93429e+05, 2.93439e+05]))

In [8]:
# Roll out the trained agent for at most 100 steps, rendering every step.
# NOTE(review): agentQ was created with epsilon=0.1 and nothing visible here
# resets it, so this rollout is epsilon-greedy rather than purely greedy.
total_reward = test_policy(agentQ, env, max_steps=100)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Time step: 20
Reward: 20
Total reward: 1


In [13]:
# Inspect the learned action values of one state (index 23) and the index of
# its greedy action.
# NOTE(review): the commented-out lines are leftover experimentation (forcing
# epsilon to 0 and querying agentQ.act); consider deleting them. This cell also
# rebinds `state`, which earlier held the value returned by env.reset().
#agentQ.epsilon = 0
state=23
#greedy_action = agentQ.act(state)
#print(greedy_action)
print(agentQ.Q[state,:])
print(np.argmax(agentQ.Q[state,:]))

[ 0.69985877  1.1356501  -3.47108066 10.99944946 -5.0505475  -3.76422564]
3
