# Using SARSA

## 1. Import all necessary libraries and initialize the environment

In [1]:
import random
import gym
from IPython import display
import time
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
env = gym.make('Taxi-v2')

### Environment result

In [2]:
env.render()

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



* The letters (R, G, Y, B) represents the different locations 
* A yellow colored rectangle is the taxi driving by our agent

## 3. Define Q table

In [3]:
Q = {}
for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
        Q[(s,a)] = 0.0

* Q table has a dictionary which stores state-action pair specifying value of performing an action in a state s
* s -> 500개 = 25 taxi positions x 5 possible locations of the passenger x 4 destination locations
* a -> 6개 (actions) : 0 - move south, 1 - north, 2 - east, 3 - west, 4 - pickup passenger, 5 - dropoff passenger

## 4. Define a function for performing epsilon-greedy policy

In [8]:
def epsilon_greedy(state, epsilon):
    if random.uniform(0,1) < epsilon:
        return env.action_space.sample()
    else:
        return max(list(range(env.action_space.n)), key = lambda x: Q[(state,x)])

* 0에서 1사이의 값을 uniform distribution에서 randomly 생성한 것이 epsilon보다 작으면 explore
* 아니면 exploit
* max()부분 : 가능한 action 개수만큼의 범위를 0부터 list화 한 후, 각각을 x에 대입한 Q[(state, x)] 값 중 가장 큰 값을 반환

## 5. Initialize necessary variables
* alpha = TD learning rate
* gamma = discount factor
* epsilon = epsilon value in epsilon greedy policy

In [25]:
alpha = 0.4
gamma = 0.99
epsilon = 0.02

## 6. Perform SARSA

In [27]:
for i in range(4000):
    # store cumulative reward of each episodes in r
    r = 0
    
    # initialize the state
    state = env.reset()
    
    # select the action using epsilon-greedy policy
    action = epsilon_greedy(state,epsilon)
    
    while True:
        # Perform the action in the state and move the next state
        nextstate, reward, done, _ = env.step(action)
        
        # Pick up the next action using epsilon greedy policy
        nextaction = epsilon_greedy(nextstate,epsilon)
        
        # Calculate the Q value of previous state using our update rule
        Q[(state, action)] += alpha * (reward + gamma * Q[(nextstate, nextaction)] - Q[(state, action)])
        
        # Update our state and action with next action and next state
        action = nextaction
        state = nextstate
        
        # store the rewards
        r += reward
        
        # we will break the loop, if we are at the terminal state of the episode
        if done:
            break
    
    print("total reward: ", r)

env.close()

total reward:  -14
total reward:  7
total reward:  10
total reward:  4
total reward:  9
total reward:  11
total reward:  8
total reward:  8
total reward:  10
total reward:  6
total reward:  5
total reward:  10
total reward:  8
total reward:  9
total reward:  6
total reward:  5
total reward:  9
total reward:  9
total reward:  11
total reward:  7
total reward:  4
total reward:  11
total reward:  6
total reward:  5
total reward:  7
total reward:  6
total reward:  11
total reward:  11
total reward:  -3
total reward:  8
total reward:  6
total reward:  7
total reward:  9
total reward:  10
total reward:  1
total reward:  9
total reward:  4
total reward:  7
total reward:  7
total reward:  -3
total reward:  -8
total reward:  9
total reward:  11
total reward:  3
total reward:  6
total reward:  8
total reward:  7
total reward:  7
total reward:  7
total reward:  7
total reward:  5
total reward:  7
total reward:  7
total reward:  15
total reward:  8
total reward:  9
total reward:  12
total reward: 

total reward:  12
total reward:  8
total reward:  7
total reward:  9
total reward:  -1
total reward:  6
total reward:  7
total reward:  -1
total reward:  6
total reward:  4
total reward:  10
total reward:  10
total reward:  8
total reward:  9
total reward:  11
total reward:  -6
total reward:  -2
total reward:  9
total reward:  11
total reward:  4
total reward:  8
total reward:  5
total reward:  8
total reward:  -1
total reward:  -4
total reward:  10
total reward:  7
total reward:  12
total reward:  6
total reward:  11
total reward:  -6
total reward:  5
total reward:  7
total reward:  9
total reward:  12
total reward:  3
total reward:  9
total reward:  9
total reward:  11
total reward:  8
total reward:  7
total reward:  9
total reward:  10
total reward:  -16
total reward:  6
total reward:  8
total reward:  6
total reward:  7
total reward:  9
total reward:  11
total reward:  7
total reward:  3
total reward:  0
total reward:  6
total reward:  9
total reward:  -3
total reward:  11
total re

total reward:  6
total reward:  6
total reward:  11
total reward:  11
total reward:  6
total reward:  11
total reward:  -11
total reward:  11
total reward:  11
total reward:  9
total reward:  -5
total reward:  9
total reward:  7
total reward:  4
total reward:  5
total reward:  8
total reward:  4
total reward:  7
total reward:  9
total reward:  6
total reward:  8
total reward:  7
total reward:  7
total reward:  0
total reward:  6
total reward:  10
total reward:  -2
total reward:  10
total reward:  -2
total reward:  7
total reward:  7
total reward:  -11
total reward:  -3
total reward:  5
total reward:  7
total reward:  11
total reward:  7
total reward:  7
total reward:  -5
total reward:  5
total reward:  6
total reward:  -51
total reward:  9
total reward:  12
total reward:  10
total reward:  9
total reward:  8
total reward:  9
total reward:  9
total reward:  8
total reward:  5
total reward:  14
total reward:  4
total reward:  9
total reward:  10
total reward:  6
total reward:  6
total re

total reward:  8
total reward:  4
total reward:  9
total reward:  7
total reward:  13
total reward:  7
total reward:  9
total reward:  9
total reward:  6
total reward:  9
total reward:  9
total reward:  11
total reward:  7
total reward:  7
total reward:  9
total reward:  9
total reward:  7
total reward:  3
total reward:  11
total reward:  8
total reward:  2
total reward:  6
total reward:  7
total reward:  6
total reward:  9
total reward:  1
total reward:  9
total reward:  12
total reward:  11
total reward:  9
total reward:  9
total reward:  6
total reward:  5
total reward:  3
total reward:  4
total reward:  8
total reward:  10
total reward:  5
total reward:  -6
total reward:  4
total reward:  5
total reward:  7
total reward:  8
total reward:  5
total reward:  12
total reward:  9
total reward:  13
total reward:  6
total reward:  9
total reward:  15
total reward:  10
total reward:  10
total reward:  5
total reward:  6
total reward:  14
total reward:  8
total reward:  12
total reward:  6


total reward:  4
total reward:  8
total reward:  10
total reward:  10
total reward:  5
total reward:  5
total reward:  7
total reward:  10
total reward:  5
total reward:  10
total reward:  9
total reward:  7
total reward:  -3
total reward:  10
total reward:  0
total reward:  5
total reward:  10
total reward:  3
total reward:  9
total reward:  6
total reward:  -4
total reward:  5
total reward:  7
total reward:  7
total reward:  9
total reward:  6
total reward:  4
total reward:  -132
total reward:  6
total reward:  11
total reward:  8
total reward:  6
total reward:  -3
total reward:  12
total reward:  11
total reward:  6
total reward:  5
total reward:  7
total reward:  6
total reward:  -166
total reward:  8
total reward:  1
total reward:  4
total reward:  -18
total reward:  9
total reward:  4
total reward:  3
total reward:  11
total reward:  6
total reward:  9
total reward:  1
total reward:  10
total reward:  6
total reward:  8
total reward:  11
total reward:  13
total reward:  11
total 

total reward:  -434
total reward:  7
total reward:  -524
total reward:  11
total reward:  10
total reward:  11
total reward:  7
total reward:  6
total reward:  13
total reward:  7
total reward:  6
total reward:  7
total reward:  -524
total reward:  7
total reward:  12
total reward:  6
total reward:  10
total reward:  -506
total reward:  -569
total reward:  12
total reward:  10
total reward:  10
total reward:  -623
total reward:  7
total reward:  -4
total reward:  8
total reward:  7
total reward:  7
total reward:  -4
total reward:  8
total reward:  8
total reward:  8
total reward:  6
total reward:  12
total reward:  8
total reward:  3
total reward:  7
total reward:  5
total reward:  -226
total reward:  6
total reward:  8
total reward:  12
total reward:  13
total reward:  4
total reward:  6
total reward:  8
total reward:  5
total reward:  -348
total reward:  8
total reward:  -2
total reward:  -149
total reward:  3
total reward:  7
total reward:  12
total reward:  9
total reward:  -100
to

total reward:  8
total reward:  9
total reward:  4
total reward:  8
total reward:  0
total reward:  11
total reward:  13
total reward:  -5
total reward:  13
total reward:  8
total reward:  11
total reward:  10
total reward:  2
total reward:  11
total reward:  11
total reward:  3
total reward:  -3
total reward:  10
total reward:  7
total reward:  6
total reward:  13
total reward:  6
total reward:  8
total reward:  11
total reward:  12
total reward:  9
total reward:  8
total reward:  7
total reward:  6
total reward:  8
total reward:  8
total reward:  10
total reward:  9
total reward:  12
total reward:  11
total reward:  -2
total reward:  4
total reward:  8
total reward:  11
total reward:  6
total reward:  9
total reward:  9
total reward:  12
total reward:  9
total reward:  -2
total reward:  8
total reward:  4
total reward:  5
total reward:  8
total reward:  11
total reward:  7
total reward:  -14
total reward:  11
total reward:  -315
total reward:  10
total reward:  7
total reward:  8
tot

total reward:  11
total reward:  8
total reward:  2
total reward:  5
total reward:  3
total reward:  8
total reward:  10
total reward:  6
total reward:  8
total reward:  7
total reward:  10
total reward:  10
total reward:  8
total reward:  6
total reward:  5
total reward:  9
total reward:  8
total reward:  7
total reward:  9
total reward:  -3
total reward:  13
total reward:  10
total reward:  4
total reward:  5
total reward:  12
total reward:  3
total reward:  8
total reward:  6
total reward:  7
total reward:  7
total reward:  8
total reward:  8
total reward:  7
total reward:  10
total reward:  11
total reward:  9
total reward:  12
total reward:  11
total reward:  5
total reward:  6
total reward:  11
total reward:  4
total reward:  8
total reward:  -2
total reward:  8
total reward:  11
total reward:  11
total reward:  11
total reward:  6
total reward:  6
total reward:  7
total reward:  6
total reward:  8
total reward:  11
total reward:  6
total reward:  5
total reward:  8
total reward:

* episode를 4000번 반복하면서 점점 reward가 큰 값으로 수렴함

## 7. Use the learned Q-table to see the optimal policy

In [28]:
state = env.reset()
total_reward = 0

while True:
    # select greedy action because now the policy is optimal
    action = max(list(range(env.action_space.n)), key = lambda x: Q[(state, x)])
    
    display.clear_output(wait=True)
    display.display(plt.gcf())
    env.render()
    
    state, reward, done, _ = env.step(action)
    total_reward += reward
    print('Episode Reward= ', total_reward)
    time.sleep(0.5)
    
    if done:
        break

env.close()

<matplotlib.figure.Figure at 0x7f67e7d3be10>

+---------+
|[35m[42mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
Episode Reward=  8


<matplotlib.figure.Figure at 0x7f67e7d3be10>

# using Q-learning

## 3. Define Q table

In [29]:
q = {}
for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
        q[(s,a)] = 0.0

* q table : will update the Q values according to our **Q learning update rule**

## 4. Define a function which updates Q table

In [34]:
def update_q_table(prev_state, action, reward, nextstate, alpha, gamma):
    qa = max([q[(nextstate, a)] for a in range(env.action_space.n)])
    q[(prev_state, action)] += alpha * (reward + gamma * qa - q[(prev_state, action)])

* Q-learning에서의 update rule

## 5. Define a function for performing epsilon-greedy policy

In [31]:
def epsilon_greedy_policy(state, epsilon):
    if random.uniform(0,1) < epsilon:
        return env.action_space.sample()
    else:
        return max(list(range(env.action_space.n)), key = lambda x: q[(state, x)])

## 6. Initialize necessary variables

In [32]:
alpha = 0.4
gamma = 0.99
epsilon = 0.02

## 7. Perform Q learning

In [35]:
for i in range(8000):
    r = 0
    state = env.reset()
    
    while True:
        # In each state, select the action by epsilon-greedy policy
        action = epsilon_greedy_policy(state, epsilon)
        
        # Perform the action and move to the next state, and receive the reward
        nextstate, reward, done, _ = env.step(action)
        
        # Update the Q value using our update_q_table function which updates the Q value by Q learning update rule
        update_q_table(state, action, reward, nextstate, alpha, gamma)
        
        # Update the previous state as next state
        state = nextstate
        
        # Store all the rewards obtained
        r += reward
        
        # Break the loop if we are at the terminal state of the episode
        if done:
            break
        
    print("total reward: ", r)

env.close()

total reward:  -560
total reward:  -551
total reward:  -542
total reward:  -569
total reward:  -578
total reward:  -425
total reward:  -497
total reward:  -578
total reward:  -353
total reward:  -524
total reward:  -371
total reward:  -335
total reward:  -398
total reward:  -352
total reward:  -317
total reward:  -146
total reward:  -551
total reward:  -299
total reward:  -578
total reward:  -169
total reward:  -479
total reward:  -560
total reward:  -301
total reward:  -308
total reward:  -398
total reward:  -209
total reward:  -272
total reward:  -362
total reward:  -263
total reward:  -551
total reward:  -209
total reward:  -209
total reward:  -150
total reward:  -165
total reward:  7
total reward:  -308
total reward:  -200
total reward:  -187
total reward:  -209
total reward:  -92
total reward:  -288
total reward:  -209
total reward:  -218
total reward:  -209
total reward:  -155
total reward:  -132
total reward:  -495
total reward:  -218
total reward:  -90
total reward:  -209
total

total reward:  -14
total reward:  9
total reward:  -1
total reward:  0
total reward:  11
total reward:  8
total reward:  10
total reward:  7
total reward:  9
total reward:  9
total reward:  11
total reward:  11
total reward:  6
total reward:  11
total reward:  7
total reward:  -3
total reward:  -4
total reward:  12
total reward:  -2
total reward:  9
total reward:  10
total reward:  1
total reward:  6
total reward:  8
total reward:  11
total reward:  -13
total reward:  10
total reward:  8
total reward:  8
total reward:  8
total reward:  12
total reward:  8
total reward:  7
total reward:  -3
total reward:  9
total reward:  8
total reward:  11
total reward:  4
total reward:  9
total reward:  4
total reward:  8
total reward:  13
total reward:  11
total reward:  15
total reward:  6
total reward:  8
total reward:  -17
total reward:  8
total reward:  6
total reward:  9
total reward:  13
total reward:  6
total reward:  8
total reward:  11
total reward:  10
total reward:  7
total reward:  -22
t

total reward:  5
total reward:  4
total reward:  5
total reward:  9
total reward:  10
total reward:  13
total reward:  8
total reward:  9
total reward:  6
total reward:  6
total reward:  9
total reward:  8
total reward:  9
total reward:  9
total reward:  3
total reward:  7
total reward:  7
total reward:  6
total reward:  13
total reward:  7
total reward:  11
total reward:  8
total reward:  9
total reward:  5
total reward:  9
total reward:  10
total reward:  -2
total reward:  10
total reward:  11
total reward:  6
total reward:  9
total reward:  13
total reward:  8
total reward:  10
total reward:  5
total reward:  9
total reward:  10
total reward:  6
total reward:  4
total reward:  9
total reward:  9
total reward:  10
total reward:  9
total reward:  3
total reward:  -4
total reward:  8
total reward:  6
total reward:  10
total reward:  8
total reward:  -3
total reward:  11
total reward:  9
total reward:  5
total reward:  12
total reward:  5
total reward:  9
total reward:  4
total reward: 

total reward:  10
total reward:  12
total reward:  4
total reward:  -3
total reward:  9
total reward:  7
total reward:  4
total reward:  -1
total reward:  9
total reward:  -3
total reward:  12
total reward:  6
total reward:  13
total reward:  12
total reward:  5
total reward:  3
total reward:  3
total reward:  10
total reward:  9
total reward:  11
total reward:  14
total reward:  14
total reward:  11
total reward:  6
total reward:  9
total reward:  7
total reward:  7
total reward:  6
total reward:  5
total reward:  9
total reward:  6
total reward:  8
total reward:  10
total reward:  7
total reward:  11
total reward:  7
total reward:  1
total reward:  6
total reward:  6
total reward:  8
total reward:  9
total reward:  7
total reward:  7
total reward:  12
total reward:  8
total reward:  10
total reward:  10
total reward:  11
total reward:  8
total reward:  10
total reward:  8
total reward:  7
total reward:  6
total reward:  10
total reward:  11
total reward:  10
total reward:  7
total re

total reward:  -7
total reward:  5
total reward:  7
total reward:  11
total reward:  6
total reward:  9
total reward:  6
total reward:  7
total reward:  12
total reward:  7
total reward:  3
total reward:  7
total reward:  7
total reward:  8
total reward:  12
total reward:  7
total reward:  9
total reward:  8
total reward:  9
total reward:  6
total reward:  6
total reward:  12
total reward:  14
total reward:  12
total reward:  12
total reward:  5
total reward:  14
total reward:  9
total reward:  9
total reward:  8
total reward:  10
total reward:  3
total reward:  13
total reward:  9
total reward:  6
total reward:  6
total reward:  6
total reward:  7
total reward:  7
total reward:  11
total reward:  8
total reward:  9
total reward:  5
total reward:  10
total reward:  11
total reward:  9
total reward:  4
total reward:  14
total reward:  7
total reward:  -1
total reward:  -18
total reward:  11
total reward:  6
total reward:  4
total reward:  11
total reward:  12
total reward:  8
total rewa

total reward:  8
total reward:  1
total reward:  11
total reward:  10
total reward:  5
total reward:  4
total reward:  7
total reward:  -7
total reward:  8
total reward:  11
total reward:  9
total reward:  9
total reward:  -5
total reward:  8
total reward:  10
total reward:  11
total reward:  6
total reward:  8
total reward:  6
total reward:  13
total reward:  6
total reward:  10
total reward:  9
total reward:  4
total reward:  9
total reward:  9
total reward:  12
total reward:  10
total reward:  12
total reward:  8
total reward:  -3
total reward:  3
total reward:  10
total reward:  7
total reward:  8
total reward:  6
total reward:  6
total reward:  8
total reward:  7
total reward:  11
total reward:  3
total reward:  10
total reward:  6
total reward:  10
total reward:  10
total reward:  8
total reward:  7
total reward:  1
total reward:  6
total reward:  5
total reward:  8
total reward:  7
total reward:  12
total reward:  9
total reward:  6
total reward:  9
total reward:  -7
total rewar

total reward:  1
total reward:  10
total reward:  11
total reward:  9
total reward:  6
total reward:  11
total reward:  10
total reward:  -1
total reward:  7
total reward:  3
total reward:  7
total reward:  3
total reward:  13
total reward:  7
total reward:  14
total reward:  8
total reward:  6
total reward:  8
total reward:  9
total reward:  8
total reward:  6
total reward:  9
total reward:  7
total reward:  5
total reward:  7
total reward:  9
total reward:  6
total reward:  6
total reward:  7
total reward:  5
total reward:  12
total reward:  3
total reward:  8
total reward:  -15
total reward:  4
total reward:  13
total reward:  7
total reward:  -2
total reward:  -3
total reward:  5
total reward:  7
total reward:  2
total reward:  7
total reward:  5
total reward:  10
total reward:  13
total reward:  5
total reward:  14
total reward:  7
total reward:  9
total reward:  -3
total reward:  6
total reward:  9
total reward:  7
total reward:  11
total reward:  4
total reward:  -5
total reward

total reward:  7
total reward:  7
total reward:  12
total reward:  9
total reward:  3
total reward:  9
total reward:  7
total reward:  9
total reward:  10
total reward:  11
total reward:  5
total reward:  2
total reward:  10
total reward:  7
total reward:  3
total reward:  3
total reward:  6
total reward:  7
total reward:  -1
total reward:  9
total reward:  9
total reward:  9
total reward:  -2
total reward:  9
total reward:  5
total reward:  6
total reward:  5
total reward:  -2
total reward:  9
total reward:  15
total reward:  8
total reward:  8
total reward:  6
total reward:  5
total reward:  11
total reward:  6
total reward:  6
total reward:  9
total reward:  4
total reward:  5
total reward:  9
total reward:  7
total reward:  11
total reward:  9
total reward:  -5
total reward:  4
total reward:  9
total reward:  6
total reward:  8
total reward:  9
total reward:  2
total reward:  11
total reward:  7
total reward:  5
total reward:  7
total reward:  11
total reward:  9
total reward:  2
t

total reward:  9
total reward:  6
total reward:  7
total reward:  7
total reward:  10
total reward:  9
total reward:  -1
total reward:  11
total reward:  7
total reward:  10
total reward:  -3
total reward:  10
total reward:  10
total reward:  13
total reward:  5
total reward:  8
total reward:  9
total reward:  7
total reward:  -5
total reward:  10
total reward:  4
total reward:  10
total reward:  2
total reward:  9
total reward:  7
total reward:  10
total reward:  4
total reward:  -3
total reward:  10
total reward:  7
total reward:  9
total reward:  5
total reward:  14
total reward:  12
total reward:  4
total reward:  11
total reward:  4
total reward:  -15
total reward:  -3
total reward:  10
total reward:  -9
total reward:  12
total reward:  4
total reward:  9
total reward:  11
total reward:  5
total reward:  7
total reward:  3
total reward:  5
total reward:  6
total reward:  10
total reward:  4
total reward:  5
total reward:  8
total reward:  8
total reward:  8
total reward:  6
total 

total reward:  12
total reward:  6
total reward:  9
total reward:  6
total reward:  6
total reward:  6
total reward:  8
total reward:  12
total reward:  9
total reward:  11
total reward:  4
total reward:  10
total reward:  8
total reward:  5
total reward:  9
total reward:  -5
total reward:  11
total reward:  10
total reward:  6
total reward:  6
total reward:  7
total reward:  10
total reward:  5
total reward:  14
total reward:  11
total reward:  6
total reward:  7
total reward:  9
total reward:  3
total reward:  -6
total reward:  -3
total reward:  5
total reward:  10
total reward:  7
total reward:  -3
total reward:  6
total reward:  12
total reward:  11
total reward:  4
total reward:  5
total reward:  7
total reward:  12
total reward:  6
total reward:  8
total reward:  4
total reward:  10
total reward:  8
total reward:  8
total reward:  6
total reward:  9
total reward:  7
total reward:  9
total reward:  8
total reward:  9
total reward:  9
total reward:  6
total reward:  7
total reward:

total reward:  7
total reward:  1
total reward:  9
total reward:  10
total reward:  12
total reward:  5
total reward:  11
total reward:  5
total reward:  11
total reward:  8
total reward:  1
total reward:  6
total reward:  8
total reward:  -4
total reward:  6
total reward:  5
total reward:  8
total reward:  7
total reward:  6
total reward:  6
total reward:  9
total reward:  3
total reward:  7
total reward:  6
total reward:  5
total reward:  12
total reward:  -4
total reward:  8
total reward:  -1
total reward:  6
total reward:  8
total reward:  8
total reward:  7
total reward:  7
total reward:  8
total reward:  12
total reward:  11
total reward:  0
total reward:  15
total reward:  4
total reward:  10
total reward:  7
total reward:  8
total reward:  8
total reward:  7
total reward:  6
total reward:  11
total reward:  8
total reward:  -3
total reward:  9
total reward:  5
total reward:  9
total reward:  7
total reward:  12
total reward:  6
total reward:  11
total reward:  8
total reward:  

total reward:  10
total reward:  3
total reward:  11
total reward:  14
total reward:  4
total reward:  9
total reward:  10
total reward:  7
total reward:  7
total reward:  -3
total reward:  4
total reward:  6
total reward:  8
total reward:  11
total reward:  7
total reward:  5
total reward:  8
total reward:  8
total reward:  7
total reward:  8
total reward:  11
total reward:  8
total reward:  5
total reward:  6
total reward:  2
total reward:  9
total reward:  12
total reward:  12
total reward:  7
total reward:  4
total reward:  8
total reward:  7
total reward:  11
total reward:  10
total reward:  11
total reward:  -1
total reward:  6
total reward:  8
total reward:  9
total reward:  2
total reward:  4
total reward:  9
total reward:  9
total reward:  8
total reward:  3
total reward:  9
total reward:  14
total reward:  5
total reward:  9
total reward:  12
total reward:  10
total reward:  8
total reward:  4
total reward:  10
total reward:  6
total reward:  6
total reward:  9
total reward: 

total reward:  9
total reward:  11
total reward:  3
total reward:  12
total reward:  12
total reward:  -3
total reward:  10
total reward:  10
total reward:  8
total reward:  -7
total reward:  11
total reward:  -3
total reward:  5
total reward:  13
total reward:  8
total reward:  4
total reward:  10
total reward:  6
total reward:  9
total reward:  10
total reward:  11
total reward:  8
total reward:  6
total reward:  6
total reward:  -4
total reward:  8
total reward:  10
total reward:  5
total reward:  9
total reward:  6
total reward:  11
total reward:  4
total reward:  4
total reward:  9
total reward:  12
total reward:  5
total reward:  9
total reward:  10
total reward:  8
total reward:  7
total reward:  9
total reward:  6
total reward:  4
total reward:  5
total reward:  5
total reward:  7
total reward:  9
total reward:  12
total reward:  7
total reward:  5
total reward:  9
total reward:  8
total reward:  4
total reward:  -4
total reward:  8
total reward:  9
total reward:  11
total rewa

total reward:  12
total reward:  9
total reward:  11
total reward:  1
total reward:  10
total reward:  13
total reward:  9
total reward:  -1
total reward:  11
total reward:  8
total reward:  7
total reward:  7
total reward:  4
total reward:  9
total reward:  14
total reward:  12
total reward:  3
total reward:  7
total reward:  6
total reward:  3
total reward:  10
total reward:  3
total reward:  13
total reward:  12
total reward:  8
total reward:  5
total reward:  7
total reward:  3
total reward:  6
total reward:  6
total reward:  10
total reward:  6
total reward:  12
total reward:  6
total reward:  7
total reward:  -6
total reward:  8
total reward:  8
total reward:  -3
total reward:  7
total reward:  10
total reward:  7
total reward:  7
total reward:  -2
total reward:  7
total reward:  8
total reward:  7
total reward:  9
total reward:  10
total reward:  12
total reward:  5
total reward:  7
total reward:  9
total reward:  8
total reward:  -3
total reward:  4
total reward:  8
total rewar

## 8. Use the learned Q-table to see the optimal policy

In [36]:
state = env.reset()
total_reward = 0

while True:
    # select greedy action because now the policy is optimal
    action = max(list(range(env.action_space.n)), key = lambda x: q[(state, x)])
    
    display.clear_output(wait=True)
    display.display(plt.gcf())
    env.render()
    
    state, reward, done, _ = env.step(action)
    total_reward += reward
    print('Episode Reward= ', total_reward)
    time.sleep(0.5)
    
    if done:
        break

env.close()

<matplotlib.figure.Figure at 0x7f68200b2198>

+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
Episode Reward=  8


<matplotlib.figure.Figure at 0x7f68200b2198>