# TSFS12 Hand-in exercise 5: Learning for autonomous vehicles - Reinforcement learning and value iterations

Initial imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from rl_auxiliary import V_update, plot_iter, BoxOff

In [None]:
%matplotlib

# Parameters in value iterations

In [None]:
# Grid dimensions
n_rows = 4
n_cols = 5

# Special cells, given as (row, column) element indices
goal = np.array([3, 4])  # goal state
sink = np.array([[3, 1], [3, 2], [3, 3]])  # 'cliff' states, one per row

# Rewards
R_goal = 0.0  # reaching the goal state
R_sink = -10.0  # reaching any 'cliff' state
R_grid = -0.1  # every remaining state

# presumably the discount factor consumed by V_update -- confirm in rl_auxiliary
gamma = 0.99

# Transition model: the intended move succeeds with probability P_move_action,
# the remaining probability is split evenly between the two sideways moves
P_move_action = 0.99
P_dist = (1 - P_move_action) / 2

# Reward matrix R: start from the default reward everywhere, then overwrite
# the goal cell and the cliff cells
R = np.full((n_rows, n_cols), fill_value=R_grid)
R[tuple(goal)] = R_goal
R[tuple(sink.T)] = R_sink

# Occupancy grid: 1 marks an occupied cell, 0 a free one
occ_grid = np.zeros((n_rows, n_cols))
occ_grid[1, 1] = 1

# Collect all parameters in one dictionary that is handed to the helpers
params = dict(
    gamma=gamma,
    R_goal=R_goal,
    R_sink=R_sink,
    R_grid=R_grid,
    P_move_action=P_move_action,
    P_dist=P_dist,
    n_rows=n_rows,
    n_cols=n_cols,
    goal=goal,
    sink=sink,
    R=R,
    occ_grid=occ_grid,
)

# Main loop for value iterations

Main loop for value iterations. Press return to proceed to the next iteration. Note that the plots must be made in an external window, not inline, since the value function and policy are updated in the `plot_iter` function call.

In [None]:
# Value iteration, following Section 4.4 in Sutton, R. S., & A. G. Barto:
# Reinforcement learning: An introduction. MIT Press, 2018.
#
# Actions are encoded as integers 0-3: ['left', 'right', 'up', 'down'].

# Value function estimate, one entry per grid cell, initialized to zero
V = np.zeros((n_rows, n_cols))

# Policy: one action index per cell; -1 means no action has been assigned
Pi = np.full((n_rows, n_cols), fill_value=-1)

converged = False
while not converged:
    # Largest absolute change of any state value during this sweep
    Delta = 0

    # Sweep the grid row by row. V is overwritten in place, so V_update calls
    # later in the sweep receive the entries already updated in this sweep.
    for row, col in np.ndindex(n_rows, n_cols):
        # Occupied cells, the goal and the cliff cells are never updated
        is_occupied = occ_grid[row, col] == 1
        is_goal = np.all(goal == [row, col])
        is_sink = np.any((sink[:, 0] == row) & (sink[:, 1] == col))
        if is_occupied or is_goal or is_sink:
            continue

        v = V[row, col]
        V[row, col], max_a = V_update(row, col, V, params)
        Pi[row, col] = max_a

        change = np.abs(v - V[row, col])
        Delta = np.max((Delta, change))

    # Show the current value function and the actions of the current policy,
    # then wait for the user before starting the next sweep
    plot_iter(V, Pi, params)
    print(V)
    print(Pi)
    print('Press enter')
    _ = input()

    # Stop once no state value moved by 1e-6 or more in the last sweep
    converged = bool(Delta < 1e-6)
print('Convergence!')