In [1]:
pip install gym

Note: you may need to restart the kernel to use updated packages.


In [14]:
# Samuel Price
# 02/23/2021
# RL&C HW #1

import gym
import matplotlib.pyplot as plt
import numpy as np
import os

# Modified Functions written by allenbreyes
# Repo: https://github.com/allanbreyes/gym-solutions/blob/master/analysis/mdp.py

# Load in Gym Taxi
taxi = gym.make('Taxi-v3')

# Create Mapping for Taxi Problem
mapping = {0: "S", 1: "N", 2: "E", 3: "W", 4: "P", 5: "D"}

# Get the total number of stages and actions from the environment
s_count = taxi.observation_space.n
a_count = taxi.action_space.n

# Set intial policy using the sample policy from the environment instead of a completely blank slate
policy = np.array([taxi.action_space.sample() for i in range(s_count)])

# Set value function to all zeros for each stage
v_function = np.zeros(s_count)

# Get the Rewards and Transitions from the Environment
Rewards = np.zeros((s_count, a_count, s_count))
Transitions = np.zeros((s_count, a_count, s_count))

# Set each reward and transition probability value based on the given environment
for state in range(0,s_count):
    for action in range(0,a_count):
        for transition in taxi.env.P[state][action]:
            prob, next_state, reward, not_used = transition
            Rewards[state, action, next_state] = reward
            Transitions[state, action, next_state] = prob
        
# Perform Policy Iteration
# Set Maximum iterations to 1000
# Used a delta of 0.0001 and gamma of .8

# Policy Iteration Loop
for i in range(1,1000):
    previous_policy = policy.copy()
    
    # Value Function Iteration Loop
    for j in range(1, 1000):
        
        previous_v_function = v_function.copy()

        # Perform Eigen Summation to update value function
        sum = np.einsum('ijk,ijk -> ij', Transitions, Rewards + .8*v_function)

        # Increase Dimensionality of the policy to be (s_count, a_count) from (s_count,)
        reshaped_policy = np.zeros((s_count, a_count))
        reshaped_policy[np.arange(s_count), policy] = 1

        # Calculate new value function for the current policy
        v_function = np.sum(reshaped_policy * sum, 1)
        
        # Evaluate the new value function
        if np.max(np.abs(v_function - previous_v_function)) < 0.001:
            break
    
    # Calculate the new Policy
    sum = np.einsum('ijk,ijk -> ij', Transitions, Rewards + .8*v_function)
    
    # I found that using argmax here allowed for the best policy to be outputted
    policy = np.argmax(sum, 1)
    
    # Evaluate the Policy by comparing it to the previous policy  
    if np.array_equal(policy, previous_policy):
        break

# Display Results

print("Optimal Policy: ")

# Convert numeric values into action values: N, S, E, W, P, and D
print(np.array([mapping[action] for action in policy]))
print("Number of Policy Iterations: " + str(i))

print("Optimal Value Function: ")
print(v_function)

Optimal Policy: 
['P' 'P' 'P' 'P' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'D' 'S'
 'S' 'S' 'W' 'W' 'W' 'W' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'W' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'E' 'E' 'E' 'E' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'E' 'S' 'S' 'S' 'S' 'S' 'S' 'E' 'E' 'E' 'E' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'E' 'S' 'S' 'S' 'S' 'S' 'S' 'P' 'P' 'P' 'P' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'D' 'S' 'S' 'N' 'N' 'N' 'N' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'N' 'S' 'S' 'S' 'N' 'N' 'N' 'N' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'N' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'N' 'N' 'N' 'N' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'N' 'S' 'S' 'S' 'S'
 'S' 'S' 'N' 'N' 'N' 'N' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'N' 'S' 'S'
 'S' 'S' 'S' 'S' 'N' 'N' 'N' 'N' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'N'
 'S' 'S' 'N' 'N' 'N' 'N' 'E' 'E' 'E' 'E' 'S' 'S' 'S' 'S' 'E' 'E' 'E' 'E'
 'N' 'E' 'S' 'E' 'N' 'N' 'N' 'N' 'E' 'E' 'E' 'E' 'W' 'W' 'W' 'W' 'E' 'E'
 'E' 'E' 'N' 'E' 'W' 'E' 'W' 'W' '