In [None]:
pip install gym

In [1]:
#import libraries
import gym # openAi gym
from gym import envs
import numpy as np 
import datetime

import matplotlib.pyplot as plt

from IPython.display import clear_output
from matplotlib import animation
from IPython.display import display

from time import sleep

import warnings
warnings.filterwarnings('ignore')


In [21]:
#functions
#this function will be called back in PI function
def evaluation(policy, env, DF=1.0, theta=0.00001):
    """Iteratively estimate the value of every state under ``policy``.

    Each sweep visits the states in index order and overwrites a state's
    value in place with its expected one-step return, so later states in the
    same sweep already see the refreshed values of earlier ones. Sweeping
    stops after the first sweep whose largest single-state change is below
    ``theta``.

    Args:
        policy: indexable by state; ``policy[state]`` yields one probability
            per action.
        env: wrapped environment whose ``env.env`` exposes ``nS`` (number of
            states) and ``P``, where ``P[state][action]`` iterates over
            ``(probability, next_state, reward, done)`` tuples.
        DF: discount factor applied to the value of the next state.
        theta: convergence threshold on the largest per-sweep value change.

    Returns:
        A NumPy array of length ``nS`` holding the estimated state values.
    """
    model = env.env
    n_states = model.nS
    values = np.zeros(n_states)  # every state starts with value 0

    converged = False
    while not converged:
        largest_change = 0  # biggest |new - old| seen during this sweep
        for s in range(n_states):
            backup = 0
            for a, pi_sa in enumerate(policy[s]):
                # The per-transition `done` flag is unpacked but not used.
                for prob, s_next, r, _done in model.P[s][a]:
                    backup += pi_sa * prob * (r + DF * values[s_next])
            change = np.abs(backup - values[s])
            if change > largest_change:
                largest_change = change
            values[s] = backup
        converged = largest_change < theta
    return np.array(values)

In [22]:
#PI 
def policy_iteration(env, evaluation, DF=1.0):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Args:
        env: wrapped environment whose ``env.env`` exposes ``nS`` (number of
            states), ``nA`` (number of actions) and ``P``, where
            ``P[state][action]`` iterates over
            ``(probability, next_state, reward, done)`` tuples.
        evaluation: callable invoked as ``evaluation(policy, env, DF)`` that
            returns per-state values for the given policy.
        DF: discount factor applied to the value of the next state.

    Returns:
        A ``(policy, values)`` tuple. ``policy`` is an ``(nS, nA)`` array with
        a one-hot row per state; ``values`` is what ``evaluation`` returned for
        that final policy.
    """
    model = env.env
    n_states, n_actions = model.nS, model.nA

    def prob_nextstate(state, V):
        """One-step lookahead: expected return of each action taken in ``state``."""
        A = np.zeros(n_actions)
        for a in range(n_actions):
            for P, nextstate, reward, _done in model.P[state][a]:
                A[a] += P * (reward + DF * V[nextstate])
        return A

    # Built once: row i is the deterministic (one-hot) distribution for action i.
    one_hot = np.eye(n_actions)

    # Start from the uniform random policy (every action equally likely).
    policy = np.ones([n_states, n_actions]) / n_actions

    # The loop only exits through the return below, once no state changes its action.
    while True:
        evaluate = evaluation(policy, env, DF)  # values of the current policy
        policy_stable = True  # flipped as soon as any state changes its action
        for state in range(n_states):
            chosen_act = np.argmax(policy[state])  # action the current policy prefers
            act_values = prob_nextstate(state, evaluate)  # one-step lookahead
            best_act = np.argmax(act_values)  # greedy action w.r.t. current values
            if chosen_act != best_act:
                policy_stable = False
            policy[state] = one_hot[best_act]  # make the policy greedy in this state
        if policy_stable:
            return policy, evaluate

In [23]:
# Set up the environment: create the Taxi-v3 task through Gym
env = gym.make('Taxi-v3')

In [26]:
# Run policy iteration on the Taxi environment with a discount factor of 0.99.
# PI holds the (policy, values) pair that policy_iteration returns.
PI = policy_iteration(env,evaluation,DF=0.99)
#animate PI
def Animation(policy, delay=0.2):
    """Play one episode under a policy and replay it as a text animation.

    The episode is run to completion first, with every rendered frame
    recorded; the frames are then printed one by one, clearing the cell
    output between them.

    Args:
        policy: either the ``(policy, values)`` pair returned by
            ``policy_iteration`` or the policy table itself, where
            ``table[state]`` holds one preference per action.
        delay: seconds to pause between printed frames.

    Uses the module-level ``env`` as the environment to act in.
    """
    # Existing callers pass the whole (policy, values) result; a bare table is accepted too.
    policy_table = policy[0] if isinstance(policy, (tuple, list)) else policy

    frames = []  # one record per step, replayed after the episode ends
    done = False
    curr_state = env.reset()
    while not done:
        action = np.argmax(policy_table[curr_state])  # act greedily w.r.t. the table
        curr_state, reward, done, _info = env.step(action)
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': curr_state,
            'action': action,
            'reward': reward
        })

    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)

In [28]:
# Replay one episode that follows the policy stored in PI
Animation(PI)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

Timestep: 12
State: 475
Action: 5
Reward: 20
