# Installing the Relevant Libraries

In [None]:
!pip install gym



In [2]:
!pip install tools

Collecting tools
  Downloading tools-0.1.9.tar.gz (34 kB)
Collecting pytils
  Downloading pytils-0.4.1.tar.gz (99 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Building wheels for collected packages: tools, pytils
  Building wheel for tools (setup.py): started
  Building wheel for tools (setup.py): finished with status 'done'
  Created wheel for tools: filename=tools-0.1.9-py3-none-any.whl size=46756 sha256=856bd06f53080249ce4e591abe6cf65cf6d33e87813bf040442c1fadfd61d49e
  Stored in directory: c:\users\mitta\appdata\local\pip\cache\wheels\77\13\56\879617e6017b5dde1711ca8c9ee5e9838e913c85ae22fd96c9
  Building wheel for pytils (PEP 517): started
  Building wheel for pytils (PEP 517): finished with status 'done'
  Created w

In [5]:
import numpy as np
import tools
import gym
# 8x8 FrozenLake with slipping disabled; the cells below read its
# transition table through env.P.
env = gym.make(
    'FrozenLake-v1',
    desc=None,
    map_name="8x8",
    is_slippery=False,
    render_mode="human",
)

Helper Functions required to execute Policy Iteration

In [21]:
##################
# This function computes the state values obtained by following a given policy
# (iterative policy evaluation).
##################

def getValueFunction(env,valueFnc,policy,discount,maxIterations,convergenceTolerance):
    """Estimate the state values of ``policy`` by iterative policy evaluation.

    Each sweep applies the Bellman expectation backup to every state found in
    ``env.P``::

        V(s) <- sum_a policy[s, a] * sum_{(p, s', r)} p * (r + discount * V(s'))

    Sweeps stop as soon as the largest absolute change between two successive
    estimates drops below ``convergenceTolerance`` (a message is printed in
    that case) or after ``maxIterations`` sweeps.

    Parameters
    ----------
    env : object
        Must expose ``observation_space.n`` and a table ``P`` where
        ``P[state][action]`` yields
        ``(probability, next_state, reward, is_terminal)`` tuples.
    valueFnc : array-like
        Starting estimate, indexed by state. It is never modified in place.
    policy : 2-D array
        ``policy[state, action]`` is the weight given to ``action`` in ``state``.
    discount : float
        Discount factor applied to the value of the next state.
    maxIterations : int
        Upper bound on the number of sweeps.
    convergenceTolerance : float
        Threshold on the max-norm change used as the stopping criterion.

    Returns
    -------
    The estimate produced by the last sweep (a new numpy array), or the
    ``valueFnc`` argument itself when ``maxIterations`` is 0.
    """
    import numpy as np

    for _sweep in range(maxIterations):
        updated = np.zeros(env.observation_space.n)
        for s in env.P:
            expected = 0
            for a in env.P[s]:
                # Expected one-step return of taking action `a` in state `s`.
                backup = sum(
                    prob * (reward + discount * valueFnc[nxt])
                    for prob, nxt, reward, _done in env.P[s][a]
                )
                expected = expected + policy[s, a] * backup
            updated[s] = expected

        # Measure the change against the previous estimate before replacing it.
        delta = np.max(np.abs(updated - valueFnc))
        valueFnc = updated
        if delta < convergenceTolerance:
            print('Iterative policy evaluation algorithm converged!')
            break
    return valueFnc

##################
# This function computes an improved (greedy) policy from the current state values
##################

def improvePolicy(env,valueFnc,numberActions,numberStates,discount):
    """Return the policy that is greedy with respect to ``valueFnc``.

    For every state/action pair the action value is computed from the
    transition table as::

        q(s, a) = sum_{(p, s', r)} p * (r + discount * valueFnc[s'])

    The improved policy puts equal probability on every action whose action
    value equals the row maximum (exact comparison) and zero on the others.

    Parameters
    ----------
    env : object
        Must expose ``P`` where ``P[state][action]`` yields
        ``(probability, next_state, reward, is_terminal)`` tuples.
    valueFnc : array-like
        State values, indexed by state.
    numberActions, numberStates : int
        Sizes of the action and state spaces; states and actions are taken to
        be ``range(numberStates)`` and ``range(numberActions)``.
    discount : float
        Discount factor applied to the value of the next state.

    Returns
    -------
    (improvedPolicy, qvalues) : tuple of two ``(numberStates, numberActions)``
    numpy arrays.
    """
    import numpy as np

    shape = (numberStates, numberActions)
    qvalues = np.zeros(shape)          # action values for every state
    improvedPolicy = np.zeros(shape)   # greedy policy built from qvalues

    for s in range(numberStates):
        for a in range(numberActions):
            for prob, nxt, reward, _done in env.P[s][a]:
                qvalues[s, a] += prob * (reward + discount * valueFnc[nxt])

        # All actions tied for the best value share the probability mass.
        row = qvalues[s, :]
        greedy = np.flatnonzero(row == np.max(row))
        improvedPolicy[s, greedy] = 1 / greedy.size
    return improvedPolicy, qvalues



Defining our learning parameters and starting the Policy Iteration Algorithm

In [25]:
# ---- learning parameters --------------------------------------------------
discount = 0.9                               # discount factor
stateNumber = env.observation_space.n        # number of states
actionNumber = env.action_space.n            # number of actions
maxPolicyIteration = 1000                    # cap on outer (policy) iterations
maxPolicyEvaluation = 1000                   # cap on inner evaluation sweeps
converganceTolerance = 10 ** (-6)            # stopping threshold for evaluation

# Start from the uniform random policy and an all-zero value estimate.
initialPolicy = (1 / actionNumber) * np.ones((stateNumber, actionNumber))
valueFncInitial = np.zeros(env.observation_space.n)
###########################################################################

# Policy iteration: evaluate the current policy, act greedily on the result,
# and stop once the greedy policy no longer changes.
currentPolicy = initialPolicy
for iteration in range(maxPolicyIteration):
    print("Iteration - {} - of policy iteration algorithm".format(iteration))

    # Every evaluation restarts from the same all-zero estimate.
    valueFnccomputed = getValueFunction(
        env, valueFncInitial, currentPolicy,
        discount, maxPolicyEvaluation, converganceTolerance,
    )
    print(currentPolicy, valueFnccomputed)

    improvedPolicy, qvalues = improvePolicy(
        env, valueFnccomputed, actionNumber, stateNumber, discount,
    )

    # Policies that differ by more than a small tolerance: keep iterating.
    if not np.allclose(currentPolicy, improvedPolicy):
        currentPolicy = improvedPolicy
        continue

    currentPolicy = improvedPolicy
    print("Policy iteration algorithm converged!")
    break

Iteration - 0 - of policy iteration algorithm
Iterative policy evaluation algorithm converged!
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0

Code to convert our environment to an MDP

In [22]:
def convert_frozenlake_to_mdp(env):
    """Build dense transition/reward arrays from ``env.P`` and wrap them in an ``MDP``.

    Parameters
    ----------
    env : object
        Must expose ``observation_space.n``, ``action_space.n`` and a table
        ``P`` where ``P[state][action]`` yields
        ``(probability, next_state, reward, is_terminal)`` tuples.

    Returns
    -------
    The object returned by ``MDP(transition_matrix, reward_matrix,
    initial_state='0')``, where

    * ``transition_matrix[s, a, s']`` is the total probability of reaching
      ``s'`` when taking ``a`` in ``s``;
    * ``reward_matrix[s, a]`` is the expected immediate reward of taking
      ``a`` in ``s``.
    """
    # Extract the necessary information from the environment
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    transition_matrix = np.zeros((num_states, num_actions, num_states))
    reward_matrix = np.zeros((num_states, num_actions))

    # Build the transition and reward matrices
    for state in range(num_states):
        for action in range(num_actions):
            # Each entry is (probability, next_state, reward, is_terminal) --
            # the same order getValueFunction and improvePolicy unpack.
            for probability, next_state, reward, _ in env.P[state][action]:
                # Accumulate: several outcomes of one (state, action) pair may
                # lead to the same next state, and the reward stored per pair
                # is the probability-weighted average over all outcomes.
                transition_matrix[state, action, next_state] += probability
                reward_matrix[state, action] += probability * reward

    # NOTE(review): `MDP` must be importable in the notebook namespace, and
    # initial_state is passed as the string '0' exactly as before -- confirm
    # both against the MDP class that is actually intended.
    mdp = MDP(transition_matrix, reward_matrix, initial_state='0')
    return mdp

Trying to get the `mdp` module to work

In [29]:
 from google.colab import drive
 import sys
 # Mount Google Drive at /content/drive (uses google.colab, so Colab only).
 drive.mount('/content/drive')

  self._read_thread.setDaemon(True)


Mounted at /content/drive


In [33]:
# Put the Drive notebooks folder first on the module search path, import
# `mdp`, and print the file it was loaded from.
# NOTE(review): the recorded output shows a dist-packages path, i.e. the
# installed `mdp` package was loaded rather than a file from the Drive folder.
sys.path.insert(0,'/content/drive/MyDrive/Colab Notebooks/')
import mdp as mdp_file
print(mdp_file.__file__)

/usr/local/lib/python3.10/dist-packages/mdp/__init__.py


In [31]:
from mdp import MDP

ImportError: ignored

In [32]:
!pip install mdp


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [34]:
from mdp import MDP

# Convert the FrozenLake environment created earlier into an MDP.
# NOTE(review): only reached if the import above succeeds; the recorded
# output of this cell is an ImportError.
convert_frozenlake_to_mdp(env)

ImportError: ignored