### OpenAI CliffWalking-v0 (Value Iteration and Policy Iteration)

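This notebook solves the CliffWalking-v0 environment with two dynamic-programming methods: policy iteration and value iteration.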

#### Importing Helper Libraries 

In [None]:
# gymnasium provides the reinforcement-learning environments
# that the agents below interact with
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline

In [None]:
# Identifier of the environment every agent in this notebook is built on.
environment = 'CliffWalking-v0'

# Render mode handed to gym.make when an environment is created.
renderMode = 'rgb_array'

# Human-readable label for each discrete action index.
actions = {
    0: 'UP',
    1: 'RIGHT',
    2: 'DOWN',
    3: 'LEFT',
}

##### Agent

A general agent class that serves as the parent of the agent classes below, each of which behaves differently in the environment.

In [None]:
class Agent:
    """Common base for the agents in this notebook.

    Creates and resets the environment and prepares the bookkeeping that
    the subclasses share:

    * ``env`` -- the environment built from the module-level ``environment``
      and ``renderMode`` settings.
    * ``state_values`` -- one value per state, all starting at zero.
    * ``discountFactor`` -- the discount applied to future returns.
    """

    def __init__(self) -> None:
        # build the environment first and put it into its initial state
        env = gym.make(environment, render_mode=renderMode)
        env.reset()
        self.env = env

        # one value estimate per state, all starting from zero
        n_states = env.observation_space.n
        self.state_values = np.zeros(n_states)

        # future rewards are not discounted
        self.discountFactor = 1

    

##### Confused Agent

This agent selects its actions at random from the environment's action space.

In [None]:
class ConfusedAgent(Agent):
    """Agent that ignores the state and picks random actions.

    Every action is sampled from the environment's action space.
    """

    def __init__(self) -> None:
        super().__init__()
        # number of environment steps taken by one call to act()
        self.iterations = 10000

    def policy(self, env):
        """Return an action sampled from ``env.action_space``."""
        return env.action_space.sample()

    def act(self):
        """Step a fresh environment ``self.iterations`` times with random actions.

        The environment is reset whenever an episode ends, and it is closed
        once all steps have been taken (or if stepping raises).
        """
        # release the environment currently held before replacing it
        self.env.close()
        self.env = gym.make(environment, render_mode=renderMode)

        try:
            self.env.reset()
            for _ in range(self.iterations):
                action = self.policy(self.env)
                _, _, terminated, truncated, _ = self.env.step(action)

                # an episode can end by termination or by truncation;
                # either way a reset is needed before stepping again
                if terminated or truncated:
                    self.env.reset()
        finally:
            self.env.close()


##### Policy Iteration

In [None]:
class PolicyIterationAgent(Agent):
    """Agent that plans with policy iteration over the transition table.

    The policy is a deterministic array holding one action index per state.
    Evaluation and improvement skip the last state, which is treated as the
    goal: the episode terminates there, so its value stays at zero.
    """

    def __init__(self) -> None:
        super().__init__()  # initialise the Agent class

    @staticmethod
    def _expected_return(transitions, state_values, discountFactor):
        """Return the expected one-step return over ``transitions``.

        ``transitions`` is a sequence of
        ``(probability, nextState, reward, done)`` tuples.
        """
        return sum(
            probability * (reward + discountFactor * state_values[nextState])
            for probability, nextState, reward, _done in transitions
        )

    def PolicyEvaluation(self, policy, threshold=1e-5) -> bool:
        """Run one in-place evaluation sweep of ``policy``.

        Args:
            policy: array with the action index chosen in every state.
            threshold: largest value change that still counts as converged.

        Returns:
            True when no state value changed by ``threshold`` or more during
            the sweep, False otherwise. ``self.state_values`` is replaced by
            the updated values in both cases.
        """
        # the transition table belongs to the underlying environment, not to
        # the wrappers that gym.make puts around it
        transitionTable = self.env.unwrapped.P
        n = self.env.observation_space.n  # the number of states
        updatedStateValues = self.state_values.copy()
        delta = 0

        # do not iterate over the goal state (the last one)
        for state in range(n - 1):
            action = policy[state]  # the policy is discrete, get the action
            stateValue = self._expected_return(
                transitionTable[state][action],
                updatedStateValues,
                self.discountFactor,
            )
            delta = max(delta, abs(stateValue - updatedStateValues[state]))
            updatedStateValues[state] = stateValue

        self.state_values = updatedStateValues
        return bool(delta < threshold)

    def PolicyImprovement(self, policy) -> bool:
        """Make ``policy`` greedy with respect to ``self.state_values``.

        ``policy`` is modified in place. When several actions share the best
        value, the one with the lowest index is chosen.

        Returns:
            True when the policy was already greedy (nothing changed),
            False when at least one state received a new action.
        """
        transitionTable = self.env.unwrapped.P
        n = self.env.observation_space.n
        n_actions = self.env.action_space.n
        state_values = self.state_values
        stop = True

        for state in range(n - 1):
            # max() keeps the first maximal element, i.e. the lowest index
            bestAction = max(
                range(n_actions),
                key=lambda a: self._expected_return(
                    transitionTable[state][a],
                    state_values,
                    self.discountFactor,
                ),
            )

            if policy[state] != bestAction:
                stop = False
                policy[state] = bestAction

        return stop

    def Run(self, max_evaluation_sweeps=1000):
        """Alternate evaluation and improvement until the policy is stable.

        Args:
            max_evaluation_sweeps: upper bound on the evaluation sweeps spent
                on one policy. The bound is required because rewards are not
                discounted (``discountFactor`` is 1): a policy that never
                reaches the goal has values that keep decreasing, so waiting
                for evaluation to converge would loop forever. The initial
                all-zeros policy is such a policy.

        Returns:
            Tuple ``(policy, state_values)`` with the final action index and
            value estimate for every state.
        """
        n = self.env.observation_space.n  # the number of states
        policy = np.zeros(n, dtype=int)

        stop = False
        while not stop:
            # evaluate the current policy, giving up after the sweep budget
            for _ in range(max_evaluation_sweeps):
                if self.PolicyEvaluation(policy):
                    break
            stop = self.PolicyImprovement(policy)

        return policy, self.state_values

In [None]:
# Plan with policy iteration, then show the result laid out as a 4 x 12 grid.
policyIterationAgent = PolicyIterationAgent()
policy, state_values = policyIterationAgent.Run()

print('Policy: \n', np.reshape(policy, (4, 12)))
print('State Values: ', np.reshape(state_values, (4, 12)))

##### Value Iteration

In [None]:
class ValueIteration(Agent):
    """Agent that plans with value iteration over the transition table.

    ``self.policy`` holds one action index per state and is updated together
    with the state values. The last state is treated as the goal and is never
    updated, so its value stays at zero.
    """

    def __init__(self) -> None:
        super().__init__()
        self.policy = np.zeros(self.env.observation_space.n, dtype=int)

    def ValueIteration(self, threshold=1e-5) -> bool:
        """Run one in-place value-iteration sweep over all non-goal states.

        Each state value becomes the best expected one-step return over all
        actions, and ``self.policy`` records the action that achieved it
        (the lowest index when several actions tie).

        Args:
            threshold: largest value change that still counts as converged.

        Returns:
            True when no state value changed by ``threshold`` or more during
            the sweep, False otherwise.
        """
        # the transition table belongs to the underlying environment, not to
        # the wrappers that gym.make puts around it
        transitionTable = self.env.unwrapped.P
        state_values = self.state_values
        n = self.env.observation_space.n
        n_actions = self.env.action_space.n

        delta = 0
        for state in range(n - 1):
            bestValue = -np.inf

            for action in range(n_actions):
                stateValue = sum(
                    probability
                    * (reward + self.discountFactor * state_values[nextState])
                    for probability, nextState, reward, _done
                    in transitionTable[state][action]
                )

                if stateValue > bestValue:
                    bestValue = stateValue
                    self.policy[state] = action

            delta = max(delta, abs(bestValue - state_values[state]))
            state_values[state] = bestValue

        self.state_values = state_values

        return bool(delta < threshold)

    def Run(self):
        """Sweep until the values converge.

        Returns:
            Tuple ``(policy, state_values)`` with the greedy action index and
            value estimate for every state.
        """
        stop = False

        while not stop:
            stop = self.ValueIteration()

        return self.policy, self.state_values

In [None]:
# Plan with value iteration, then show the result laid out as a 4 x 12 grid.
valueIteration = ValueIteration()
policy, state_values = valueIteration.Run()

print('Policy: \n')
# Map every action index (0..3) to its initial with one table lookup
# instead of rewriting the array once per action.
policy = np.array(['U', 'R', 'D', 'L'])[policy.reshape(4, 12)]
print(policy)
print('State Values: \n', state_values.reshape(4, 12))