<a href="https://colab.research.google.com/github/MD03/RLT/blob/main/RLT_Bellman_Optimality_Equations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Bellman Optimality Equations
import numpy as np

class MDP:
    """Finite Markov decision process solved by value iteration.

    Args:
        num_states: Number of states; states are indexed ``0..num_states-1``.
        num_actions: Number of actions; actions are indexed
            ``0..num_actions-1``.
        transition_probs: Array-like indexed ``[s, a, s']`` holding
            ``P(s' | s, a)``.
        rewards: Array-like indexed ``[s, a]`` (reward ``R(s, a)``) or
            ``[s, a, s']`` (reward ``R(s, a, s')``).
        gamma: Discount factor applied to the value of the successor state.

    Raises:
        ValueError: If ``transition_probs`` is not 3-D, or ``rewards`` is
            neither 2-D nor 3-D.
    """

    def __init__(self, num_states, num_actions, transition_probs, rewards, gamma=0.9):
        self.num_states = num_states
        self.num_actions = num_actions
        # asarray leaves existing ndarrays untouched (no copy) while letting
        # callers pass plain nested lists, which do not support [s, a, s'].
        self.transition_probs = np.asarray(transition_probs)
        self.rewards = np.asarray(rewards)
        self.gamma = gamma

        if self.transition_probs.ndim != 3:
            raise ValueError(
                "transition_probs must be indexed [s, a, s'] (3-D), got "
                f"{self.transition_probs.ndim}-D"
            )
        if self.rewards.ndim not in (2, 3):
            raise ValueError(
                "rewards must be indexed [s, a] or [s, a, s'], got "
                f"{self.rewards.ndim}-D"
            )

    def value_iteration(self, tol=1e-6, max_iter=None):
        """Compute the optimal state-value function.

        Repeatedly applies the Bellman optimality backup
        ``V(s) = max_a sum_s' P(s'|s,a) * [R + gamma * V(s')]`` where ``R`` is
        ``R(s, a)`` or ``R(s, a, s')`` depending on the shape of ``rewards``.
        States are updated in place, so later states in a sweep already see
        the new values of earlier ones.

        Args:
            tol: Stop once the largest change of any ``V(s)`` during a full
                sweep is below this threshold.
            max_iter: Optional cap on the number of sweeps. ``None`` (the
                default) iterates until the ``tol`` criterion is met, which
                may never happen for ``gamma >= 1`` or ``tol <= 0``.

        Returns:
            A 1-D float array of length ``num_states``. If ``max_iter`` was
            reached first, this is the current (unconverged) estimate.
        """
        n = self.num_states
        gamma = self.gamma
        # Re-coerce in case the attributes were reassigned after construction.
        P = np.asarray(self.transition_probs)
        R = np.asarray(self.rewards)
        if R.ndim == 2:
            # Trailing length-1 axis lets R(s, a) broadcast across all s'.
            R = R[:, :, np.newaxis]

        V = np.zeros(n)
        sweeps = 0
        while max_iter is None or sweeps < max_iter:
            delta = 0
            for s in range(n):
                previous = V[s]
                best = float('-inf')
                for a in range(self.num_actions):
                    # Expected one-step return of taking action a in state s.
                    action_value = np.dot(P[s, a, :n], R[s, a, :n] + gamma * V)
                    best = max(best, action_value)
                V[s] = best
                delta = max(delta, abs(previous - V[s]))
            sweeps += 1
            if delta < tol:
                break
        return V

# Example usage: a small MDP with 3 states and 2 actions.
# transition_probs[s, a, s'] is P(s' | s, a); every [s, a] row sums to 1.
transition_probs = np.array([
    [[0.5, 0.5, 0.0], [1.0, 0.0, 0.0]],  # from state 0
    [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]],  # from state 1
    [[0.0, 1.0, 0.0], [0.5, 0.5, 0.0]],  # from state 2
])
# rewards[s, a] is the reward for taking action a in state s.
rewards = np.array([
    [1.0, 2.0],
    [0.0, 0.0],
    [5.0, -1.0],
])
# Read the problem size off the transition tensor rather than repeating it.
num_states, num_actions = transition_probs.shape[:2]

mdp = MDP(
    num_states=num_states,
    num_actions=num_actions,
    transition_probs=transition_probs,
    rewards=rewards,
)
V_optimal = mdp.value_iteration()
print("Optimal value function:", V_optimal)