<a href="https://colab.research.google.com/github/MD03/RLT/blob/main/RLT_DP_PE_and_PI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Dynamic programming (DP): policy evaluation (PE) and policy iteration (PI)
import numpy as np
import matplotlib.pyplot as plt

class MDP:
    """Finite Markov decision process solved by policy iteration.

    Attributes:
        num_states: Number of states; states are the integers
            ``0 .. num_states - 1``.
        num_actions: Number of actions; actions are the integers
            ``0 .. num_actions - 1``.
        transition_probs: Float array indexed as ``[s, a, s_next]``.
        rewards: Float array indexed as ``[s, a]``.
        gamma: Discount factor applied to the value of the next state.
    """

    def __init__(self, num_states, num_actions, transition_probs, rewards, gamma=0.9):
        """Store the model, converting the tables to NumPy arrays.

        Args:
            num_states: Number of states.
            num_actions: Number of actions.
            transition_probs: Array-like of shape
                ``(num_states, num_actions, num_states)``. Whether each
                ``[s, a]`` row sums to 1 is not checked.
            rewards: Array-like of shape ``(num_states, num_actions)``.
            gamma: Discount factor.

        Raises:
            ValueError: If an array's shape does not match ``num_states`` /
                ``num_actions``.
        """
        self.num_states = num_states
        self.num_actions = num_actions
        # The methods below index with tuples (``[s, a]``), which nested lists
        # do not support, so normalise every input to an ndarray up front.
        self.transition_probs = np.asarray(transition_probs, dtype=float)
        self.rewards = np.asarray(rewards, dtype=float)
        self.gamma = gamma

        expected = (num_states, num_actions, num_states)
        if self.transition_probs.shape != expected:
            raise ValueError(
                f"transition_probs must have shape {expected}, "
                f"got {self.transition_probs.shape}"
            )
        expected = (num_states, num_actions)
        if self.rewards.shape != expected:
            raise ValueError(
                f"rewards must have shape {expected}, got {self.rewards.shape}"
            )

    def policy_evaluation(self, policy, tol=1e-6):
        """Iteratively estimate the state values of a deterministic policy.

        Values start at zero and are updated in place, one state at a time,
        so states later in a sweep already see the values refreshed earlier
        in the same sweep.

        Args:
            policy: Sequence of length ``num_states`` giving the integer
                action taken in each state.
            tol: Sweeps stop once the largest change of any state value in
                a sweep is below this threshold.

        Returns:
            A tuple ``(V, delta_values)``: ``V`` is the array of state
            values and ``delta_values`` lists the largest per-sweep change,
            one entry per sweep.
        """
        states = np.arange(self.num_states)
        chosen = np.asarray(policy)
        # Rows selected by the policy never change during evaluation, so
        # extract them once instead of re-indexing on every sweep.
        probs = self.transition_probs[states, chosen]
        reward = self.rewards[states, chosen]

        V = np.zeros(self.num_states)
        delta_values = []
        while True:
            delta = 0
            for s in range(self.num_states):
                previous = V[s]
                # The right-hand side is fully computed before V[s] is
                # overwritten, so it still uses the old V[s].
                V[s] = probs[s] @ (reward[s] + self.gamma * V)
                delta = max(delta, abs(previous - V[s]))
            delta_values.append(delta)
            if delta < tol:
                break
        return V, delta_values

    def policy_iteration(self, tol=1e-6):
        """Alternate evaluation and greedy improvement until the policy is stable.

        The search starts from the policy that picks action 0 everywhere.
        Ties between actions are resolved in favour of the lowest action
        index (``np.argmax`` behaviour).

        Args:
            tol: Tolerance forwarded to ``policy_evaluation``.

        Returns:
            Integer array of length ``num_states`` with the selected action
            for each state.
        """
        policy = np.zeros(self.num_states, dtype=int)
        visited = {tuple(policy.tolist())}
        while True:
            V, _ = self.policy_evaluation(policy, tol)
            improved = np.array(
                [np.argmax(self._bellman_operator(s, V)) for s in range(self.num_states)],
                dtype=int,
            )
            if np.array_equal(improved, policy):
                break
            policy = improved
            # Each step is a deterministic function of the current policy, so
            # revisiting a policy means the loop would cycle forever (possible
            # when evaluation error flips a near-tie). Stop instead of hanging.
            key = tuple(policy.tolist())
            if key in visited:
                break
            visited.add(key)
        return policy

    def _bellman_operator(self, state, V):
        """Return the one-step lookahead value of every action in ``state``.

        Args:
            state: Index of the state to back up.
            V: Sequence of current state values, length ``num_states``.

        Returns:
            Float array ``Q`` of length ``num_actions`` where ``Q[a]`` is the
            sum over next states of
            ``transition_probs[state, a, s_next] * (rewards[state, a] + gamma * V[s_next])``.
        """
        # Broadcast to (num_actions, num_states): one row per action.
        backup = self.rewards[state][:, None] + self.gamma * np.asarray(V, dtype=float)[None, :]
        return (self.transition_probs[state] * backup).sum(axis=1)

# Example problem: three states, two actions.
num_states = 3
num_actions = 2

# transition_probs[s, a, s_next]: one block per state, one row per action.
transition_probs = np.array([
    [[0.5, 0.5, 0.0], [1.0, 0.0, 0.0]],
    [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]],
    [[0.0, 1.0, 0.0], [0.5, 0.5, 0.0]],
])

# rewards[s, a]: one row per state, one column per action.
rewards = np.array([
    [1.0, 2.0],
    [0.0, 0.0],
    [5.0, -1.0],
])

mdp = MDP(
    num_states=num_states,
    num_actions=num_actions,
    transition_probs=transition_probs,
    rewards=rewards,
)

# Run policy iteration, then evaluate the resulting policy once more to
# collect the per-sweep deltas for the plot.
policy = mdp.policy_iteration()
V, delta_values = mdp.policy_evaluation(policy)

# Plot how the largest per-sweep value change shrinks over the sweeps.
plt.plot(delta_values)
plt.title('Convergence of Value Function during Policy Evaluation')
plt.ylabel('Delta (Convergence)')
plt.xlabel('Iterations')
plt.show()