# Question
Lab 4: Develop a program that performs policy evaluation and policy improvement (i.e., policy iteration) for a given Markov decision process (MDP).

In [None]:
# Libraries
import numpy as np


In [None]:
class MDP:
    """Finite Markov decision process solved by tabular policy iteration.

    The model is supplied as nested containers indexed
    ``[state][action][next_state]``. When the innermost containers are
    dictionaries, next states that cannot be reached may simply be left out
    of ``transition_prob[state][action]``; they count as probability 0 and
    the matching reward entry is never read.

    Public attributes:
        policy: dict mapping each state to the currently selected action.
        value: dict mapping each state to its current value estimate.
    """

    def __init__(self, states, actions, transition_prob, rewards, gamma=0.9, theta=1e-5):
        """Store the model and start from a random policy with zero values.

        Args:
            states: Sequence of hashable state identifiers.
            actions: Indexable sequence of hashable action identifiers.
            transition_prob: ``transition_prob[s][a][s_next]`` is the
                probability of moving from ``s`` to ``s_next`` under ``a``.
            rewards: ``rewards[s][a][s_next]`` is the reward received on that
                transition.
            gamma: Discount factor applied to the value of the next state.
            theta: Policy evaluation stops once the largest per-state change
                in a sweep drops below this threshold.
        """
        self.states = states
        self.actions = actions
        self.transition_prob = transition_prob
        self.rewards = rewards
        self.gamma = gamma
        self.theta = theta
        # Select by index instead of np.random.choice(actions): choice() first
        # converts the list to an array, which rejects tuple-valued actions
        # ("a must be 1-dimensional") and hands back NumPy scalar copies
        # rather than the caller's own action objects.
        self.policy = {s: actions[np.random.randint(len(actions))] for s in states}
        self.value = {s: 0.0 for s in states}

    def _action_value(self, s, a):
        """Return the expected one-step return of taking ``a`` in ``s``.

        Computes ``sum_s' P(s'|s,a) * (R(s,a,s') + gamma * V(s'))`` using the
        current value estimates.
        """
        probs = self.transition_prob[s][a]
        total = 0.0
        for s_next in self.states:
            try:
                p = probs[s_next]
            except KeyError:
                # Sparse table: an absent next state is unreachable.
                continue
            if p:
                # Zero-probability terms contribute nothing, so their reward
                # entries do not need to exist.
                total += p * (self.rewards[s][a][s_next] +
                              self.gamma * self.value[s_next])
        return total

    def policy_evaluation(self):
        """Update ``self.value`` in place until it fits the current policy.

        Sweeps over all states repeatedly, reusing values already updated in
        the same sweep, and stops when the largest change in a sweep is below
        ``self.theta``.
        """
        while True:
            delta = 0
            for s in self.states:
                v = self.value[s]
                self.value[s] = self._action_value(s, self.policy[s])
                delta = max(delta, abs(v - self.value[s]))
            if delta < self.theta:
                break

    def policy_improvement(self):
        """Make ``self.policy`` greedy with respect to ``self.value``.

        Ties are resolved in favour of the action that appears first in
        ``self.actions``.

        Returns:
            True if no state changed its action, False otherwise.
        """
        policy_stable = True
        for s in self.states:
            old_action = self.policy[s]
            action_values = {a: self._action_value(s, a) for a in self.actions}
            self.policy[s] = max(action_values, key=action_values.get)
            if old_action != self.policy[s]:
                policy_stable = False
        return policy_stable

    def policy_iteration(self):
        """Alternate evaluation and improvement until the policy is stable.

        Returns:
            A ``(policy, value)`` tuple holding the final policy and value
            dictionaries (the same objects as ``self.policy`` and
            ``self.value``).
        """
        while True:
            self.policy_evaluation()
            if self.policy_improvement():
                break
        return self.policy, self.value

# Example usage:
if __name__ == "__main__":
    # A three-state, two-action MDP in which 'S3' only loops back to itself
    # and yields no reward.
    states = ['S1', 'S2', 'S3']
    actions = ['A1', 'A2']

    def _per_next_state(*entries):
        """Pair each positional entry with the matching entry of `states`."""
        return dict(zip(states, entries))

    # transition_prob[s][a][s_next]: probability of landing in s_next when
    # action a is taken in state s.
    transition_prob = {
        'S1': {
            'A1': _per_next_state(0.5, 0.5, 0.0),
            'A2': _per_next_state(0.0, 1.0, 0.0),
        },
        'S2': {
            'A1': _per_next_state(0.0, 0.7, 0.3),
            'A2': _per_next_state(0.4, 0.0, 0.6),
        },
        'S3': {
            'A1': _per_next_state(0.0, 0.0, 1.0),
            'A2': _per_next_state(0.0, 0.0, 1.0),
        },
    }

    # rewards[s][a][s_next]: reward collected on the transition s -> s_next
    # under action a.
    rewards = {
        'S1': {
            'A1': _per_next_state(0, 1, 0),
            'A2': _per_next_state(0, 1, 0),
        },
        'S2': {
            'A1': _per_next_state(0, 0, 1),
            'A2': _per_next_state(0, 0, 2),
        },
        'S3': {
            'A1': _per_next_state(0, 0, 0),
            'A2': _per_next_state(0, 0, 0),
        },
    }

    # Solve with the default discount factor and convergence threshold.
    mdp = MDP(states, actions, transition_prob, rewards)
    policy, value = mdp.policy_iteration()

    print("Optimal Policy:", policy)
    print("Value Function:", value)




Optimal Policy: {'S1': 'A2', 'S2': 'A2', 'S3': 'A1'}
Value Function: {'S1': 3.076921274023741, 'S2': 2.307691658648547, 'S3': 0.0}
