<a href="https://colab.research.google.com/github/Imran-co/Machine-Intelligence--2-/blob/main/lab_cycle2_q3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Write a Python program to model a Markov Decision Process (MDP) with 3
states and 2 actions, using a predefined transition probability matrix and reward
function. Initialize a uniform stochastic policy where each action has a
probability of 0.5 in every state. Compute the value function for each state under
this policy (policy evaluation) using the Bellman expectation equation, with a
discount factor of 0.9. Provide a sample MDP with explicit transition and reward
values, and demonstrate the calculated value function.

In [None]:
import numpy as np

class MDP:
  def __init__(self, states, actions, rewards, transition, gamma =0.9):
    self.states = states
    self.actions = actions
    self.rewards = rewards
    self.transition = transition
    self.gamma = gamma
  def policy_evaluation(self, policy, theta=1e-6, max_iterations=1000):
    Vf = {s: 0 for s in self.states}
    for i in range(max_iterations):
      delta = 0
      for s in self.states:
        v = Vf[s]
        new_v = 0
        for a in self.actions:
          action_prob = policy[s][a]
          for s_ in self.states:
            new_v += action_prob * self.transition[(s, a, s_)] * (self.rewards[(s, a, s_)] + self.gamma * Vf[s_])

          Vf[s] = new_v
          delta = max(delta, abs(v - Vf[s]))

        if delta < theta:
          break

      return Vf

states = ['s1', 's2', 's3']
actions = ['a1', 'a2']


transitions = {
    # From s1
    ('s1', 'a1', 's1'): 0.2, ('s1', 'a1', 's2'): 0.7, ('s1', 'a1', 's3'): 0.1,
    ('s1', 'a2', 's1'): 0.6, ('s1', 'a2', 's2'): 0.3, ('s1', 'a2', 's3'): 0.1,

    # From s2
    ('s2', 'a1', 's1'): 0.1, ('s2', 'a1', 's2'): 0.8, ('s2', 'a1', 's3'): 0.1,
    ('s2', 'a2', 's1'): 0.3, ('s2', 'a2', 's2'): 0.4, ('s2', 'a2', 's3'): 0.3,

    # From s3
    ('s3', 'a1', 's1'): 0.4, ('s3', 'a1', 's2'): 0.4, ('s3', 'a1', 's3'): 0.2,
    ('s3', 'a2', 's1'): 0.1, ('s3', 'a2', 's2'): 0.1, ('s3', 'a2', 's3'): 0.8,
}


rewards = {
    # From s1
    ('s1', 'a1', 's1'): 1, ('s1', 'a1', 's2'): 2, ('s1', 'a1', 's3'): -1,
    ('s1', 'a2', 's1'): 0, ('s1', 'a2', 's2'): 1, ('s1', 'a2', 's3'): 1,

    # From s2
    ('s2', 'a1', 's1'): 1, ('s2', 'a1', 's2'): 3, ('s2', 'a1', 's3'): 0,
    ('s2', 'a2', 's1'): -1, ('s2', 'a2', 's2'): 2, ('s2', 'a2', 's3'): 1,

    # From s3
    ('s3', 'a1', 's1'): 3, ('s3', 'a1', 's2'): 1, ('s3', 'a1', 's3'): 0,
    ('s3', 'a2', 's1'): -2, ('s3', 'a2', 's2'): 0, ('s3', 'a2', 's3'): 2,
}

mdp = MDP(states, actions, transitions, rewards, gamma=0.9)

uniform_policy = {
    's1': {'a1': 0.5, 'a2': 0.5},
    's2': {'a1': 0.5, 'a2': 0.5},
    's3': {'a1': 0.5, 'a2': 0.5}
}

vf = mdp.policy_evaluation(uniform_policy)

In [None]:
print("MDP with 3 states and 2 actions:")
print("States:", states)
print("Actions:", actions)

MDP with 3 states and 2 actions:
States: ['s1', 's2', 's3']
Actions: ['a1', 'a2']


In [None]:
print("P(s2|s1,a1) =", transitions[('s1', 'a1', 's2')])
print("P(s3|s2,a2) =", transitions[('s2', 'a2', 's3')])

P(s2|s1,a1) = 0.7
P(s3|s2,a2) = 0.3


In [None]:
print("R(s1,a1,s2) =", rewards[('s1', 'a1', 's2')])
print("R(s3,a2,s1) =", rewards[('s3', 'a2', 's1')])

R(s1,a1,s2) = 2
R(s3,a2,s1) = -2


In [None]:
for state, action_probs in uniform_policy.items():
    print(f"{state}: {action_probs}")

s1: {'a1': 0.5, 'a2': 0.5}
s2: {'a1': 0.5, 'a2': 0.5}
s3: {'a1': 0.5, 'a2': 0.5}


In [None]:
for state, value in vf.items():
    print(f"V({state}) = {value:.3f}")

V(s1) = 0.950
V(s2) = 3.160
V(s3) = 6.503
