<a href="https://colab.research.google.com/github/KotlaGeethika/pai/blob/main/internal2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# MDP
import numpy as np
class MDP:
    """Finite Markov decision process solved with policy iteration.

    Args:
        states: Sequence of hashable state labels.
        actions: Sequence of hashable action labels.
        transition_probs: Nested mapping read as ``P[s][a][s1]``; an entry
            is looked up for every ``s1`` in ``states``.
        rewards: Nested mapping read as ``R[s][a]``.
        gamma: Discount factor applied to the value of the next state.
    """

    def __init__(self, states, actions, transition_probs, rewards, gamma=0.9):
        self.states = states
        self.actions = actions
        self.P = transition_probs
        self.R = rewards
        self.gamma = gamma

    def _action_value(self, s, a, V, position):
        """Return the one-step backup of taking action ``a`` in state ``s``.

        ``position`` maps a state label to its slot in the value array ``V``.
        """
        reward = self.R[s][a]
        return sum(
            self.P[s][a][s1] * (reward + self.gamma * V[position[s1]])
            for s1 in self.states
        )

    def policy_iteration(self):
        """Alternate policy evaluation and greedy improvement until stable.

        Returns:
            A ``(policy, V)`` pair of NumPy arrays ordered like
            ``self.states``: ``policy[i]`` is the action label chosen for
            the i-th state and ``V[i]`` is that state's estimated value.
        """
        tolerance = 1e-6
        # States and actions are tracked by position so that arbitrary
        # labels work; with labels 0..n-1 positions and labels coincide.
        position = {s: i for i, s in enumerate(self.states)}
        chosen = np.zeros(len(self.states), dtype=int)  # index into self.actions
        V = np.zeros(len(self.states))

        while True:
            # Policy evaluation: in-place sweeps until the largest update is
            # below the tolerance. V is kept between rounds (warm start).
            while True:
                delta = 0
                for s in self.states:
                    i = position[s]
                    previous = V[i]
                    V[i] = self._action_value(s, self.actions[chosen[i]], V, position)
                    delta = max(delta, abs(previous - V[i]))
                if delta < tolerance:
                    break

            # Policy improvement: act greedily with respect to V.
            policy_stable = True
            for s in self.states:
                i = position[s]
                action_values = [
                    self._action_value(s, a, V, position) for a in self.actions
                ]
                best = int(np.argmax(action_values))
                if best != chosen[i]:
                    chosen[i] = best
                    policy_stable = False
            if policy_stable:
                break

        labels = [self.actions[k] for k in chosen]
        # An empty label list would become a float array; keep the int one.
        policy = np.array(labels) if labels else chosen
        return policy, V
# Example problem: three states and two actions, both labelled 0..n-1.
states = list(range(3))
actions = list(range(2))

# transition_probs[s][a][s1]: probability of moving from s to s1 under a.
# Every successor is listed explicitly, including the zero-probability ones.
transition_probs = {
    0: {0: {0: 0.8, 1: 0.2, 2: 0.0}, 1: {0: 0.1, 1: 0.0, 2: 0.9}},
    1: {0: {0: 0.7, 1: 0.3, 2: 0.0}, 1: {0: 0.2, 1: 0.0, 2: 0.8}},
    2: {0: {0: 0.9, 1: 0.1, 2: 0.0}, 1: {0: 0.3, 1: 0.0, 2: 0.7}},
}

# rewards[s][a]: reward for taking action a in state s.
rewards = {0: {0: 5, 1: 10}, 1: {0: 0, 1: 1}, 2: {0: 0, 1: 5}}

mdp = MDP(
    states=states,
    actions=actions,
    transition_probs=transition_probs,
    rewards=rewards,
)
policy, V_pi = mdp.policy_iteration()

# Report the chosen action and value estimate for each state.
print("\nPolicy Iteration:")
for s in states:
    print(f"  π({s}) = {policy[s]}, V({s}) = {V_pi[s]:.4f}")


Policy Iteration:
  π(0) = 1, V(0) = 65.6780
  π(1) = 1, V(1) = 57.0593
  π(2) = 1, V(2) = 61.4407


In [None]:
import numpy as np

class MDP:
    """Tabular Markov decision process with a policy-iteration solver.

    State labels index the value and policy arrays directly, and the
    position of the best action is stored as the action itself, so states
    and actions are expected to be the integers ``0..n-1``.

    Args:
        states: Sequence of state indices.
        actions: Sequence of action indices.
        transition_probs: ``P[s][a]`` is a dict ``{s1: prob}``; successors
            that are missing count as probability 0.
        rewards: ``R[s]`` is a dict ``{a: reward}``; a missing action
            counts as reward 0.
        gamma: Discount factor.
        theta: Policy evaluation stops once the largest value change in a
            sweep is below this threshold.
    """

    def __init__(self, states, actions, transition_probs, rewards, gamma=0.9, theta=1e-6):
        self.states, self.actions = states, actions
        self.P = transition_probs
        self.R = rewards
        self.gamma, self.theta = gamma, theta

    def _expected_return(self, s, a, V):
        """Return the one-step backup of action ``a`` in state ``s`` under ``V``."""
        outcomes = self.P[s][a]
        reward = self.R[s].get(a, 0)
        return sum(
            outcomes.get(nxt, 0) * (reward + self.gamma * V[nxt])
            for nxt in self.states
        )

    def _evaluate_policy(self, policy):
        """Estimate the value of ``policy`` with in-place sweeps from zero."""
        V = np.zeros(len(self.states))
        converged = False
        while not converged:
            largest_change = 0
            for s in self.states:
                action = policy[s]
                before = V[s]
                V[s] = self._expected_return(s, action, V)
                largest_change = max(largest_change, abs(before - V[s]))
            converged = largest_change < self.theta
        return V

    def _improve_policy(self, V, policy):
        """Make ``policy`` greedy w.r.t. ``V`` in place; return True if unchanged."""
        unchanged = True
        for s in self.states:
            previous = policy[s]
            scores = [self._expected_return(s, a, V) for a in self.actions]
            greedy = np.argmax(scores)
            policy[s] = greedy
            if previous != greedy:
                unchanged = False
        return unchanged

    def policy_iteration(self):
        """Return ``(policy, V)`` once greedy improvement changes nothing."""
        policy = np.zeros(len(self.states), dtype=int)
        V = self._evaluate_policy(policy)
        while not self._improve_policy(V, policy):
            V = self._evaluate_policy(policy)
        return policy, V

# Demo: the three-state, two-action problem, with zero-probability
# successors simply left out of the transition table.
states = [0, 1, 2]
actions = [0, 1]

# transition_probs[s][a] maps each reachable successor to its probability.
transition_probs = {
    0: {
        0: {0: 0.8, 1: 0.2},
        1: {0: 0.1, 2: 0.9},
    },
    1: {
        0: {0: 0.7, 1: 0.3},
        1: {0: 0.2, 2: 0.8},
    },
    2: {
        0: {0: 0.9, 1: 0.1},
        1: {0: 0.3, 2: 0.7},
    },
}

# rewards[s][a] is the reward for taking action a in state s.
rewards = {
    0: {0: 5, 1: 10},
    1: {0: 0, 1: 1},
    2: {0: 0, 1: 5},
}

mdp = MDP(
    states=states,
    actions=actions,
    transition_probs=transition_probs,
    rewards=rewards,
)
policy, V = mdp.policy_iteration()

# Report the chosen action and value estimate for each state.
print("\nPolicy Iteration Result:")
for s in states:
    print(f"  π({s}) = {policy[s]}, V({s}) = {V[s]:.4f}")



Policy Iteration Result:
  π(0) = 1, V(0) = 65.6780
  π(1) = 1, V(1) = 57.0593
  π(2) = 1, V(2) = 61.4407


In [None]:
import numpy as np

class MDP:
    """Tabular Markov decision process with a policy-iteration solver.

    State labels index the value and policy arrays directly, and the
    position of the best action is stored as the action itself, so states
    and actions are expected to be the integers ``0..n-1``.

    Args:
        states: Sequence of state indices.
        actions: Sequence of action indices.
        transition_probs: ``P[s][a]`` is a dict ``{s1: prob}``; successors
            that are missing count as probability 0.
        rewards: Optional ``R[s][a]`` mapping. When omitted, every
            state/action pair starts with reward 0.
        gamma: Discount factor.
        theta: Policy evaluation stops once the largest value change in a
            sweep is below this threshold.
    """

    def __init__(self, states, actions, transition_probs, rewards=None, gamma=0.9, theta=1e-6):
        self.states = states
        self.actions = actions
        self.P = transition_probs  # P[s][a][s'] = prob
        self.R = rewards if rewards is not None else {s: {a: 0 for a in actions} for s in states}
        self.gamma = gamma
        self.theta = theta

    def set_rewards(self, new_rewards):
        """Replace the reward table with the values from ``new_rewards``.

        Every state/action pair absent from ``new_rewards`` gets reward 0.
        A fresh table is built so that a dict the caller passed to the
        constructor is never modified behind their back.
        """
        self.R = {
            s: {a: new_rewards.get(s, {}).get(a, 0) for a in self.actions}
            for s in self.states
        }

    def _action_value(self, s, a, V):
        """Return the one-step backup of action ``a`` in state ``s`` under ``V``."""
        outcomes = self.P[s][a]
        reward = self.R[s].get(a, 0)
        return sum(
            outcomes.get(s1, 0) * (reward + self.gamma * V[s1])
            for s1 in self.states
        )

    def _evaluate_policy(self, policy):
        """Estimate the value of ``policy`` with in-place sweeps from zero.

        Raises:
            ValueError: If the estimates become inf/nan, which happens when
                the transition model or discount does not yield a
                contraction (for example, probabilities summing to more
                than 1).
        """
        V = np.zeros(len(self.states))
        # Overflow is reported by the explicit check below instead of by
        # NumPy runtime warnings followed by meaningless results.
        with np.errstate(over="ignore", invalid="ignore"):
            while True:
                delta = 0
                for s in self.states:
                    v = V[s]
                    V[s] = self._action_value(s, policy[s], V)
                    delta = max(delta, abs(v - V[s]))
                if not np.all(np.isfinite(V)):
                    raise ValueError(
                        "Policy evaluation diverged: value estimates became "
                        "non-finite. Check that every P[s][a] sums to 1 and "
                        "that gamma is below 1."
                    )
                if delta < self.theta:
                    break
        return V

    def _improve_policy(self, V, policy):
        """Make ``policy`` greedy w.r.t. ``V`` in place; return True if unchanged."""
        policy_stable = True
        for s in self.states:
            old_action = policy[s]
            action_values = [self._action_value(s, a, V) for a in self.actions]
            best_action = np.argmax(action_values)
            policy[s] = best_action
            if old_action != best_action:
                policy_stable = False
        return policy_stable

    def policy_iteration(self):
        """Alternate evaluation and greedy improvement until the policy is stable.

        Returns:
            ``(policy, V)``: integer array of chosen actions and the array
            of value estimates, both indexed by state.

        Raises:
            ValueError: If policy evaluation diverges (see
                ``_evaluate_policy``).
        """
        policy = np.zeros(len(self.states), dtype=int)
        while True:
            V = self._evaluate_policy(policy)
            if self._improve_policy(V, policy):
                break
        return policy, V

def _prompt_until_valid(prompt, parse, is_valid, rejection_message):
    """Ask ``prompt`` until the reply parses with ``parse`` and passes ``is_valid``."""
    while True:
        try:
            value = parse(input(prompt))
        except ValueError:
            print("    ➤ Invalid input. Please enter a number.")
            continue
        if is_valid(value):
            return value
        print(rejection_message)


def get_dynamic_input():
    """Interactively read an MDP description from standard input.

    Prompts for the number of states and actions, then for every
    ``P[s][a][s1]`` and every ``R[s][a]``. Each prompt is repeated until
    the reply is acceptable, and a transition row is asked for again when
    its probabilities do not sum to 1.

    Returns:
        ``(states, actions, transition_probs, rewards)`` where ``states``
        and ``actions`` are ``list(range(n))``, ``transition_probs[s][a]``
        maps each successor with non-zero probability to that probability,
        and ``rewards[s][a]`` is a float.
    """
    count_message = "    ➤ Enter a whole number greater than 0."
    num_states = _prompt_until_valid(
        "Enter number of states: ", int, lambda n: n > 0, count_message
    )
    num_actions = _prompt_until_valid(
        "Enter number of actions: ", int, lambda n: n > 0, count_message
    )

    states = list(range(num_states))
    actions = list(range(num_actions))

    print("\nEnter transition probabilities P[s][a][s'] (you'll be asked one value at a time):")
    transition_probs = {s: {a: {} for a in actions} for s in states}
    for s in states:
        for a in actions:
            print(f"\nFor state {s}, action {a}:")
            while True:
                row = {}
                for s1 in states:
                    # The chained comparison also rejects NaN.
                    prob = _prompt_until_valid(
                        f"  P[{s}][{a}][{s1}] (enter 0 to skip): ",
                        float,
                        lambda p: 0 <= p <= 1,
                        "    ➤ Enter a probability between 0 and 1.",
                    )
                    if prob > 0:
                        row[s1] = prob
                total = sum(row.values())
                # Small tolerance so decimal inputs such as 0.1 + 0.2 + 0.7 pass.
                if abs(total - 1.0) <= 1e-6:
                    break
                print(
                    f"    ➤ Probabilities for state {s}, action {a} sum to "
                    f"{total:g}, not 1. Please re-enter this row."
                )
            transition_probs[s][a] = row

    print("\nEnter rewards R[s][a]:")
    rewards = {s: {} for s in states}
    for s in states:
        for a in actions:
            rewards[s][a] = _prompt_until_valid(
                f"  R[{s}][{a}] = ", float, lambda r: True, ""
            )

    return states, actions, transition_probs, rewards


# === Main Program ===
if __name__ == "__main__":
    print("Dynamic MDP Policy Iteration Setup")

    # Collect the whole model interactively.
    (states, actions, transition_probs, rewards) = get_dynamic_input()

    # Start from the default all-zero rewards, then install the entered ones.
    mdp = MDP(states=states, actions=actions, transition_probs=transition_probs)
    mdp.set_rewards(rewards)

    policy, V = mdp.policy_iteration()

    # Report the chosen action and value estimate for each state.
    print("\nPolicy Iteration Result:")
    for s in states:
        print(f"  π({s}) = {policy[s]}, V({s}) = {V[s]:.4f}")


Dynamic MDP Policy Iteration Setup
Enter number of states: 3
Enter number of actions: 2

Enter transition probabilities P[s][a][s'] (you'll be asked one value at a time):

For state 0, action 0:
  P[0][0][0] (enter 0 to skip): 1
  P[0][0][1] (enter 0 to skip): 2
    ➤ Enter a probability between 0 and 1.
  P[0][0][1] (enter 0 to skip): 0.23
  P[0][0][2] (enter 0 to skip): 0.4

For state 0, action 1:
  P[0][1][0] (enter 0 to skip): 0.6
  P[0][1][1] (enter 0 to skip): 0.55
  P[0][1][2] (enter 0 to skip): 0.2

For state 1, action 0:
  P[1][0][0] (enter 0 to skip): 0.7
  P[1][0][1] (enter 0 to skip): 0.4
  P[1][0][2] (enter 0 to skip): 0.2

For state 1, action 1:
  P[1][1][0] (enter 0 to skip): 0.4
  P[1][1][1] (enter 0 to skip): 0.22
  P[1][1][2] (enter 0 to skip): 0.5

For state 2, action 0:
  P[2][0][0] (enter 0 to skip): 0.6
  P[2][0][1] (enter 0 to skip): 0.55
  P[2][0][2] (enter 0 to skip): 0.3

For state 2, action 1:
  P[2][1][0] (enter 0 to skip): 0.6
  P[2][1][1] (enter 0 to skip)

  V[s] = sum(
  delta = max(delta, abs(v - V[s]))
