<a href="https://colab.research.google.com/github/ManishSuhas0026/Reinforcement-Learning/blob/main/530_RL_Lab_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np

In [3]:
class MDP:
    """Finite Markov decision process described by nested dictionaries.

    Attributes:
        states: Collection of state identifiers.
        actions: Collection of action identifiers.
        transition_probs: Nested lookup ``state -> action -> {next_state: prob}``.
        rewards: Mapping ``state -> {action: reward}``.
        gamma: Discount factor.
    """

    def __init__(self, states, actions, transition_probs, rewards, gamma):
        self.states = states
        self.actions = actions
        self.transition_probs = transition_probs
        self.rewards = rewards
        self.gamma = gamma

    def get_transition_prob(self, state, action, next_state):
        """Return the probability of reaching ``next_state`` from ``state`` via ``action``.

        Any entry missing from the table (unknown state, unknown action, or an
        unlisted successor) is treated as probability 0, mirroring how
        ``get_reward`` treats missing rewards. This lets sparse tables omit
        terminal states or unavailable actions instead of raising ``KeyError``.
        """
        try:
            outcomes = self.transition_probs[state][action]
        except KeyError:
            # No transitions recorded for this (state, action) pair.
            return 0
        return outcomes.get(next_state, 0)

    def get_reward(self, state, action):
        """Return the reward for taking ``action`` in ``state`` (0 if unspecified)."""
        return self.rewards.get(state, {}).get(action, 0)

In [4]:
def value_iteration(mdp, threshold=1e-6, max_iterations=None):
    """Compute state values and a greedy policy by synchronous value iteration.

    Every sweep applies the Bellman optimality backup
    ``V(s) <- max_a sum_s' P(s'|s,a) * (R(s,a) + gamma * V(s'))`` to each state,
    reading only the values produced by the previous sweep. Iteration stops
    once the largest change in any state's value is below ``threshold``.

    Args:
        mdp: Object exposing ``states``, ``actions``, ``gamma``,
            ``get_transition_prob(state, action, next_state)`` and
            ``get_reward(state, action)``.
        threshold: Convergence tolerance on the largest per-sweep value change.
        max_iterations: Optional upper bound on the number of sweeps. ``None``
            (the default) iterates until convergence. Supply a bound when
            convergence is not guaranteed (for example ``gamma >= 1``), since
            the loop would otherwise never terminate.

    Returns:
        A ``(policy, values)`` tuple of dicts keyed by state. ``policy`` maps
        each state to the maximizing action (ties go to the action listed first
        in ``mdp.actions``); ``values`` holds the value estimates from the final
        sweep. A state for which no action could be selected keeps value 0 and
        policy ``None``.
    """
    # Initialize values arbitrarily, typically to zero
    values = {state: 0 for state in mdp.states}
    policy = {state: None for state in mdp.states}

    sweeps = 0
    while max_iterations is None or sweeps < max_iterations:
        sweeps += 1
        max_delta = 0
        new_values = values.copy()

        for state in mdp.states:
            max_value = float('-inf')
            best_action = None

            # Check all possible actions to find the best one
            for action in mdp.actions:
                # The reward does not depend on the successor, so look it up once.
                reward = mdp.get_reward(state, action)

                # Expected value of taking 'action' in 'state'
                action_value = sum(
                    mdp.get_transition_prob(state, action, next_state) *
                    (reward + mdp.gamma * values[next_state])
                    for next_state in mdp.states
                )

                # Strict '>' keeps the earliest action on ties
                if action_value > max_value:
                    max_value = action_value
                    best_action = action

            if best_action is None:
                # No action available: leave the value untouched rather than
                # writing -inf, which would turn later backups into NaN.
                continue

            # Update the value of the state
            new_values[state] = max_value
            policy[state] = best_action

            # Track the largest change for the convergence check
            max_delta = max(max_delta, abs(new_values[state] - values[state]))

        # Update values for the next iteration
        values = new_values

        # Stop if the values have converged
        if max_delta < threshold:
            break

    return policy, values

In [5]:
# --- Problem definition: a three-state chain with two actions ---------------
states, actions = ['A', 'B', 'C'], ['left', 'right']

# transition_probs[state][action] -> {next_state: probability}
transition_probs = {
    'A': {
        'left': {'A': 1.0},
        'right': {'B': 1.0},
    },
    'B': {
        'left': {'A': 0.5, 'B': 0.5},
        'right': {'C': 1.0},
    },
    'C': {
        'left': {'B': 1.0},
        'right': {'C': 1.0},
    },
}

# rewards[state][action]: 'left' pays nothing, 'right' pays 1, 2, 3 for A, B, C.
rewards = {
    name: {'left': 0, 'right': payoff}
    for name, payoff in zip(states, (1, 2, 3))
}

gamma = 0.9

# --- Solve -------------------------------------------------------------------
mdp = MDP(states, actions, transition_probs, rewards, gamma)
optimal_policy, optimal_values = value_iteration(mdp)

# --- Report ------------------------------------------------------------------
print("Optimal Policy:")
for state, action in optimal_policy.items():
    print(f"  State {state}: {action}")

print("\nOptimal Values:")
for state, value in optimal_values.items():
    print(f"  State {state}: {value}")

Optimal Policy:
  State A: right
  State B: right
  State C: right

Optimal Values:
  State A: 27.09999141381772
  State B: 28.99999141381772
  State C: 29.99999141381772
