### MARL Assignment 1: Question 1

Ananya Gandhi (20319)

In [14]:
from typing import Dict, List, Optional, Tuple

import numpy as np

In [15]:
class MarkovDecisionProcess:
    """Plain container for the pieces that define a finite MDP.

    The constructor only stores its arguments (by reference); nothing is
    copied, normalised or validated.

    Attributes:
        states: Labels of the states.
        actions: Labels of the actions.
        transition_probs: Nested mapping
            ``state -> action -> [(next_state, probability), ...]``.
        rewards: Mapping ``state -> reward``.
        discount_factor: Discount factor kept for use by the solvers.
    """

    def __init__(
        self,
        states: List[str],
        actions: List[str],
        transition_probs: Dict[str, Dict[str, List[Tuple[str, float]]]],
        rewards: Dict[str, float],
        discount_factor: float,
    ):
        # Structure of the model.
        self.states = states
        self.actions = actions
        self.transition_probs = transition_probs

        # Reward signal and how strongly future value is discounted.
        self.rewards = rewards
        self.discount_factor = discount_factor

In [16]:
class ValueIteration:
    """Value-iteration solver for an MDP.

    State values are updated in place, one state at a time, so states visited
    later in a sweep already see the refreshed values of earlier ones. Sweeps
    repeat until the largest change of any state value within a sweep drops
    below ``threshold``.

    The backup for a state/action pair is
    ``rewards[state] + discount_factor * sum(prob * V[next_state])``.

    Args:
        mdp: Object exposing ``states``, ``actions``, ``transition_probs``,
            ``rewards`` and ``discount_factor``.
        threshold: Convergence tolerance on the per-sweep value change.

    NOTE(review): there is no cap on the number of sweeps; presumably callers
    use ``discount_factor < 1`` and ``threshold > 0`` so the loop ends -
    confirm before using other settings.
    """

    # The annotation is quoted so it is resolved lazily: defining this class
    # does not require MarkovDecisionProcess to exist yet.
    def __init__(self, mdp: "MarkovDecisionProcess", threshold: float = 1e-5):
        self.mdp = mdp
        self.threshold = threshold

    def run(self) -> Tuple[Dict[str, float], Dict[str, str]]:
        """Iterate to convergence.

        Returns:
            ``(V, policy)`` where ``V`` maps each state to its value and
            ``policy`` maps each state to the greedy action found during the
            final sweep.
        """
        V = {state: 0.0 for state in self.mdp.states}
        policy = {state: None for state in self.mdp.states}

        while True:
            delta = 0.0
            for state in self.mdp.states:
                old_v = V[state]
                V[state], policy[state] = self._get_max_action_value(state, V)
                delta = max(delta, abs(old_v - V[state]))

            if delta < self.threshold:
                break

        return V, policy

    def _get_max_action_value(self, state: str, V: Dict[str, float]) -> Tuple[float, str]:
        """Return ``(best_value, best_action)`` for ``state`` under ``V``.

        Only the value is compared. When several actions share the best value
        the one listed first in ``mdp.actions`` wins, which is the same
        tie-break ``max(actions, key=...)`` gives. Comparing whole
        ``(value, action)`` tuples would instead let the action labels decide
        ties.

        Raises:
            ValueError: If ``mdp.actions`` is empty.
        """
        scored = [
            (self._calculate_action_value(state, action, V), action)
            for action in self.mdp.actions
        ]
        return max(scored, key=lambda pair: pair[0])

    def _calculate_action_value(self, state: str, action: str, V: Dict[str, float]) -> float:
        """One-step lookahead value of taking ``action`` in ``state``.

        An action with no entry under ``transition_probs[state]`` is treated
        as having no outcomes, so its value is just ``rewards[state]``.
        """
        outcomes = self.mdp.transition_probs[state].get(action, [])
        expected_next = sum(prob * V[next_state] for next_state, prob in outcomes)
        return self.mdp.rewards[state] + self.mdp.discount_factor * expected_next


In [17]:
class PolicyIteration:
    """Policy-iteration solver for an MDP.

    Alternates iterative policy evaluation with greedy policy improvement,
    starting from a randomly drawn policy, until improvement finds nothing
    better than the current policy.

    Args:
        mdp: Object exposing ``states``, ``actions``, ``transition_probs``,
            ``rewards`` and ``discount_factor``.
        threshold: Tolerance used both to stop policy evaluation and to decide
            whether a greedy action is a real improvement.
        seed: Optional seed for the random initial policy. With ``None`` (the
            default) the draw uses NumPy's global random state.

    NOTE(review): the evaluation loop has no sweep cap; presumably callers use
    ``discount_factor < 1`` and ``threshold > 0`` so it ends - confirm before
    using other settings.
    """

    # The annotation is quoted so it is resolved lazily: defining this class
    # does not require MarkovDecisionProcess to exist yet.
    def __init__(self, mdp: "MarkovDecisionProcess", threshold: float = 1e-5,
                 seed: Optional[int] = None):
        self.mdp = mdp
        self.threshold = threshold
        self.seed = seed

    def run(self) -> Dict[str, str]:
        """Run evaluation/improvement rounds and return the final policy."""
        policy = self._initial_policy()

        while True:
            V = self._policy_evaluation(policy)
            if self._policy_improvement(policy, V):
                break

        return policy

    def _initial_policy(self) -> Dict[str, str]:
        """Draw one action per state uniformly at random.

        An index is drawn rather than a label, so the policy holds the very
        objects listed in ``mdp.actions`` instead of NumPy scalar copies.
        """
        rng = np.random if self.seed is None else np.random.default_rng(self.seed)
        actions = self.mdp.actions
        return {
            state: actions[int(rng.choice(len(actions)))]
            for state in self.mdp.states
        }

    def _policy_evaluation(self, policy: Dict[str, str]) -> Dict[str, float]:
        """Estimate the value of every state under ``policy``.

        Values start at zero on every call and are updated in place, sweep
        after sweep, until the largest change in a sweep is below
        ``threshold``.
        """
        V = {state: 0.0 for state in self.mdp.states}

        while True:
            delta = 0.0
            for state in self.mdp.states:
                old_v = V[state]
                V[state] = self._calculate_state_value(state, policy[state], V)
                delta = max(delta, abs(old_v - V[state]))

            if delta < self.threshold:
                break

        return V

    def _policy_improvement(self, policy: Dict[str, str], V: Dict[str, float]) -> bool:
        """Make ``policy`` greedy with respect to ``V`` (mutates ``policy``).

        Every state is set to its best action; ties go to the action listed
        first in ``mdp.actions``.

        Returns:
            ``True`` when no state's greedy action beats the action it had on
            entry by more than ``threshold``. Judging stability by value
            rather than by label means a switch between equally good actions
            does not trigger another evaluation round, and cannot keep the
            outer loop flip-flopping.
        """
        policy_stable = True
        actions = self.mdp.actions

        for state in self.mdp.states:
            previous_value = self._calculate_action_value(state, policy[state], V)
            values = [self._calculate_action_value(state, a, V) for a in actions]
            best_index = max(range(len(values)), key=values.__getitem__)

            if values[best_index] - previous_value > self.threshold:
                policy_stable = False
            policy[state] = actions[best_index]

        return policy_stable

    def _calculate_state_value(self, state: str, action: str, V: Dict[str, float]) -> float:
        """Value of ``state`` when the policy prescribes ``action`` there.

        Same one-step backup as ``_calculate_action_value``; kept as a
        separate name for callers and delegated to avoid duplicated logic.
        """
        return self._calculate_action_value(state, action, V)

    def _calculate_action_value(self, state: str, action: str, V: Dict[str, float]) -> float:
        """One-step lookahead value of taking ``action`` in ``state``.

        Computes ``rewards[state] + discount_factor * sum(prob * V[next])``.
        An action with no entry under ``transition_probs[state]`` is treated
        as having no outcomes, so its value is just ``rewards[state]``.
        """
        outcomes = self.mdp.transition_probs[state].get(action, [])
        expected_next = sum(prob * V[next_state] for next_state, prob in outcomes)
        return self.mdp.rewards[state] + self.mdp.discount_factor * expected_next

In [18]:
# State labels of the MDP; passed to MarkovDecisionProcess as `states` below.
states = ["Hostel", "Academic_Building", "Canteen"]

In [19]:
# Action labels; both solvers consider this same list of actions in every state.
actions = ["Class", "Eat"]

In [20]:
# Transition model: transition_probs[state][action] is a list of
# (next_state, probability) pairs. The probabilities in each list add up to 1.
transition_probs = {
    "Hostel": {
        "Class": [("Hostel", 0.5), ("Academic_Building", 0.5)],
        "Eat": [("Canteen", 1.0)]
    },
    "Academic_Building": {
        "Class": [("Academic_Building", 0.7), ("Canteen", 0.3)],
        "Eat": [("Canteen", 0.8), ("Academic_Building", 0.2)]
    },
    "Canteen": {
        "Class": [("Academic_Building", 0.6), ("Hostel", 0.3), ("Canteen", 0.1)],
        "Eat": [("Canteen", 1.0)]
    }
}

In [21]:
# Reward attached to each state. The solvers look it up by the current state
# only (rewards[state]), so it does not depend on the action taken.
rewards = {
    "Hostel": -1,
    "Academic_Building": 3,
    "Canteen": 1
}

In [22]:
# Discount factor: the weight the solvers apply to the expected next-state value.
discount_factor = 0.9

In [23]:
# Bundle the pieces defined above into one MDP object shared by both solvers.
mdp = MarkovDecisionProcess(states, actions, transition_probs, rewards, discount_factor)

In [24]:
# Run Value Iteration with the default convergence threshold (1e-5).
# run() returns the state values together with the greedy policy.
vi = ValueIteration(mdp)
optimal_values, optimal_policy_value_iteration = vi.run()
print("Optimal Values from Value Iteration:", optimal_values)
print("Optimal Policy from Value Iteration:", optimal_policy_value_iteration)


Optimal Values from Value Iteration: {'Hostel': 16.05617127561447, 'Academic_Building': 21.846450338494776, 'Canteen': 18.826646891106996}
Optimal Policy from Value Iteration: {'Hostel': 'Class', 'Academic_Building': 'Class', 'Canteen': 'Class'}


In [25]:
# Run Policy Iteration with the default threshold (1e-5).
# run() starts from a randomly drawn policy and returns only the final policy.
pi = PolicyIteration(mdp)
optimal_policy_policy_iteration = pi.run()
print("Optimal Policy from Policy Iteration:", optimal_policy_policy_iteration)

Optimal Policy from Policy Iteration: {'Hostel': 'Class', 'Academic_Building': 'Class', 'Canteen': 'Class'}
