In [None]:
import numpy as np

class MDP:
    """Table-backed description of a Markov decision process.

    The constructor keeps its five arguments, unchanged and uncopied, as
    attributes of the same names.  Both accessors index their table as
    ``table[state][action]``; the solvers in this file treat the transition
    entry as a mapping iterated with ``.items()`` into
    ``(next_state, probability)`` pairs.
    """

    def __init__(self, states, actions, transition_probabilities, rewards, discount_factor):
        # No validation or copying: the caller's objects are stored as given.
        self.states, self.actions = states, actions
        self.transition_probabilities = transition_probabilities
        self.rewards = rewards
        self.discount_factor = discount_factor

    def get_transition_probabilities(self, state, action):
        """Return the ``transition_probabilities[state][action]`` entry."""
        by_action = self.transition_probabilities[state]
        return by_action[action]

    def get_reward(self, state, action):
        """Return the ``rewards[state][action]`` entry."""
        by_action = self.rewards[state]
        return by_action[action]

def value_iteration(mdp, epsilon=1e-6):
    """Run value iteration on ``mdp`` and read off a greedy policy.

    Every state value starts at 0.  Values are overwritten in place while a
    sweep is running, so states visited later in a sweep already use the
    values refreshed earlier in that sweep.  Sweeping stops once the largest
    absolute change seen in a sweep is below ``epsilon``.

    Args:
        mdp: Object exposing ``states``, ``actions``, ``discount_factor``,
            ``get_reward(state, action)`` and
            ``get_transition_probabilities(state, action)``; the latter must
            return a mapping whose ``.items()`` yields
            ``(next_state, probability)`` pairs.
        epsilon: Threshold on the per-sweep maximum value change.

    Returns:
        ``(V, policy)``: ``V`` maps each state to its final value and
        ``policy`` maps each state to the action with the largest backed-up
        value under ``V``.  Ties go to the action listed first in
        ``mdp.actions``; with no actions the policy entry is ``None``.
    """
    values = {s: 0 for s in mdp.states}

    def backed_up_value(s, a):
        # One-step lookahead for (s, a) against the *current* value table.
        return sum(
            p * (mdp.get_reward(s, a) + mdp.discount_factor * values[s_next])
            for s_next, p in mdp.get_transition_probabilities(s, a).items()
        )

    converged = False
    while not converged:
        largest_change = 0
        for s in mdp.states:
            best = float('-inf')
            for a in mdp.actions:
                candidate = backed_up_value(s, a)
                if candidate > best:
                    best = candidate

            largest_change = max(largest_change, abs(best - values[s]))
            values[s] = best

        converged = largest_change < epsilon

    # Greedy policy extraction; strict '>' keeps the earliest maximiser.
    policy = {}
    for s in mdp.states:
        chosen, chosen_value = None, float('-inf')
        for a in mdp.actions:
            candidate = backed_up_value(s, a)
            if candidate > chosen_value:
                chosen, chosen_value = a, candidate
        policy[s] = chosen

    return values, policy


def policy_evaluation(policy, mdp, V, epsilon=1e-6):
    """Evaluate a fixed ``policy`` by repeated in-place sweeps over ``V``.

    For every state the action prescribed by ``policy`` is backed up one step
    against the current contents of ``V`` and the result is written straight
    back, so later states in a sweep see values refreshed earlier in the same
    sweep.  Sweeps repeat until the largest absolute change in a sweep is
    below ``epsilon``.

    Args:
        policy: Mapping from state to the action to evaluate.
        mdp: Object exposing ``states``, ``discount_factor``,
            ``get_reward(state, action)`` and
            ``get_transition_probabilities(state, action)`` (a mapping of
            next state to probability).
        V: Mapping from state to its starting value.  It is MODIFIED in
            place.
        epsilon: Threshold on the per-sweep maximum value change.

    Returns:
        The same ``V`` object that was passed in, now holding the evaluated
        values.
    """
    settled = False
    while not settled:
        biggest_change = 0
        for state in mdp.states:
            chosen = policy[state]
            successors = mdp.get_transition_probabilities(state, chosen)
            refreshed = sum(
                prob * (mdp.get_reward(state, chosen) + mdp.discount_factor * V[nxt])
                for nxt, prob in successors.items()
            )
            biggest_change = max(biggest_change, abs(refreshed - V[state]))
            V[state] = refreshed

        settled = biggest_change < epsilon
    return V

def policy_iteration(mdp, epsilon=1e-6):
    """Solve ``mdp`` by alternating policy evaluation and greedy improvement.

    Starting from a uniformly random policy and an all-zero value table, the
    loop evaluates the current policy with ``policy_evaluation`` and then
    makes the policy greedy with respect to the resulting values.  It stops
    after the first improvement pass that leaves every state's action
    unchanged.

    Args:
        mdp: Object exposing ``states``, ``actions``, ``discount_factor``,
            ``get_reward(state, action)`` and
            ``get_transition_probabilities(state, action)`` (a mapping of
            next state to probability).  Actions must be hashable.
        epsilon: Convergence threshold forwarded to ``policy_evaluation``.
            The default matches that function's own default.

    Returns:
        ``(V, policy)``: the value table of the final policy and the policy
        itself, both keyed by state.
    """
    actions = list(mdp.actions)

    # Draw a random *index* instead of calling np.random.choice on the action
    # list: choice() first converts its argument to a NumPy array, which
    # rejects tuple-valued actions ("a must be 1-dimensional") and stringifies
    # mixed-type labels so they no longer match the MDP's table keys.
    policy = {state: actions[np.random.randint(len(actions))] for state in mdp.states}

    V = {state: 0 for state in mdp.states}

    while True:
        # Policy evaluation (warm-started from the previous value table).
        V = policy_evaluation(policy, mdp, V, epsilon=epsilon)

        policy_stable = True

        # Policy improvement: act greedily with respect to V.
        for state in mdp.states:
            old_action = policy[state]
            action_values = {
                action: sum(
                    prob * (mdp.get_reward(state, action) + mdp.discount_factor * V[next_state])
                    for next_state, prob in mdp.get_transition_probabilities(state, action).items()
                )
                for action in actions
            }
            # max() returns the first maximal key, so ties resolve to the
            # action that appears earliest in mdp.actions.
            best_action = max(action_values, key=action_values.get)

            if best_action != old_action:
                policy_stable = False

            policy[state] = best_action

        if policy_stable:
            break

    return V, policy

# Three-state example problem: state/action labels, transition table,
# reward table and discount factor.
states = ['Hostel', 'Academic-Building', 'Canteen']
actions = ['attend', 'hungry']

# transition_probabilities[state][action] -> {next_state: probability}
transition_probabilities = {
    'Hostel': {
        'attend': {'Hostel': 0.5, 'Academic-Building': 0.5},
        'hungry': {'Canteen': 1.0},
    },
    'Academic-Building': {
        'attend': {'Academic-Building': 0.7, 'Canteen': 0.3},
        'hungry': {'Academic-Building': 0.2, 'Canteen': 0.8},
    },
    'Canteen': {
        'attend': {'Hostel': 0.3, 'Academic-Building': 0.6, 'Canteen': 0.1},
        'hungry': {'Canteen': 1},
    },
}

# rewards[state][action]; in this example the reward is the same for both
# actions of a state.
rewards = {
    'Hostel': {'attend': -1, 'hungry': -1},
    'Academic-Building': {'attend': 3, 'hungry': 3},
    'Canteen': {'attend': 1, 'hungry': 1},
}
discount_factor = 0.9

mdp = MDP(states, actions, transition_probabilities, rewards, discount_factor)

# Solve with policy iteration first, then with value iteration, printing the
# value table and policy each solver returns.
V, policy = policy_iteration(mdp)
print("SOLUTION BY POLICY ITERATION:-", end="\n\n")
print("Optimal Value Function:", V)
print("Optimal Policy:", policy, end="\n\n")
print("-------------------------------------", end="\n\n")
print("SOLUTION BY VALUE ITERATION:-", end="\n\n")
V, policy = value_iteration(mdp)
print("Optimal Value Function:", V)
print("Optimal Policy:", policy)

SOLUTION BY POLICY ITERATION:-

Optimal Value Function: {'Hostel': 16.056233536137125, 'Academic-Building': 21.846509538726654, 'Canteen': 18.826701241558315}
Optimal Policy: {'Hostel': 'attend', 'Academic-Building': 'attend', 'Canteen': 'attend'}

-------------------------------------

SOLUTION BY VALUE ITERATION:-

Optimal Value Function: {'Hostel': 16.056233839779527, 'Academic-Building': 21.846509827444127, 'Canteen': 18.826701506623575}
Optimal Policy: {'Hostel': 'attend', 'Academic-Building': 'attend', 'Canteen': 'attend'}
