<a href="https://colab.research.google.com/github/Mehul-Agrawal410/AgentJackie_WiDS_2023/blob/main/Week_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

def parse_mdp_file(file_path):
    """Parse a whitespace-delimited MDP description file.

    Each non-blank line starts with a keyword. The keywords handled are:

        numStates <int>
        numActions <int>
        discount <float>
        transition <state> <action> <next_state> <reward> <probability>

    Lines starting with any other keyword are ignored, as are blank lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A tuple ``(num_states, num_actions, mdp_data, discount)`` where
        ``mdp_data[state][action]`` is a list of
        ``(next_state, reward, probability)`` tuples in file order. State,
        action and next-state indices are stored as ``int``.

    Raises:
        ValueError: If a ``transition`` line does not carry exactly five
            numeric fields, or if ``numStates``, ``numActions`` or
            ``discount`` never appears in the file.
    """
    mdp_data = {}
    num_states = num_actions = discount = None

    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Blank (or whitespace-only) line: nothing to dispatch on.
                continue

            keyword = parts[0]
            if keyword == 'transition':
                # Unpacking enforces exactly five fields after the keyword.
                s, a, s_next, reward, probability = map(float, parts[1:])
                # Indices are used as dict keys / array subscripts, so keep
                # them as ints rather than floats.
                state, action, next_state = int(s), int(a), int(s_next)
                outcomes = mdp_data.setdefault(state, {}).setdefault(action, [])
                outcomes.append((next_state, reward, probability))
            elif keyword == 'numStates':
                num_states = int(parts[1])
            elif keyword == 'numActions':
                num_actions = int(parts[1])
            elif keyword == 'discount':
                discount = float(parts[1])

    required = (
        ('numStates', num_states),
        ('numActions', num_actions),
        ('discount', discount),
    )
    missing = [name for name, value in required if value is None]
    if missing:
        raise ValueError(
            f"{file_path}: missing required line(s): {', '.join(missing)}"
        )

    return num_states, num_actions, mdp_data, discount

def _action_values(s, num_actions, mdp_data, discount, V):
    """Return the one-step lookahead value of every action in state ``s``.

    An action with no recorded transitions for ``s`` gets the value 0.
    """
    transitions_by_action = mdp_data.get(s, {})
    return [
        sum(
            probability * (reward + discount * V[int(next_state)])
            for next_state, reward, probability in transitions_by_action.get(a, ())
        )
        for a in range(num_actions)
    ]


def value_iteration(num_states, num_actions, mdp_data, discount, epsilon=1e-6, max_iterations=1000):
    """Compute state values and a greedy policy by value iteration.

    Values are updated in place during each sweep, so later states in a
    sweep already see the new values of earlier states.

    Args:
        num_states: Number of states; states are indexed ``0..num_states-1``.
        num_actions: Number of actions; actions are indexed
            ``0..num_actions-1``.
        mdp_data: Mapping ``state -> action -> list of
            (next_state, reward, probability)``.
        discount: Discount factor applied to the next state's value.
        epsilon: Sweeping stops once the largest value change in a sweep
            is below this threshold.
        max_iterations: Upper bound on the number of sweeps.

    Returns:
        ``(V, policy)``: ``V`` is a float array of state values and
        ``policy`` an int array holding, per state, the lowest-indexed
        action that maximises the one-step lookahead under ``V``.
    """
    V = np.zeros(num_states)

    for _ in range(max_iterations):
        delta = 0
        for s in range(num_states):
            previous = V[s]
            V[s] = max(_action_values(s, num_actions, mdp_data, discount, V))
            delta = max(delta, abs(previous - V[s]))

        if delta < epsilon:
            break

    # Policy extraction: act greedily with respect to the final values.
    policy = np.zeros(num_states, dtype=int)
    for s in range(num_states):
        policy[s] = np.argmax(_action_values(s, num_actions, mdp_data, discount, V))

    return V, policy

In [2]:
# Load the MDP stored in 'episodic-mdp-2-2.txt', solve it with value
# iteration, and report the resulting value and action for every state.
file_path = 'episodic-mdp-2-2.txt'

num_states, num_actions, mdp_data, discount = parse_mdp_file(file_path)
V_star, pi_star = value_iteration(
    num_states=num_states,
    num_actions=num_actions,
    mdp_data=mdp_data,
    discount=discount,
)

# One line per state: value to six decimals, a tab, then the chosen action.
for s in range(num_states):
    print(f'V*({s}) = {V_star[s]:.6f}\tπ*({s}) = {pi_star[s]}')

V*(0) = 0.000000	π*(0) = 0
V*(1) = 1.455816	π*(1) = 0


In [3]:
# Load the MDP stored in 'episodic-mdp-10-5.txt', solve it with value
# iteration, and report the resulting value and action for every state.
file_path = 'episodic-mdp-10-5.txt'

num_states, num_actions, mdp_data, discount = parse_mdp_file(file_path)
V_star, pi_star = value_iteration(
    num_states=num_states,
    num_actions=num_actions,
    mdp_data=mdp_data,
    discount=discount,
)

# One line per state: value to six decimals, a tab, then the chosen action.
for s in range(num_states):
    print(f'V*({s}) = {V_star[s]:.6f}\tπ*({s}) = {pi_star[s]}')

V*(0) = 0.000000	π*(0) = 0
V*(1) = 412.583851	π*(1) = 3
V*(2) = 412.877702	π*(2) = 4
V*(3) = 392.533051	π*(3) = 2
V*(4) = 367.978651	π*(4) = 1
V*(5) = 0.000000	π*(5) = 0
V*(6) = 410.142941	π*(6) = 2
V*(7) = 403.361613	π*(7) = 2
V*(8) = 275.520214	π*(8) = 4
V*(9) = 411.815005	π*(9) = 0


In [4]:
# Load the MDP stored in 'episodic-mdp-50-20.txt', solve it with value
# iteration, and report the resulting value and action for every state.
file_path = 'episodic-mdp-50-20.txt'

num_states, num_actions, mdp_data, discount = parse_mdp_file(file_path)
V_star, pi_star = value_iteration(
    num_states=num_states,
    num_actions=num_actions,
    mdp_data=mdp_data,
    discount=discount,
)

# One line per state: value to six decimals, a tab, then the chosen action.
for s in range(num_states):
    print(f'V*({s}) = {V_star[s]:.6f}\tπ*({s}) = {pi_star[s]}')

V*(0) = 7.985539	π*(0) = 16
V*(1) = 7.837293	π*(1) = 9
V*(2) = 0.000000	π*(2) = 0
V*(3) = 7.664214	π*(3) = 18
V*(4) = 7.830739	π*(4) = 15
V*(5) = 7.826876	π*(5) = 12
V*(6) = 7.943425	π*(6) = 10
V*(7) = 8.261767	π*(7) = 4
V*(8) = 7.869689	π*(8) = 14
V*(9) = 8.348369	π*(9) = 5
V*(10) = 7.711353	π*(10) = 11
V*(11) = 7.775428	π*(11) = 0
V*(12) = 7.914739	π*(12) = 17
V*(13) = 8.006130	π*(13) = 16
V*(14) = 8.101705	π*(14) = 0
V*(15) = 8.089335	π*(15) = 15
V*(16) = 0.000000	π*(16) = 0
V*(17) = 7.652555	π*(17) = 9
V*(18) = 8.124856	π*(18) = 4
V*(19) = 7.843159	π*(19) = 15
V*(20) = 8.415758	π*(20) = 12
V*(21) = 7.321338	π*(21) = 9
V*(22) = 7.627953	π*(22) = 2
V*(23) = 7.984526	π*(23) = 7
V*(24) = 7.708907	π*(24) = 13
V*(25) = 7.777013	π*(25) = 10
V*(26) = 8.089615	π*(26) = 15
V*(27) = 5.340500	π*(27) = 18
V*(28) = 8.238759	π*(28) = 19
V*(29) = 7.855449	π*(29) = 6
V*(30) = 7.457376	π*(30) = 3
V*(31) = 7.829690	π*(31) = 0
V*(32) = 0.000000	π*(32) = 0
V*(33) = 7.660099	π*(33) = 17
V*(34) = 0.00000

In [5]:
# Load the MDP stored in 'continuing-mdp-2-2.txt', solve it with value
# iteration, and report the resulting value and action for every state.
file_path = 'continuing-mdp-2-2.txt'

num_states, num_actions, mdp_data, discount = parse_mdp_file(file_path)
V_star, pi_star = value_iteration(
    num_states=num_states,
    num_actions=num_actions,
    mdp_data=mdp_data,
    discount=discount,
)

# One line per state: value to six decimals, a tab, then the chosen action.
for s in range(num_states):
    print(f'V*({s}) = {V_star[s]:.6f}\tπ*({s}) = {pi_star[s]}')

V*(0) = 5.999276	π*(0) = 0
V*(1) = 5.918427	π*(1) = 0


In [6]:
# Load the MDP stored in 'continuing-mdp-10-5.txt', solve it with value
# iteration, and report the resulting value and action for every state.
file_path = 'continuing-mdp-10-5.txt'

num_states, num_actions, mdp_data, discount = parse_mdp_file(file_path)
V_star, pi_star = value_iteration(
    num_states=num_states,
    num_actions=num_actions,
    mdp_data=mdp_data,
    discount=discount,
)

# One line per state: value to six decimals, a tab, then the chosen action.
for s in range(num_states):
    print(f'V*({s}) = {V_star[s]:.6f}\tπ*({s}) = {pi_star[s]}')

V*(0) = 2.234956	π*(0) = 3
V*(1) = 2.373610	π*(1) = 3
V*(2) = 2.604045	π*(2) = 3
V*(3) = 2.647782	π*(3) = 1
V*(4) = 2.522229	π*(4) = 4
V*(5) = 2.375250	π*(5) = 0
V*(6) = 2.684804	π*(6) = 2
V*(7) = 2.688308	π*(7) = 0
V*(8) = 2.640807	π*(8) = 3
V*(9) = 2.572426	π*(9) = 1


In [7]:
# Load the MDP stored in 'continuing-mdp-50-20.txt', solve it with value
# iteration, and report the resulting value and action for every state.
file_path = 'continuing-mdp-50-20.txt'

num_states, num_actions, mdp_data, discount = parse_mdp_file(file_path)
V_star, pi_star = value_iteration(
    num_states=num_states,
    num_actions=num_actions,
    mdp_data=mdp_data,
    discount=discount,
)

# One line per state: value to six decimals, a tab, then the chosen action.
for s in range(num_states):
    print(f'V*({s}) = {V_star[s]:.6f}\tπ*({s}) = {pi_star[s]}')

V*(0) = 1.065079	π*(0) = 7
V*(1) = 1.051696	π*(1) = 2
V*(2) = 0.824259	π*(2) = 7
V*(3) = 0.601320	π*(3) = 14
V*(4) = 1.057797	π*(4) = 4
V*(5) = 0.980877	π*(5) = 19
V*(6) = 0.983041	π*(6) = 18
V*(7) = 1.002595	π*(7) = 5
V*(8) = 0.886921	π*(8) = 15
V*(9) = 0.837798	π*(9) = 8
V*(10) = 1.109280	π*(10) = 8
V*(11) = 0.910305	π*(11) = 19
V*(12) = 1.155357	π*(12) = 7
V*(13) = 0.958098	π*(13) = 8
V*(14) = 0.772395	π*(14) = 18
V*(15) = 1.218694	π*(15) = 16
V*(16) = 0.939597	π*(16) = 11
V*(17) = 0.840961	π*(17) = 19
V*(18) = 0.934034	π*(18) = 2
V*(19) = 0.899851	π*(19) = 12
V*(20) = 1.168103	π*(20) = 14
V*(21) = 0.985183	π*(21) = 19
V*(22) = 1.032489	π*(22) = 14
V*(23) = 1.110618	π*(23) = 15
V*(24) = 0.779151	π*(24) = 0
V*(25) = 0.945382	π*(25) = 1
V*(26) = 1.185461	π*(26) = 3
V*(27) = 1.083733	π*(27) = 18
V*(28) = 0.697620	π*(28) = 15
V*(29) = 1.125198	π*(29) = 5
V*(30) = 0.556266	π*(30) = 1
V*(31) = 1.088646	π*(31) = 6
V*(32) = 0.829482	π*(32) = 11
V*(33) = 0.884322	π*(33) = 6
V*(34) = 1.180251