In [None]:
import numpy as np

In [None]:
class MDP:
    """Finite Markov decision process loaded from a text file.

    The model is stored as dense ``(num_states, num_actions, num_states)``
    arrays of transition probabilities and rewards, and is solved with
    value iteration. Results are written to a text file whose name is
    derived from the MDP type and its dimensions.

    Attributes set by :meth:`read_mdp_file`:
        num_states: number of states.
        num_actions: number of actions.
        end_states: list of terminal state indices (as ``int``).
        probabilities: ``probabilities[s, a, s2]`` transition probability.
        rewards: ``rewards[s, a, s2]`` reward for that transition.
        mdptype: second token of the penultimate non-blank line.
        discount: discount factor read from the last non-blank line.
    """

    def __init__(self, file_path):
        """Build the MDP by parsing ``file_path``."""
        self.read_mdp_file(file_path)

    def read_mdp_file(self, file_path):
        """Parse an MDP description and populate the instance attributes.

        The first whitespace-separated token of every line is skipped
        (presumably a keyword label); the remaining tokens are read by
        position:

        * line 0: number of states
        * line 1: number of actions
        * line 2: terminal state indices
        * middle lines: ``s1 a s2 reward probability``
        * penultimate line: MDP type
        * last line: discount factor

        Blank lines are ignored.
        """
        with open(file_path, 'r') as file:
            # Drop blank lines so a trailing newline cannot shift the
            # positional indexing of the two footer lines.
            lines = [line for line in file if line.strip()]

        self.num_states = int(lines[0].split()[1])
        self.num_actions = int(lines[1].split()[1])
        # Stored as ints so that membership tests against integer state
        # indices actually match (string tokens never equal an int).
        self.end_states = [int(token) for token in lines[2].split()[1:]]

        shape = (self.num_states, self.num_actions, self.num_states)
        self.probabilities = np.zeros(shape)
        self.rewards = np.zeros(shape)

        for line in lines[3:-2]:
            fields = line.split()
            s1, a, s2 = int(fields[1]), int(fields[2]), int(fields[3])
            self.rewards[s1, a, s2] = float(fields[4])
            self.probabilities[s1, a, s2] = float(fields[5])

        self.mdptype = lines[-2].split()[1]
        self.discount = float(lines[-1].split()[1])

    def _value_iteration(self, terminal_states=(), epsilon=1e-6):
        """Run synchronous value iteration and return the value function.

        Args:
            terminal_states: state indices whose value is pinned to 0.
                Indices outside ``[0, num_states)`` are ignored.
            epsilon: stop once the largest per-state change in a sweep
                is below this threshold.
        """
        # Filter out-of-range markers; a negative index would otherwise
        # wrap around and zero a real state.
        terminal = sorted(
            {s for s in terminal_states if 0 <= s < self.num_states}
        )
        val_func = np.zeros(self.num_states)

        while True:
            updated = self.compute_q_values(val_func).max(axis=1)
            if terminal:
                updated[terminal] = 0.0
            difference = np.max(np.abs(updated - val_func), initial=0.0)
            val_func = updated
            if difference < epsilon:
                return val_func

    def solve_continuous_mdp(self, epsilon=1e-6):
        """Solve a continuing MDP and write values and greedy policy to disk.

        Args:
            epsilon: convergence threshold for value iteration.
        """
        val_func = self._value_iteration(epsilon=epsilon)
        policy = np.argmax(self.compute_q_values(val_func), axis=1)
        self.write_output(val_func, policy)

    def solve_episodic_mdp(self, epsilon=1e-6):
        """Solve an episodic MDP and write values and greedy policy to disk.

        States listed in ``end_states`` keep a value of 0 throughout.

        Args:
            epsilon: convergence threshold for value iteration.
        """
        val_func = self._value_iteration(self.end_states, epsilon=epsilon)
        policy = np.argmax(self.compute_q_values(val_func), axis=1)
        self.write_output(val_func, policy)

    def compute_q_values(self, val_func):
        """Return the ``(num_states, num_actions)`` array of action values.

        ``Q[s, a] = sum_s2 P[s, a, s2] * (R[s, a, s2] + discount * V[s2])``
        """
        # val_func broadcasts along the last (next-state) axis.
        backed_up = self.rewards + self.discount * np.asarray(val_func)
        return np.sum(self.probabilities * backed_up, axis=2)

    def write_output(self, val_func, policy):
        """Write one ``"<value> <action>"`` line per state.

        Values are rounded to six decimals. The file is created in the
        current working directory and is named
        ``output-<mdptype>-mdp-<num_states>-<num_actions>.txt``.
        """
        output_file = f"output-{self.mdptype}-mdp-{self.num_states}-{self.num_actions}.txt"
        with open(output_file, 'w') as outfile:
            outfile.writelines(
                f"{np.round(val_func[i], 6)} {policy[i]}\n"
                for i in range(self.num_states)
            )

In [None]:
if __name__ == "__main__":
    # Solve each bundled MDP instance and write its output file.
    input_files = ["continuing-mdp-50-20.txt", "episodic-mdp-50-20.txt","continuing-mdp-2-2.txt"]
    for input_file in input_files:
        # The constructor already parses the file, so no separate
        # read_mdp_file() call is needed here.
        mdp = MDP(input_file)

        # Call appropriate solving function based on mdptype
        if mdp.mdptype == "continuing":
            mdp.solve_continuous_mdp()
        else:
            mdp.solve_episodic_mdp()