# In [28]:
import numpy as np

class MDP:
    """Finite Markov decision process solved by value iteration on construction.

    Creating an instance immediately runs value iteration and writes the
    result to ``sol-{mdptype}-mdp-{S}-{A}.txt`` in the current working
    directory, one ``"<value rounded to 6 places> <action>"`` line per state.

    Args:
        S: Number of states.
        A: Number of actions.
        gamma: Discount factor applied to the value of the successor state.
        mdptype: ``"continuing"`` selects ``continuing_mdp_solving``; any
            other value selects ``episodic_mdp_solving``.
        endstates: Iterable of terminal state indices. Entries may be ints or
            numeric strings (e.g. tokens straight from ``str.split``). Only
            consulted by the episodic solver.
        transitionprob: Array-like indexed ``[s][a][s']`` holding the
            probability of moving from ``s`` to ``s'`` under action ``a``.
        rewards: Array-like indexed ``[s][a][s']`` holding the reward of that
            transition.

    After solving, ``valuef`` holds the state values, ``optimact`` the greedy
    action per state, ``ep`` the convergence threshold and ``diff`` the
    largest value change seen in the final sweep.
    """

    def __init__(self, S, A, gamma, mdptype, endstates, transitionprob, rewards):
        self.S = S  # number of states
        self.A = A  # number of actions
        self.gamma = gamma  # discount factor
        self.mdptype = mdptype
        self.endstates = endstates
        self.transitionprob = transitionprob  # transition probabilities [s][a][s']
        self.rewards = rewards  # transition rewards [s][a][s']

        if mdptype == "continuing":
            self.continuing_mdp_solving()
        else:
            self.episodic_mdp_solving()

    def continuing_mdp_solving(self):
        """Run value iteration with no terminal states and write the solution file."""
        self._value_iteration(terminal_states=())
        self._write_solution()

    def episodic_mdp_solving(self):
        """Run value iteration with ``endstates`` held at value 0 / action 0.

        The solution file is written afterwards.
        """
        # End states frequently arrive as string tokens read from a file;
        # normalise to ints so the membership test against integer state
        # indices actually matches.
        terminal = {int(state) for state in self.endstates}
        self._value_iteration(terminal_states=terminal)
        self._write_solution()

    def _value_iteration(self, terminal_states):
        """Iterate synchronous Bellman optimality backups until convergence.

        Every sweep computes all new state values from the previous sweep's
        values. The loop has no iteration cap: it stops only once the largest
        absolute change in any state value is below ``self.ep``.

        Args:
            terminal_states: Container of integer state indices whose value
                and action are pinned to 0 on every sweep.
        """
        prob = np.asarray(self.transitionprob, dtype=float)
        reward = np.asarray(self.rewards, dtype=float)

        # Expected immediate reward per (state, action). It does not depend on
        # the value estimate, so compute it once. Rewards attached to
        # zero-probability transitions are masked out so they cannot
        # contribute, whatever they contain.
        expected_reward = (prob * np.where(prob != 0, reward, 0.0)).sum(axis=2)

        # Build the mask by membership rather than fancy indexing so that
        # out-of-range entries (e.g. a -1 placeholder) never match any state.
        terminal_mask = np.array(
            [state in terminal_states for state in range(self.S)], dtype=bool
        )

        self.ep = 1e-9  # convergence threshold on the max value change
        self.valuef = np.zeros(self.S)
        self.optimact = np.zeros(self.S, dtype=int)

        while True:
            # q_values[s, a] = sum_s' P[s,a,s'] * (R[s,a,s'] + gamma * V[s'])
            q_values = expected_reward + self.gamma * (prob @ self.valuef)
            new_values = q_values.max(axis=1)
            new_policy = q_values.argmax(axis=1).astype(int)  # first max wins ties
            new_values[terminal_mask] = 0.0
            new_policy[terminal_mask] = 0

            # `initial` keeps the reduction defined when there are no states.
            self.diff = float(
                np.max(np.abs(new_values - self.valuef), initial=0.0)
            )
            self.valuef = new_values
            self.optimact = new_policy

            if self.diff < self.ep:
                break

    def _write_solution(self):
        """Write one ``"<rounded value> <action>"`` line per state to the solution file."""
        output_file = f"sol-{self.mdptype}-mdp-{self.S}-{self.A}.txt"
        with open(output_file, 'w') as outfile:
            outfile.writelines(
                f"{np.round(value, 6)} {action}\n"
                for value, action in zip(self.valuef, self.optimact)
            )

    def read_mdp_file(self, file_path):
        """Return the lines of the text file at ``file_path`` as a list of strings."""
        with open(file_path, 'r') as handle:
            return handle.readlines()


def parse_mdp_lines(lines):
    """Parse the lines of an MDP description file into constructor arguments.

    The layout is positional, after blank lines are discarded:

    * line 0: ``<label> S``, the number of states
    * line 1: ``<label> A``, the number of actions
    * line 2: ``<label> e1 e2 ...``, the end-state indices
    * middle lines: ``<label> s a s' reward probability``
    * second-to-last line: ``<label> mdptype``
    * last line: ``<label> gamma``

    The leading label token of every line is ignored.

    Args:
        lines: Iterable of text lines, with or without trailing newlines.

    Returns:
        Tuple ``(S, A, gamma, mdptype, endstates, transitionprob, rewards)``
        where ``endstates`` is a list of ints and the two arrays have shape
        ``(S, A, S)``, zero wherever the file lists no transition.
    """
    # Split every line once, and drop blank lines so a trailing newline at the
    # end of the file does not shift the negative indices used below.
    fields = [line.split() for line in lines if line.strip()]

    S = int(fields[0][1])
    A = int(fields[1][1])
    # Convert to ints so membership tests against integer state indices work.
    endstates = [int(token) for token in fields[2][1:]]

    transitionprob = np.zeros((S, A, S))
    rewards = np.zeros((S, A, S))
    for parts in fields[3:-2]:
        state, action, next_state = int(parts[1]), int(parts[2]), int(parts[3])
        rewards[state][action][next_state] = float(parts[4])
        transitionprob[state][action][next_state] = float(parts[5])

    mdptype = fields[-2][1]
    gamma = float(fields[-1][1])
    return S, A, gamma, mdptype, endstates, transitionprob, rewards


if __name__ == "__main__":
    input_files = [
        "/Users/nehavpedgaonkar/Downloads/data 2/continuing-mdp-2-2.txt",
        "/Users/nehavpedgaonkar/Downloads/data 2/continuing-mdp-10-5.txt",
        "/Users/nehavpedgaonkar/Downloads/data 2/continuing-mdp-50-20.txt",
        "/Users/nehavpedgaonkar/Downloads/data 2/episodic-mdp-2-2.txt",
        "/Users/nehavpedgaonkar/Downloads/data 2/episodic-mdp-10-5.txt",
        "/Users/nehavpedgaonkar/Downloads/data 2/episodic-mdp-50-20.txt"
    ]
    for input_file in input_files:
        with open(input_file, 'r') as file:  # read mode
            lines = file.readlines()

        S, A, gamma, mdptype, endstates, transitionprob, rewards = parse_mdp_lines(lines)

        # Constructing the MDP solves it and writes its solution file.
        mdp = MDP(S, A, gamma, mdptype, endstates, transitionprob, rewards)
