<a href="https://colab.research.google.com/github/Jorayala/AI_Machine_Learning_2024/blob/main/MDPs_por_medio_de_iteraci%C3%B3n_de_valores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random

# Main class definition: the MDP environment
class MDP:
    """Tabular Markov decision process driven by a transition table.

    ``table[state][action_index]`` is either ``None`` (the action is not
    available in that state) or a list of ``(probability, next_state,
    reward)`` tuples. Column ``action_index`` corresponds to
    ``self.actions[action_index]``. A state in which no action is available
    is terminal.
    """

    def __init__(self, table, dimensions, initial_state):
        """Store the table and split it into transitions and rewards.

        Args:
            table: Nested list indexed as ``table[state][action_index]``.
            dimensions: ``(nrows, ncols)`` pair, i.e. the number of states
                and the number of action columns in ``table``.
            initial_state: State the process starts in and that ``reset``
                returns to.
        """
        self.nrows, self.ncols = dimensions
        self.table = table
        self.initial_state = initial_state
        self.state = initial_state
        self.actions = ['slow', 'fast']
        # transitions[s][a] -> list of (probability, next_state)
        # rewards[s][a]     -> list of (reward, next_state)
        self.transitions = [[[] for _ in range(self.ncols)] for _ in range(self.nrows)]
        self.rewards = [[[] for _ in range(self.ncols)] for _ in range(self.nrows)]

        # Split every table entry into its transition part and its reward part.
        for i in range(self.nrows):
            for j in range(self.ncols):
                if table[i][j] is not None:
                    for prob, next_state, reward in table[i][j]:
                        self.transitions[i][j].append((prob, next_state))
                        self.rewards[i][j].append((reward, next_state))

    def get_current_state(self):
        """Return the state the process is currently in."""
        return self.state

    def get_possible_actions(self, state):
        """Return the names of the actions whose table entry for ``state`` is not ``None``."""
        return [
            self.actions[action_index]
            for action_index in range(self.ncols)
            if self.table[state][action_index] is not None
        ]

    def get_possible_states(self, action):
        """Describe the outcomes of taking ``action`` from the current state.

        Returns:
            A ``(probabilities, rewards, next_states)`` triple of parallel
            sequences, or three empty lists when the action has no
            transitions in the current state.

        Raises:
            ValueError: If ``action`` is not one of ``self.actions``.
        """
        action_index = self.actions.index(action)
        outcomes = self.table[self.state][action_index]
        if not outcomes:
            return [], [], []
        # Table tuples are (probability, next_state, reward): unpack them in
        # that order so rewards and next states are not swapped on return.
        probabilities, next_states, rewards = zip(*outcomes)
        return probabilities, rewards, next_states

    def do_action(self, action):
        """Sample one transition for ``action`` and move to the resulting state.

        Returns:
            ``(reward, next_state)``. When the current state is terminal, the
            action is unknown, or the action has no transitions, a message is
            printed and ``(None, current_state)`` is returned without moving.
        """
        if self.is_terminal():
            print("Se daño el carro.")
            return None, self.state

        if action not in self.actions:
            print("Acción no correcta.")
            return None, self.state

        action_index = self.actions.index(action)
        transitions = self.table[self.state][action_index]

        if not transitions:
            print("No hay transiciones para esta accion.")
            return None, self.state

        draw = random.random()
        # Default to the last outcome so that probabilities summing to
        # slightly less than 1 (float rounding) still select a transition
        # instead of leaving next_state/reward unbound.
        _, next_state, reward = transitions[-1]
        cumulative = 0.0
        for prob, candidate_state, candidate_reward in transitions:
            cumulative += prob
            if draw <= cumulative:
                next_state, reward = candidate_state, candidate_reward
                break

        self.state = next_state
        return reward, next_state

    def reset(self):
        """Move the process back to its initial state."""
        self.state = self.initial_state

    def is_terminal(self):
        """Return True when no action is available in the current state."""
        return not self.get_possible_actions(self.state)



In [2]:

import random

class ValueIteration:
    """Value-iteration solver over the transition table of an MDP.

    State values are kept in ``self.values`` (state -> value); states that
    have not been evaluated yet are treated as having value 0.
    """

    def __init__(self, mdp, discount=0.9, iterations=30):
        """Validate and store the solver configuration.

        Args:
            mdp: The ``MDP`` instance to solve.
            discount: Discount factor, a float in ``(0, 1]``.
            iterations: Number of sweeps ``run_value_iteration`` performs;
                a positive int.

        Raises:
            TypeError: If ``mdp`` is not an ``MDP`` instance.
            ValueError: If ``discount`` or ``iterations`` is invalid.
        """
        if not isinstance(mdp, MDP):
            raise TypeError("mdp debe ser una instancia de la clase MDP")
        if not isinstance(discount, float) or discount <= 0 or discount > 1:
            raise ValueError("discount debe ser un float entre 0 (exclusivo) y 1 (inclusivo)")
        if not isinstance(iterations, int) or iterations <= 0:
            raise ValueError("iterations debe ser un entero positivo")

        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = {}  # state -> current value estimate

    def get_value(self, state):
        """Return the current value estimate of ``state`` (0 if unknown)."""
        return self.values.get(state, 0)

    def _action_value(self, state, action):
        """Return the expected discounted return of ``action`` taken in ``state``."""
        action_index = self.mdp.actions.index(action)
        transitions = self.mdp.table[state][action_index]
        value = 0
        # Table tuples are (probability, next_state, reward).
        for prob, next_state, reward in transitions or ():
            value += prob * (reward + self.discount * self.get_value(next_state))
        return value

    def compute_new_value(self, state):
        """Return the Bellman backup of ``state``: the best action value.

        Terminal states (no available action) have value 0.
        """
        actions = self.mdp.get_possible_actions(state)
        if not actions:
            return 0
        return max(self._action_value(state, action) for action in actions)

    def get_action(self, state):
        """Return the greedy action for ``state`` under the current values.

        The action values are read from the table row of ``state`` itself,
        not from the MDP's current state, so any state can be queried.
        Ties go to the first action; ``None`` is returned for terminal states.
        """
        best_value = float('-inf')
        best_action = None
        for action in self.mdp.get_possible_actions(state):
            value = self._action_value(state, action)
            if value > best_value:
                best_value = value
                best_action = action
        return best_action

    def run_value_iteration(self):
        """Run ``self.iterations`` synchronous sweeps, printing the values after each."""
        for i in range(self.iterations):
            # Build the new table from the previous one so every state in a
            # sweep is backed up against the same set of values.
            self.values = {
                state: self.compute_new_value(state)
                for state in range(self.mdp.nrows)
            }
            print(f"Iteración {i+1}: {self.values}")


In [3]:
# Example environment used to try out the ValueIteration class.
# Rows are states 0..2 and columns are the actions ('slow', 'fast'); each
# entry is None or a list of (probability, next_state, reward) tuples.
table = [
    # state 0
    [[(1, 0, 1)], [(0.5, 0, 2), (0.5, 1, 2)]],
    # state 1
    [[(0.5, 0, 1), (0.5, 1, 1)], [(1, 2, -10)]],
    # state 2: no action available, i.e. terminal
    [None, None],
]

env = MDP(table, dimensions=(3, 2), initial_state=0)

In [4]:

# Sanity checks on the attributes of the ValueIteration instance `a`.
# NOTE(review): `a` is not created in any visible cell — presumably it is meant
# to be a ValueIteration instance; confirm where it should be defined.
# Only a missing name/attribute is reported as "not defined"; a failed
# assertion now propagates with its own message instead of being swallowed.
try:
    a.mdp
    assert type(a.mdp) is MDP, "El tipo del mdp debe ser MDP (el tipo de la clase)"
except (NameError, AttributeError):
    print("El atributo mdp no está definido")
try:
    a.discount
    assert type(a.discount) is float, "El tipo del factor de descuento debe ser float"
    assert 0 < a.discount and a.discount <=1, "El factor de descuento debe ser un valor entre 0 (excluido) y 1 (incluido)"
except (NameError, AttributeError):
    print("El atributo discount no está definido")
try:
    a.iterations
    assert type(a.iterations) is int, "El tipo de la cantidad de iteraciones debe ser entero"
except (NameError, AttributeError):
    print("El atributo iterations no está definido")

try:
    a.values
    # type(x) can never be the generic alias dict[int, float], so comparing
    # against plain dict is the whole check.
    assert type(a.values) is dict, "El tipo del mapa de valores debe ser dict (el tipo de los mapas en python). El mapa puede estar instanciado en su llave y valor si fue inicializado previamente"
except (NameError, AttributeError):
    print("El atributo values no está definido")

El atributo mdp no está definido
El atributo discount no está definido
El atributo iterations no está definido
El atributo values no está definido
