<a href="https://colab.research.google.com/github/Jorayala/AI_Machine_Learning_2024/blob/main/Construccion_de_agentes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import random

class Environment:
    """One-dimensional corridor of ``size`` cells, indexed from 0.

    Only the layout lives here; rewards and learning are handled by Learner.
    """

    def __init__(self, size=5):
        """Store the corridor length and derive the goal cell (the last index)."""
        self.size, self.goal_state = size, size - 1

    def start(self):
        """Return the initial state, which is always cell 0."""
        return 0

    def end(self):
        """Return the goal state."""
        return self.goal_state

class Agent:
    """Agent that walks forward or backward along positions 0..right_bound."""

    def __init__(self, right_bound):
        """Create the agent at position 0 with its rightmost reachable position."""
        self.reset()
        self.right_bound = right_bound
        self.actions = ['forward', 'back']

    def reset(self):
        """Put the agent back at position 0."""
        self.state = 0

    def move(self, action):
        """Apply ``action`` and return the resulting position.

        A move that would leave the [0, right_bound] range, or an action name
        that is not recognised, leaves the position unchanged.
        """
        step = 0
        if action == 'forward':
            if self.state < self.right_bound:
                step = 1
        elif action == 'back':
            if self.state > 0:
                step = -1
        self.state += step
        return self.state

    def get_action_name(self, action_number):
        """Return the action name stored at index ``action_number``."""
        name = self.actions[action_number]
        return name

class Learner:
    """Tabular Q-learning driver for an agent walking through an environment.

    The Q-table has one row per environment state and one column per agent
    action; actions are selected with an epsilon-greedy policy.

    Args:
        agent: Object exposing ``state``, ``actions``, ``move(action)`` and
            ``get_action_name(index)``.
        env: Object exposing ``size`` and ``end()``.
        alpha: Learning rate used in the Q-value update.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Probability of picking a random action instead of the
            greedy one.
    """

    def __init__(self, agent, env, alpha=0.1, gamma=0.6, epsilon=0.1):
        self.agent = agent
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        # Single source of truth for the table shape.
        self._init_table()

    def _init_table(self):
        """(Re)initialise the Q-table with zeros."""
        self.q_table = np.zeros((self.env.size, len(self.agent.actions)))

    def run(self, max_steps=None):
        """Run one episode from the agent's current state, updating the Q-table.

        The episode ends when ``get_reward`` reports the goal was reached, or
        after ``max_steps`` steps when a cap is given.

        Args:
            max_steps: Optional upper bound on the number of steps. ``None``
                (the default) means no limit, so the episode only ends at the
                goal; pass a cap when the goal might be unreachable.
        """
        steps = 0
        done = False
        while not done and (max_steps is None or steps < max_steps):
            current_state = self.agent.state
            if random.uniform(0, 1) < self.epsilon:
                # Exploration: pick a random action index.
                action_index = self.random_action()
            else:
                # Exploitation: best known action; argmax breaks ties by
                # returning the lowest index.
                action_index = int(np.argmax(self.q_table[current_state]))
            action = self.agent.get_action_name(action_index)
            next_state = self.agent.move(action)
            reward, done = self.get_reward(next_state)
            # A terminal state has no future return, so never bootstrap from
            # its row even if that row has been written to.
            next_max = 0.0 if done else np.max(self.q_table[next_state])
            old_value = self.q_table[current_state, action_index]
            self.q_table[current_state, action_index] = (
                (1 - self.alpha) * old_value
                + self.alpha * (reward + self.gamma * next_max)
            )
            steps += 1

    def random_action(self):
        """Return a uniformly random action index."""
        return random.choice(range(len(self.agent.actions)))

    def get_reward(self, state):
        """Return ``(reward, done)`` for arriving at ``state``.

        Reaching the environment's goal state yields ``(10, True)``; every
        other state yields ``(0, False)``.
        """
        if state == self.env.end():
            return 10, True
        return 0, False

def main():
    """Train the agent for a fixed number of episodes, printing the Q-table after each."""
    n_episodes = 10
    world = Environment(size=5)
    walker = Agent(right_bound=world.end())
    learner = Learner(walker, world)

    for episode in range(n_episodes):
        print(f"Episode {episode + 1}:")
        # Each episode starts from position 0; the Q-table carries over.
        walker.reset()
        learner.run()
        print(f"Q-Table after episode {episode + 1}:\n{learner.q_table}\n")

# Run the training demo only when executed directly (script or notebook cell),
# not as a side effect of importing this module.
if __name__ == "__main__":
    main()


Episode 1:
Q-Table after episode 1:
[[0. 0.]
 [0. 0.]
 [0. 0.]
 [1. 0.]
 [0. 0.]]

Episode 2:
Q-Table after episode 2:
[[0.   0.  ]
 [0.   0.  ]
 [0.06 0.  ]
 [1.9  0.  ]
 [0.   0.  ]]

Episode 3:
Q-Table after episode 3:
[[0.     0.    ]
 [0.0036 0.    ]
 [0.168  0.    ]
 [2.71   0.    ]
 [0.     0.    ]]

Episode 4:
Q-Table after episode 4:
[[4.104e-04 0.000e+00]
 [1.332e-02 1.296e-05]
 [3.138e-01 0.000e+00]
 [3.439e+00 0.000e+00]
 [0.000e+00 0.000e+00]]

Episode 5:
Q-Table after episode 5:
[[1.16856e-03 0.00000e+00]
 [3.08160e-02 1.29600e-05]
 [4.88760e-01 0.00000e+00]
 [4.09510e+00 0.00000e+00]
 [0.00000e+00 0.00000e+00]]

Episode 6:
Q-Table after episode 6:
[[2.900664e-03 0.000000e+00]
 [5.706000e-02 1.296000e-05]
 [6.855900e-01 0.000000e+00]
 [4.685590e+00 0.000000e+00]
 [0.000000e+00 0.000000e+00]]

Episode 7:
Q-Table after episode 7:
[[6.0341976e-03 0.0000000e+00]
 [9.2489400e-02 1.2960000e-05]
 [8.9816640e-01 0.0000000e+00]
 [5.2170310e+00 0.0000000e+00]
 [0.0000000e+00 0.0000