## Laboratorio # 3 - RL
### Task 2

In [26]:
import numpy as np
import random

In [27]:
class InventoryEnvironment:
    """Simulated inventory for two products with random per-step demand.

    A state is a dict mapping each product name to its current stock level.
    An action is a dict mapping each product name to the number of units to
    restock before demand is served.
    """

    def __init__(self):
        self.products = ['product_A', 'product_B']
        # Stock levels are capped at this value after restocking.
        self.max_stock = 10
        # Possible demand values per product; one is drawn uniformly each step.
        self.demand = {'product_A': [0, 1, 2], 'product_B': [0, 1, 2]}
        # Cost charged per unit requested for restock.
        self.restock_cost = {'product_A': 5, 'product_B': 7}
        # Revenue earned per unit sold.
        self.sell_price = {'product_A': 10, 'product_B': 15}
        # Populated by reset(); remains None until then.
        self.state = None

    def reset(self):
        """Draw a random stock level in [0, max_stock] per product.

        Returns:
            The new state dict (also stored on ``self.state``).
        """
        self.state = {product: random.randint(0, self.max_stock) for product in self.products}
        return self.state

    def step(self, action):
        """Restock, serve one random demand draw per product, and score it.

        Args:
            action: Dict mapping each product to the units to restock.

        Returns:
            A ``(state, reward)`` tuple. ``state`` is a newly created dict, so
            state dicts returned earlier (by ``reset`` or a previous ``step``)
            are never modified and can safely be stored as "previous state".
            ``reward`` is total sales revenue minus total restock cost.
        """
        reward = 0
        # Work on a copy: mutating self.state in place would retroactively
        # change every state reference already handed out to callers.
        next_state = dict(self.state)
        for product in self.products:
            restock = action[product]
            available = min(self.max_stock, self.state[product] + restock)
            demand = random.choice(self.demand[product])
            sales = min(demand, available)
            next_state[product] = available - sales
            # The requested restock amount is charged in full, even when part
            # of it is discarded by the max_stock cap.
            reward += sales * self.sell_price[product] - restock * self.restock_cost[product]
        self.state = next_state
        return self.state, reward

In [28]:
def generate_episode(env, epsilon=0.1, max_steps=1):
    """Roll out one episode using an epsilon-perturbed default policy.

    For each product independently, a uniformly random restock amount in
    ``[0, env.max_stock]`` is chosen with probability ``epsilon``; otherwise
    the default action of restocking ``env.max_stock // 2`` units is used.

    Args:
        env: Environment exposing ``products``, ``max_stock``, ``reset()`` and
            ``step(action)`` returning ``(state, reward)``.
        epsilon: Per-product probability of choosing a random restock amount.
        max_steps: Number of transitions to collect. The default of 1 keeps
            the original simplification of single-step episodes.

    Returns:
        A list of ``(state, action, reward, new_state)`` tuples. The state
        dicts are snapshots, so the recorded pre-step state stays intact even
        if the environment mutates its state object in place.
    """
    # Copy the state so a later env.step() cannot overwrite what is recorded.
    state = dict(env.reset())
    episode = []
    for _ in range(max_steps):
        action = {
            product: (
                random.randint(0, env.max_stock)
                if random.uniform(0, 1) < epsilon
                # Default action: restock half of the maximum stock.
                else env.max_stock // 2
            )
            for product in env.products
        }
        new_state, reward = env.step(action)
        new_state = dict(new_state)
        episode.append((state, action, reward, new_state))
        state = new_state
    return episode

In [29]:
# Off-policy learning implemented with tabular Q-learning
def q_learning(env, num_episodes, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Learn one Q-table per product from sampled episodes.

    Each table is a square array of side ``env.max_stock + 1``, indexed as
    ``[stock_level, restock_amount]``. The episode's reward (shared by all
    products) is used in every product's update.

    Args:
        env: Environment passed to ``generate_episode``; must expose
            ``products`` and ``max_stock``.
        num_episodes: Number of episodes to sample.
        alpha: Learning rate.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Exploration probability forwarded to ``generate_episode``.

    Returns:
        Dict mapping each product name to its Q-table.
    """
    side = env.max_stock + 1
    Q = {product: np.zeros((side, side)) for product in env.products}

    for _ in range(num_episodes):
        transitions = generate_episode(env, epsilon=epsilon)
        for state, action, reward, new_state in transitions:
            for product in env.products:
                table = Q[product]
                s, a = state[product], action[product]
                # Greedy value of the successor state (the off-policy target).
                best_next = np.max(table[new_state[product]])
                target = reward + gamma * best_next
                table[s, a] = (1 - alpha) * table[s, a] + alpha * target
    return Q

In [30]:
# Train on a fresh environment, then display the learned Q-table per product.
env = InventoryEnvironment()
Q_values = q_learning(env, num_episodes=1000)

for product in env.products:
    # Header and table on separate lines, one product after another.
    print(f"Q-values for {product}:", Q_values[product], sep="\n")

NameError: name 'policy' is not defined