In [1]:
import numpy as np
import tensorflow as tf
import random
from collections import deque
import itertools
import os
from collections import namedtuple
import logging
import pandas as pd

class AnaquelEnv:
    def __init__(self, df, rows=3, cols=7):
        self.df = df.copy()
        self.df_iterations = df.copy()
        self.weight_matrix = np.array([
            [5.5, 4.7, 3.5, 3.0, 2.0, 1.3, 1.0, 1.0, 1.3, 2.0, 3.0, 3.5, 4.7, 5.5, 5.5, 4.7, 3.5, 3.0, 2.0, 1.3, 1.0, 1.0, 1.3, 2.0, 3.0, 3.5, 4.7, 5.5, 5.5, 4.7, 3.5, 3.0, 2.0, 1.3, 1.0, 1.0, 1.3, 2.0, 3.0, 3.5, 4.7, 5.5, 5.5, 4.7, 3.5, 3.0, 2.0, 1.3, 1.0, 1.0, 1.3, 2.0, 3.0, 3.5, 4.7, 5.5, 5.5, 4.7, 3.5, 3.0, 2.0, 1.3, 1.0, 1.0, 1.3, 2.0, 3.0, 3.5, 4.7, 5.5, 5.5, 4.7, 3.5, 3.0, 2.0, 1.3, 1.0, 1.0, 1.3, 2.0, 3.0, 3.5, 4.7, 5.5, 5.5, 4.7, 3.5, 3.0, 2.0, 1.3, 1.0, 1.0, 1.3, 2.0, 3.0, 3.5, 4.7, 5.5, 5.5, 4.7, 3.5, 3.0, 2.0, 1.3, 1.0, 1.0, 1.3, 2.0, 3.0, 3.5, 4.7, 5.5, 5.5, 4.7, 3.5, 3.0, 2.0, 1.3, 1.0, 1.0, 1.3, 2.0, 3.0, 3.5, 4.7, 5.5],
            [5.0, 4.3, 3.0, 2.7, 1.6, 1.0, 0.7, 0.7, 1.0, 1.6, 2.7, 3.0, 4.3, 5.0, 5.0, 4.3, 3.0, 2.7, 1.6, 1.0, 0.7, 0.7, 1.0, 1.6, 2.7, 3.0, 4.3, 5.0, 5.0, 4.3, 3.0, 2.7, 1.6, 1.0, 0.7, 0.7, 1.0, 1.6, 2.7, 3.0, 4.3, 5.0, 5.0, 4.3, 3.0, 2.7, 1.6, 1.0, 0.7, 0.7, 1.0, 1.6, 2.7, 3.0, 4.3, 5.0, 5.0, 4.3, 3.0, 2.7, 1.6, 1.0, 0.7, 0.7, 1.0, 1.6, 2.7, 3.0, 4.3, 5.0, 5.0, 4.3, 3.0, 2.7, 1.6, 1.0, 0.7, 0.7, 1.0, 1.6, 2.7, 3.0, 4.3, 5.0, 5.0, 4.3, 3.0, 2.7, 1.6, 1.0, 0.7, 0.7, 1.0, 1.6, 2.7, 3.0, 4.3, 5.0, 5.0, 4.3, 3.0, 2.7, 1.6, 1.0, 0.7, 0.7, 1.0, 1.6, 2.7, 3.0, 4.3, 5.0, 5.0, 4.3, 3.0, 2.7, 1.6, 1.0, 0.7, 0.7, 1.0, 1.6, 2.7, 3.0, 4.3, 5.0],
            [4.3, 3.6, 2.5, 2.0, 1.3, 0.7, 0.5, 0.5, 0.7, 1.3, 2.0, 2.5, 3.6, 4.3, 4.3, 3.6, 2.5, 2.0, 1.3, 0.7, 0.5, 0.5, 0.7, 1.3, 2.0, 2.5, 3.6, 4.3, 4.3, 3.6, 2.5, 2.0, 1.3, 0.7, 0.5, 0.5, 0.7, 1.3, 2.0, 2.5, 3.6, 4.3, 4.3, 3.6, 2.5, 2.0, 1.3, 0.7, 0.5, 0.5, 0.7, 1.3, 2.0, 2.5, 3.6, 4.3, 4.3, 3.6, 2.5, 2.0, 1.3, 0.7, 0.5, 0.5, 0.7, 1.3, 2.0, 2.5, 3.6, 4.3, 4.3, 3.6, 2.5, 2.0, 1.3, 0.7, 0.5, 0.5, 0.7, 1.3, 2.0, 2.5, 3.6, 4.3, 4.3, 3.6, 2.5, 2.0, 1.3, 0.7, 0.5, 0.5, 0.7, 1.3, 2.0, 2.5, 3.6, 4.3, 4.3, 3.6, 2.5, 2.0, 1.3, 0.7, 0.5, 0.5, 0.7, 1.3, 2.0, 2.5, 3.6, 4.3, 4.3, 3.6, 2.5, 2.0, 1.3, 0.7, 0.5, 0.5, 0.7, 1.3, 2.0, 2.5, 3.6, 4.3],
            [9.0, 7.7, 6.5, 5.5, 5.0, 4.3, 4.0, 4.0, 4.3, 5.0, 5.5, 6.5, 7.7, 9.0, 9.0, 7.7, 6.5, 5.5, 5.0, 4.3, 4.0, 4.0, 4.3, 5.0, 5.5, 6.5, 7.7, 9.0, 9.0, 7.7, 6.5, 5.5, 5.0, 4.3, 4.0, 4.0, 4.3, 5.0, 5.5, 6.5, 7.7, 9.0, 9.0, 7.7, 6.5, 5.5, 5.0, 4.3, 4.0, 4.0, 4.3, 5.0, 5.5, 6.5, 7.7, 9.0, 9.0, 7.7, 6.5, 5.5, 5.0, 4.3, 4.0, 4.0, 4.3, 5.0, 5.5, 6.5, 7.7, 9.0, 9.0, 7.7, 6.5, 5.5, 5.0, 4.3, 4.0, 4.0, 4.3, 5.0, 5.5, 6.5, 7.7, 9.0, 9.0, 7.7, 6.5, 5.5, 5.0, 4.3, 4.0, 4.0, 4.3, 5.0, 5.5, 6.5, 7.7, 9.0, 9.0, 7.7, 6.5, 5.5, 5.0, 4.3, 4.0, 4.0, 4.3, 5.0, 5.5, 6.5, 7.7, 9.0, 9.0, 7.7, 6.5, 5.5, 5.0, 4.3, 4.0, 4.0, 4.3, 5.0, 5.5, 6.5, 7.7, 9.0],
            [9.8, 8.5, 7.0, 6.0, 5.5, 4.7, 4.3, 4.3, 4.7, 5.5, 6.0, 7.0, 8.5, 9.8, 9.8, 8.5, 7.0, 6.0, 5.5, 4.7, 4.3, 4.3, 4.7, 5.5, 6.0, 7.0, 8.5, 9.8, 9.8, 8.5, 7.0, 6.0, 5.5, 4.7, 4.3, 4.3, 4.7, 5.5, 6.0, 7.0, 8.5, 9.8, 9.8, 8.5, 7.0, 6.0, 5.5, 4.7, 4.3, 4.3, 4.7, 5.5, 6.0, 7.0, 8.5, 9.8, 9.8, 8.5, 7.0, 6.0, 5.5, 4.7, 4.3, 4.3, 4.7, 5.5, 6.0, 7.0, 8.5, 9.8, 9.8, 8.5, 7.0, 6.0, 5.5, 4.7, 4.3, 4.3, 4.7, 5.5, 6.0, 7.0, 8.5, 9.8, 9.8, 8.5, 7.0, 6.0, 5.5, 4.7, 4.3, 4.3, 4.7, 5.5, 6.0, 7.0, 8.5, 9.8, 9.8, 8.5, 7.0, 6.0, 5.5, 4.7, 4.3, 4.3, 4.7, 5.5, 6.0, 7.0, 8.5, 9.8, 9.8, 8.5, 7.0, 6.0, 5.5, 4.7, 4.3, 4.3, 4.7, 5.5, 6.0, 7.0, 8.5, 9.8],
            [10.5, 9.0, 7.7, 6.5, 6.0, 5.0, 4.7, 4.7, 5.0, 6.0, 6.5, 7.7, 9.0, 10.5, 10.5, 9.0, 7.7, 6.5, 6.0, 5.0, 4.7, 4.7, 5.0, 6.0, 6.5, 7.7, 9.0, 10.5, 10.5, 9.0, 7.7, 6.5, 6.0, 5.0, 4.7, 4.7, 5.0, 6.0, 6.5, 7.7, 9.0, 10.5, 10.5, 9.0, 7.7, 6.5, 6.0, 5.0, 4.7, 4.7, 5.0, 6.0, 6.5, 7.7, 9.0, 10.5, 10.5, 9.0, 7.7, 6.5, 6.0, 5.0, 4.7, 4.7, 5.0, 6.0, 6.5, 7.7, 9.0, 10.5, 10.5, 9.0, 7.7, 6.5, 6.0, 5.0, 4.7, 4.7, 5.0, 6.0, 6.5, 7.7, 9.0, 10.5, 10.5, 9.0, 7.7, 6.5, 6.0, 5.0, 4.7, 4.7, 5.0, 6.0, 6.5, 7.7, 9.0, 10.5, 10.5, 9.0, 7.7, 6.5, 6.0, 5.0, 4.7, 4.7, 5.0, 6.0, 6.5, 7.7, 9.0, 10.5, 10.5, 9.0, 7.7, 6.5, 6.0, 5.0, 4.7, 4.7, 5.0, 6.0, 6.5, 7.7, 9.0, 10.5],
        ])
        self.rows = self.weight_matrix.shape[0]
        self.cols = self.weight_matrix.shape[1]
        # Matrix of weights (higher values indicate higher cost to place an item)

        self.avail_matrix = np.zeros(self.weight_matrix.shape)
        self.products_id = np.zeros(self.weight_matrix.shape)
        self.products_id.fill(-1)
        
        # Mapping product IDs to indexes for one-hot encoding
        unique_products = df['PRODUCTO'].unique()
        self.product_id_to_index = {pid: idx for idx, pid in enumerate(unique_products)}
        self.num_products = len(unique_products)

        # State representations
        self.state_quantities = np.zeros(self.weight_matrix.shape)  
        self.state_space = self.rows * self.cols  # Total possible states
        self.action_space = rows * cols * self.num_products # Total possible placements
        self.available_cells = [(i, j) for i in range(self.rows) for j in range(self.cols) if self.avail_matrix[i, j] == 0]

    def reset(self):
        """Reset environment for new episode"""
        self.df_iterations = self.df.copy()
        self.state_quantities.fill(0)
        self.avail_matrix.fill(0)
        self.products_id.fill(-1)
        return self.state_quantities, self.products_id
    
    def step(self, action):
        """Perform an action and return next state, reward, and done flag"""
        action = action

        item, cell = divmod(action, self.num_products)
        #print(item, cell, self.avail_matrix.shape, self.num_products)
        row,col = self.available_cells[cell]
        product_id = self.df_iterations.iloc[item]['PRODUCTO']
        quantity = self.df_iterations.iloc[item]['UNDESTIMADAS']
        #print(row, col, product_id, quantity, item)
        if self.avail_matrix[row, col] == 0:  # If cell is empty
            self.products_id[row, col] = product_id
            self.state_quantities[row, col] = quantity
            self.avail_matrix[row, col] = 1

        reward = self.compute_reward(row, col)
        done = self.is_done()

        return cell, reward, done


    def compute_reward(self, row, col):
        """Reward function: balance zones, prioritize high-demand items in front"""
        #print(row, col, self.avail_matrix[row, col], self.state_quantities[row, col], self.weight_matrix[row, col])
        if self.avail_matrix[row, col] == -1:
            return -5000
        return - self.state_quantities[row, col] * self.weight_matrix[row, col]

    def is_done(self):
        items_placed = self.avail_matrix.sum()
        return items_placed == self.num_products  # Done when all cells are filled


2025-03-12 10:10:17.490346: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-12 10:10:17.497639: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741792217.506307   79545 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741792217.508802   79545 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-12 10:10:17.517912: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
file_path = 'productos_anaquel.xls'
df_ = []
i = 1

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler('log.txt')
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

logger.info('Reading file...')

try:
    while True:
        df_.append(pd.read_excel(file_path, sheet_name=f"Sheet {i}"))
        i += 1
except:
    pass

df_ = pd.concat(df_, ignore_index=True)
df = df_[df_['ANAQUEL'].str.startswith('C', na=False)]
df = df[df['CAMPA'] == 201416]
df.reset_index(drop=True, inplace=True)



In [3]:
class QNetwork(tf.keras.Model):
    def __init__(self, input_dim, output_dim):
        super(QNetwork, self).__init__()
        self.fc1 = tf.keras.layers.Dense(256, activation='relu')
        self.fc2 = tf.keras.layers.Dense(256, activation='relu')
        self.fc3 = tf.keras.layers.Dense(128, activation='relu')
        self.fc4 = tf.keras.layers.Dense(output_dim, activation=None)  # No activation, raw Q-values

    def call(self, inputs):
        x = self.fc1(inputs)
        x = self.fc2(x)
        x = self.fc3(x)
        return self.fc4(x)

In [4]:
def make_epsilon_greedy_policy(estimator, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon.

    Args:
        estimator: A Q-Network that returns Q-values for a given state.
        nA: Number of actions in the environment.

    Returns:
        A function that takes (sess, observation, epsilon) and returns
        probabilities for each action as a numpy array of length nA.
    """
    def policy_fn(sess, observation, epsilon):
        A = np.ones(nA, dtype=float) * epsilon / nA  # Uniform exploration probability
        q_values = estimator(tf.expand_dims(observation, axis=0))[0].numpy()  # Get Q-values
        best_action = np.argmax(q_values)  # Choose best action
        A[best_action] += (1.0 - epsilon)  # Favor best action
        return A
    return policy_fn


In [5]:
def update_target_network(q_network, target_q_network):
    target_q_network.set_weights(q_network.get_weights())

In [None]:
def deep_q_learning(sess,
                    env: AnaquelEnv,
                    q_estimator: QNetwork,
                    target_estimator: QNetwork,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=5000,
                    replay_memory_init_size=1000,
                    update_target_estimator_every=20,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=50000,
                    batch_size=32):

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    replay_memory = deque(maxlen=replay_memory_size)
    rewards_list = []
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    # Create directories for saving models
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_path = os.path.join(checkpoint_dir, "model.weights.h5")

    # Create epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # Define ε-greedy policy
    policy = make_epsilon_greedy_policy(q_estimator, env.action_space)

    # Populate replay memory with initial random experience
    print("Populating replay memory...")
    env.reset()
    # to be understand
    state = np.zeros(env.state_space)

    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(i, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

        cell, reward, done = env.step((action))
        next_state = state.copy()
        next_state[cell] = 1

        replay_memory.append(Transition(state, action, reward, next_state, done))

        if done:
            env.reset()
            state = np.zeros(env.state_space)
        else:
            state = next_state
        #print(next_state.shape, state.shape)

    print("Replay memory initialized.")

    for episode in range(num_episodes):
        env.reset()
        state = np.zeros(env.state_space)

        total_reward = 0
        done = False
        step_count = 0

        while not done:
            epsilon = epsilons[min(step_count, epsilon_decay_steps - 1)]
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

            cell, reward, done = env.step((action))
            next_state = state.copy()
            next_state[cell] = 1
            #print(next_state.shape, state.shape)
            
            replay_memory.append(Transition(state, action, reward, next_state, done))

            if len(replay_memory) >= batch_size:
                batch = random.sample(replay_memory, batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)

                states = tf.convert_to_tensor(np.array(states), dtype=tf.float32)
                actions = tf.convert_to_tensor(actions, dtype=tf.int32)
                rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
                next_states = tf.convert_to_tensor(np.array(next_states), dtype=tf.float32)
                dones = tf.convert_to_tensor(np.array(dones, dtype=np.float32), dtype=tf.float32)

                with tf.GradientTape() as tape:
                    q_values = q_estimator(states)
                    q_values = tf.gather(q_values, actions, batch_dims=1)

                    next_q_values = target_estimator(next_states)
                    max_next_q_values = tf.reduce_max(next_q_values, axis=1)
                    targets = rewards + discount_factor * max_next_q_values * (1 - dones)
                    
                    loss = tf.keras.losses.MSE(targets, q_values)

                grads = tape.gradient(loss, q_estimator.trainable_variables)
                optimizer.apply_gradients(zip(grads, q_estimator.trainable_variables))

            state = next_state
            total_reward += reward
            step_count += 1
            #print(f"Step {step_count}, Reward: {reward}, Epsilon: {epsilon:.4f}, done: {done}, cell: {cell}")

        rewards_list.append(total_reward)

        # Update target network every few episodes
        if episode % update_target_estimator_every == 0:
            target_estimator.set_weights(q_estimator.get_weights())

        # Save model checkpoint
        q_estimator.save_weights(checkpoint_path)

        print(f"Episode {episode+1}, Reward: {total_reward}, Epsilon: {epsilon:.4f}")

    return rewards_list


In [7]:
experiment_dir = "./experiments"

checkpoint_path = os.path.join(experiment_dir, 'checkpoints', "model.weights.h5")

# Initialize environment and state processor
env = AnaquelEnv(df)

# Get input and output dimensions
num_products = env.num_products
input_dim = (env.rows * env.cols) * (1 + num_products)
output_dim = env.action_space

# Create Q-networks (online & target)
q_network = QNetwork(input_dim, output_dim)
target_q_network = QNetwork(input_dim, output_dim)
target_q_network.set_weights(q_network.get_weights())  # Sync weights initially

In [8]:
if os.path.exists(checkpoint_path):
    print("Loading saved weights...")
    dummy_input = tf.random.uniform((1, input_dim))  # Create dummy input
    q_network(dummy_input)  # Forward pass to initialize model
    target_q_network(dummy_input)

    # Now load weights
    q_network.load_weights(checkpoint_path)
    target_q_network.load_weights(checkpoint_path)
else:
    print("No saved model found! Train the model first.")

No saved model found! Train the model first.


In [9]:
# Train the agent
rewards_list = deep_q_learning(None, env, q_network, target_q_network, num_episodes=500, experiment_dir=experiment_dir)


Populating replay memory...


I0000 00:00:1741792252.474619   79545 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13887 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 Ti SUPER, pci bus id: 0000:01:00.0, compute capability: 8.9


Replay memory initialized.
Episode 1, Reward: -65448779.89999988, Epsilon: 0.9191
Episode 2, Reward: -69677131.39999998, Epsilon: 0.9122
Episode 3, Reward: -61099517.39999989, Epsilon: 0.9358
Episode 4, Reward: -53728066.5999999, Epsilon: 0.9366
Episode 5, Reward: -69008350.99999999, Epsilon: 0.9236
Episode 6, Reward: -92054982.29999998, Epsilon: 0.9164
Episode 7, Reward: -98752862.20000017, Epsilon: 0.9041
Episode 8, Reward: -82156671.80000009, Epsilon: 0.9356
Episode 9, Reward: -120571901.5, Epsilon: 0.8289
Episode 10, Reward: -57627766.400000006, Epsilon: 0.9376
Episode 11, Reward: -79249763.8999998, Epsilon: 0.9352
Episode 12, Reward: -63061475.90000021, Epsilon: 0.9291
Episode 13, Reward: -79257507.20000015, Epsilon: 0.9120
Episode 14, Reward: -90461088.49999987, Epsilon: 0.8940
Episode 15, Reward: -61292681.69999991, Epsilon: 0.9148
Episode 16, Reward: -90551860.39999986, Epsilon: 0.9076
Episode 17, Reward: -74181396.00000016, Epsilon: 0.9222
Episode 18, Reward: -48337520.4999998

KeyboardInterrupt: 

In [None]:
def test_trained_model(env, q_network, state_processor, num_episodes=10):
    """
    Runs the trained agent in the environment without exploration (ε = 0).

    Args:
        env: The environment to test in.
        q_network: The trained Q-Network.
        state_processor: Processes environment states into model-compatible format.
        num_episodes: Number of episodes to test.

    Returns:
        A list of total rewards for each episode.
    """
    total_rewards = []

    for episode in range(num_episodes):
        state_quantities, state_products_onehot = env.reset()
        state = state_processor.process(state_quantities, state_products_onehot)

        total_reward = 0
        done = False
        step_count = 0

        while not done:
            # Get action from trained model (greedy policy, no exploration)
            q_values = q_network(tf.expand_dims(state, axis=0))[0].numpy()
            action = np.argmax(q_values)  # Choose best action

            # Convert action index to (zone, anaquel, row, col)
            zone, anaquel, row, col = np.unravel_index(action, (env.zones, env.anaqueles_per_zone, env.rows, env.cols))

            # Take step in the environment
            next_state_quantities, next_state_products_onehot, reward, done = env.step((zone, anaquel, row, col))
            next_state = state_processor.process(next_state_quantities, next_state_products_onehot)

            state = next_state
            total_reward += reward
            step_count += 1

        total_rewards.append(total_reward)
        print(f"Test Episode {episode+1}: Total Reward = {total_reward}")

    return total_rewards


In [None]:
test_rewards = test_trained_model(env, q_network, state_processor, num_episodes=10)
print("Average Reward over 10 Episodes:", np.mean(test_rewards))
