In [None]:
!{sys.executable} -m pip install stable-baselines3
!{sys.executable} -m pip install shimmy>=0.2.1
!{sys.executable} -m pip install gymnasium

In [None]:
import tensorflow as tf
import warnings
import numpy as np
import pickle
import random
import tensorflow as tf
import keras
import matplotlib.pyplot as plt
from tqdm import tqdm
from gym import Env
from gym import spaces
from collections import deque
from tabulate import tabulate
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
# Load the pickled training data (despite the .txt extension the file is binary).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('dataframes_train.txt', mode='rb') as train_file:
    dataframes_train = pickle.load(train_file)

In [None]:
# Load the pickled test data (despite the .txt extension the file is binary).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('dataframes_test.txt', mode='rb') as test_file:
    dataframes_test = pickle.load(test_file)

# Ambiente

In [None]:
# Labels of the observation features (kept verbatim, in Portuguese).
# Only the length of this list is used below, to size the observation space.
ob_data = [
    "qtd disponivel de venda",    # quantity still available on the sale
    "qtd disponivel de compra",   # quantity still available on the purchase
    "delta preco ideal - atual",  # gap between the ideal and the current price
    "qtd casada",                 # quantity already matched
]

In [None]:
class AuxFunctions():
  """
  Class with auxiliary functions used in the environment
  """

  # Fields that identify sales sharing the same pool of purchases
  _GROUP_KEYS = ('Cód. Cliente', 'Dt. Operação', 'Cód. Título', 'Cód. Corretora')

  def __init__(self):
    pass

  def _same_group(self, sale, reference_sale):
    """Tells whether two sales agree on every field in _GROUP_KEYS."""
    return all(sale[key] == reference_sale[key] for key in self._GROUP_KEYS)

  def calculate_cdi(self, sale_price, purchase_price, du, di):
    """Calculates the CDI for the combined purchases

        The return of the operation is annualized over 252 business days and
        expressed as a percentage of the DI tax.

        Args:
            sale_price (float): The price of the sale
            purchase_price (float): The average price of the combined purchases
            du (int): The number of business days between the sale operation date and the expiration date
            di (float): The DI tax of the day

        Returns:
            float: The CDI for the combined purchases, or 0 when it is undefined
            (purchase_price, du or di equal to 0)
    """
    # Every one of these values is used as a divisor below; 0 is the
    # "no CDI" value already returned when nothing has been purchased.
    if purchase_price == 0 or du == 0 or di == 0:
      return 0

    rent = (sale_price / purchase_price) - 1
    annual_rent = ((1 + rent) ** (252 / du)) - 1

    return (annual_rent / di) * 100

  def calculate_ideal_price(self, sale_price, du, di):
    """Calculates the ideal average price for the combined purchases
        Uses the formula: pi = pv / (di + 1)^(du / 252)
        where:
        pi = ideal price
        pv = sale price
        du = business days
        di = DI tax of the day

      Args:
          sale_price (float): the mean price of the sale operation
          du (int): business days
          di (float): DI tax of the day

      Returns:
          float: ideal average price for the combined purchases
      """
    return sale_price / ((di + 1) ** (du / 252))

  def calculate_average_price(self, quantities, purchases):
    """Calculates the current average price for the combined purchases

      The result is the quantity-weighted mean of the "Preço" column, where
      quantities[i] is the weight of the i-th row of purchases.

      Args:
        quantities (list): an array with the combined quantity for each purchase
        purchases (dataframe): all purchases for a specific sale

      Returns:
        float: The new average price for the combined purchases (0 when nothing is combined)
    """
    total_price = 0
    total_quantity = 0

    for purchase_index, quantity in enumerate(quantities):
      total_price += purchases.iloc[purchase_index]["Preço"] * quantity
      total_quantity += quantity

    if total_quantity == 0:
      return 0
    return total_price / total_quantity

  def update_dataframes(self, current_sale_index, original_dataframes, dataframes, purchases_quantities):
    """
    Updates the purchases quantities for the sales that share the same purchases as the one combined in the moment.

    Sales belong to the same group when they agree on client, operation date,
    title and broker codes. The group is taken to be the contiguous run of such
    sales around current_sale_index. The current sale itself is part of that run
    and is updated as well.

    Args:
        current_sale_index (int): The index of the sale combined in the moment
        original_dataframes (list): The original dataframes with all the sales and respective purchases
        dataframes (list): The dataframes updated with all combinations already made
        purchases_quantities (list): The quantities of the combined purchases

    Returns:
        dataframes (list): The dataframes updated with the new quantities of the combined purchases
        (the same list object that was passed in, modified in place)
    """
    current_sale = dataframes[current_sale_index]['sale']

    # Walk backwards to the first sale of the group. Index 0 is a valid
    # candidate and must be inspected too.
    first_index = current_sale_index
    while first_index > 0 and self._same_group(original_dataframes[first_index - 1]['sale'], current_sale):
      first_index -= 1

    # Walk forwards from the start of the group, discounting the combined
    # quantities from every sale that shares the purchases.
    for sale_index in range(first_index, len(dataframes)):
      if not self._same_group(original_dataframes[sale_index]['sale'], current_sale):
        break

      purchases = dataframes[sale_index]['purchase']
      quantity_column = purchases.columns.get_loc("Quantidade")
      for i in range(len(purchases_quantities)):
        purchases.iloc[i, quantity_column] -= purchases_quantities[i]

    return dataframes


In [None]:
from copy import deepcopy
class Matching(Env):
  """ Implements the environment for an RL agent to learn how to combine purchases and sales to maximize the CDI around 100%

  Each episode works on one sale, picked at random by reset(), and on the
  purchases stored next to it. Every step addresses a single purchase, cycling
  through them by step number (step modulo number of purchases). The action
  selects which fraction of that purchase is combined (positive) or
  discombined (negative).

  Args:
      Env (object): OpenAI Gym environment as base class
  """
  def __init__(self, dataframes):
    """
    Args:
        dataframes (list): one entry per sale; each entry is indexed with
            'sale' and 'purchase'. Two deep copies are kept: a pristine one and
            a working one.
    """
    # Fractions applied to the current purchase, indexed by the discrete action
    self.actions = [1, 0.5, 0, -0.5, -1]
    self.action_space = spaces.Discrete(len(self.actions))
    self.observation_space = spaces.Box(0, np.inf, shape=(len(ob_data),))
    self.utils = AuxFunctions()
    self.original_data = deepcopy(dataframes)
    self.data = deepcopy(dataframes)
    # Per sale, the quantity currently matched against each of its purchases
    self.all_quantities_matched = [np.zeros(x['purchase'].shape[0]) for x in self.original_data]
    self.du = 0
    self.di = 0
    self.current_episode = 0
    # Last CDI obtained for each sale (0 while the sale has not been finished)
    self.cdis = np.zeros(len(self.data))
    self.state = None
    # Per sale index, the matched quantities of a combination whose CDI landed in the target band
    self.combinations = {}
    # Consecutive steps that moved no quantity, and the count that triggers a penalty
    self.none_actions = 0
    self.max_none_actions = 3

  def update_data(self, action):
    """Updates the quantities for the purchases and sales based on the agent's action

    Args:
        action (float): The percentage of the purchase quantity to be combined or discombined (positive or negative respectively)

    Returns:
        number: The actual quantity that was combined (positive) or discombined (negative)
    """
    current_purchase_index = self.current_step % self.purchases.shape[0]

    if action < 0:
      # Discombining is a fraction of what is already matched with this purchase
      quantity_of_purchase = int(action * self.all_quantities_matched[self.current_episode][current_purchase_index])
    else:
      # Combining is a fraction of what the purchase still has available
      quantity_of_purchase = int(action * self.purchases.iloc[current_purchase_index]['Quantidade'])

    # Never discombine more than what is currently matched
    if quantity_of_purchase < 0:
      if abs(quantity_of_purchase) > self.all_quantities_matched[self.current_episode][current_purchase_index]:
        quantity_of_purchase = -1 * self.all_quantities_matched[self.current_episode][current_purchase_index]

    # Never combine more than what is left of the sale
    if quantity_of_purchase - self.sale["Quantidade"] > 0:
      quantity_of_purchase = self.sale["Quantidade"]

    self.all_quantities_matched[self.current_episode][current_purchase_index] += quantity_of_purchase

    self.sale["Quantidade"] = self.sale["Quantidade"] - quantity_of_purchase
    self.purchases.iloc[current_purchase_index, self.purchases.columns.get_loc("Quantidade")] -= quantity_of_purchase
    return quantity_of_purchase

  def step(self, action):
    """Executes the action and updates the environment state by updating the quantities and prices of the purchases and sales, and calculating the reward and CDI
        The stop conditions are if the total sale quantity was used, or if the maximum number of steps is reached

    Args:
        action (int): The index of the action to be executed

    Returns:
        state (list): The new state of the environment
        reward (float): The reward for the agent
        done (bool): Whether the episode is done or not
        info (dict): Additional information for debugging (always empty)
    """
    done = False
    reward = 0
    quantity = self.update_data(self.actions[action])

    if quantity == 0:
      self.none_actions += 1
    else:
      self.none_actions = 0

    # A combination stored for this sale replaces the matched quantities just computed
    if self.combinations.get(self.current_episode) is not None and self.combinations[self.current_episode].size > 0:
      self.all_quantities_matched[self.current_episode] = deepcopy(self.combinations[self.current_episode])

    current_average_price = self.utils.calculate_average_price(self.all_quantities_matched[self.current_episode], self.purchases)

    old_distance = self.state[2]

    self.state = self.get_state()

    # Reward based on how close the average price is to the ideal price compared to the previous state
    new_distance = self.state[2]
    if new_distance < old_distance and current_average_price <= self.sale["Preço"]:
      if new_distance != 0:
        reward += 500 / new_distance
    else:
      reward -= 500

    # Penalty applied on the step where the count of consecutive no-op steps reaches the limit
    if self.none_actions == self.max_none_actions:
      reward -= 250

    # Reward based on the agent using all the sale quantity and the CDI being close to 100%
    # If the CDI is between 95% and 110%, the agent receives a high reward
    # If the CDI is below 0, the agent receives a high negative reward
    # If the CDI is not close to 100%, the agent receives a reward inversely proportional to the distance from 100%
    if self.sale_quantity == 0:
      done = True
      if 95 <= self.current_cdi <= 110:
        self.combinations[self.current_episode] = deepcopy(self.all_quantities_matched[self.current_episode])
        reward += 100000
      elif self.current_cdi < 0:
        reward += -10000
      else:
        reward += 1000 / abs(self.current_cdi - 100)

    elif self.current_step >= self.max_steps:
      reward -= 1000
      done = True

    if done:
      self.render()
      self.cdis[self.current_episode] = self.current_cdi
      # Discount the matched quantities from the sales sharing these purchases,
      # then clear the matched quantities and restore the finished sale itself
      self.data = self.utils.update_dataframes(self.current_episode, self.original_data, self.data, self.all_quantities_matched[self.current_episode])
      self.all_quantities_matched = [np.zeros(x['purchase'].shape[0]) for x in self.original_data]
      self.data[self.current_episode] = deepcopy(self.original_data[self.current_episode])

    self.current_step += 1
    return self.state, reward, done, {}

  def reset(self):
    """Resets the environment to the initial state, which means going to the next sale (chosen randomly)

    Returns:
        state (list): The initial state of the environment
    """
    self.none_actions = 0
    self.current_episode = np.random.randint(0, len(self.data))
    self.current_step = 0
    self.sale = self.data[self.current_episode]["sale"]
    self.purchases = self.data[self.current_episode]["purchase"]
    # Clear the matched quantities so that an episode abandoned before it was
    # done cannot leak into the new one. This used to be written to an
    # attribute that nothing reads.
    self.all_quantities_matched = [np.zeros(x['purchase'].shape[0]) for x in self.original_data]
    return self.get_state()

  def get_state(self):
    """ Returns the observation data for the current state. It is updated on each step of the environment and episode
        It returns the remaining sale quantity, the available quantity of the current purchase, the absolute
        difference between the ideal price and the average purchase price, and the quantity matched with the
        current purchase

        Returns:
            state (list): The observation data for the current state
    """
    self.sale_quantity = self.sale["Quantidade"]
    self.di = self.sale['DI']
    self.du = self.sale['du']

    purchase_index = self.current_step % self.purchases.shape[0]
    self.purchase_quantity = self.purchases.iloc[purchase_index]["Quantidade"]

    # Step budget of the episode: five steps per purchase
    self.max_steps = self.purchases.shape[0] * 5

    self.ideal_price = self.utils.calculate_ideal_price(self.sale["Preço"], self.du, self.di)

    self.purchase_average_price = self.utils.calculate_average_price(self.all_quantities_matched[self.current_episode], self.purchases)

    self.current_cdi = self.utils.calculate_cdi(self.sale["Preço"], self.purchase_average_price, self.du, self.di)
    purchase_quantity_matched = self.all_quantities_matched[self.current_episode][purchase_index]
    self.state = [self.sale_quantity, self.purchase_quantity, abs(self.ideal_price - self.purchase_average_price), purchase_quantity_matched]
    return self.state

  def render(self):
    """
    Prints the current CDI, sale initial quantity, remaining sale quantity, and purchases matched quantity for the current episode
    """
    headers = ["Description", "Value"]
    data = [
        ("Current CDI", self.current_cdi),
        ("Sale Initial Quantity", self.original_data[self.current_episode]['sale']["Quantidade"]),
        ("Remaining Sale Quantity", self.data[self.current_episode]['sale']["Quantidade"]),
        ("Purchases Matched Quantity", sum(self.all_quantities_matched[self.current_episode]))
    ]

    print(tabulate(data, headers=headers, tablefmt="grid"))

# Agente

In [None]:
class Agent():
  """ Implements the DQN agent to learn how to combine purchases and sales to maximize the CDI around 100%
  """
  def __init__(self,
               train_env,
               test_env,
               gamma=0.9,
               epsilon=1.0,
               epsilon_decay=0.994,
               epsilon_min=0.1,
               buffer_size=2000,
               lr=1e-6,
               batch_size=32):
    """
    Args:
        train_env (object): The environment for training the agent
        test_env (object): The environment for testing the agent
        gamma (float): The discount factor
        epsilon (float): The exploration rate
        epsilon_decay (float): The rate at which the exploration rate decays
        epsilon_min (float): The minimum exploration rate
        buffer_size (int): The size of the replay buffer
        lr (float): The learning rate for the agent
        batch_size (int): The size of the batch used for training the agent
    """
    self.train_env = train_env
    self.test_env = test_env
    self.input_dim = self.train_env.observation_space.shape[0]
    self.output_dim = self.train_env.action_space.n
    self.gamma = gamma
    self.epsilon = epsilon
    self.epsilon_decay = epsilon_decay
    self.epsilon_min = epsilon_min
    self.buffer_size = buffer_size
    self.lr = lr
    self.batch_size = batch_size
    self.model = self.build_model()

  def train(self, episodes):
    """
    Trains the agent for a given number of episodes using the DQN algorithm

    Every episode is played with an epsilon-greedy policy while the transitions
    are stored in a replay buffer. After each episode one batch is sampled from
    the buffer and the model is fitted on it, one transition at a time, and the
    exploration rate is decayed.

    Args:
        episodes (int): The number of episodes to train the agent

    Returns:
        cdis (list): The CDIs for each episode
    """
    memory = deque(maxlen=self.buffer_size)
    local_epsilon = self.epsilon
    for e in range(episodes):
      episode_reward = 0
      state = self.train_env.reset()
      done = False
      while not done:
        # Epsilon-greedy policy (exploration and exploitation)
        if np.random.rand() < local_epsilon:
          action = self.train_env.action_space.sample()
        else:
          # The greedy action is chosen from the state the agent is in
          q_values = self.model.predict(np.array([state]), verbose=0)[0]
          action = np.argmax(q_values)
        next_state, reward, done, _ = self.train_env.step(action)
        memory.append((state, action, reward, next_state, done))
        state = next_state
        episode_reward += reward

      # Update the model using the replay buffer
      if len(memory) >= self.batch_size:
        batch_sample = random.sample(memory, self.batch_size)

        # Dedicated names so the episode variables above are not overwritten
        for replay_state, replay_action, replay_reward, replay_next_state, replay_done in batch_sample:
          target = replay_reward
          # Q-learning update rule (Bellman equation)
          # Q(s, a) = r + γ * max(Q(s', a'))
          if not replay_done:
            target += self.gamma * np.amax(self.model.predict(np.array([replay_next_state]), verbose=0)[0])
          # Start from the current estimates for s and replace only the entry
          # of the action that was taken
          target_f = self.model.predict(np.array([replay_state]), verbose=0)
          target_f[0][replay_action] = target
          self.model.fit(np.array([replay_state]), target_f, epochs=1, verbose=0)

      print(f"Episode: {e + 1}, Reward: {episode_reward}")
      local_epsilon = max(self.epsilon_min, self.epsilon_decay*local_epsilon)
    return self.train_env.cdis

  def evaluate(self, num_runs):
    """
    Evaluates the agent for a given number of runs using the trained model

    Actions are always the greedy ones; the model is not updated.

    Args:
        num_runs (int): The number of runs to evaluate the agent

    Returns:
        cdis (list): The CDIs for each run
    """
    for s in tqdm(range(num_runs), desc="Evaluating", unit="run"):
      episode_reward = 0
      state = self.test_env.reset()
      done = False
      while not done:
        q_values = self.model.predict(np.array([state]), verbose=0)[0]
        action = np.argmax(q_values)
        next_state, reward, done, _ = self.test_env.step(action)
        state = next_state
        episode_reward += reward
      print(f"Episode: {s + 1}, Reward: {episode_reward}")
    return self.test_env.cdis

  def build_model(self):
    """
    Builds the DQN model using a neural network with 3 hidden layers and ReLU activation functions

    Returns:
        model: The compiled model, with one linear output per action
    """
    model = keras.Sequential()
    model.add(keras.layers.Dense(24, input_dim=self.input_dim, activation='relu'))
    model.add(keras.layers.Dense(24, activation='relu'))
    model.add(keras.layers.Dense(24, activation='relu'))
    model.add(keras.layers.Dense(self.output_dim, activation='linear'))
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=self.lr), loss='mse')
    return model

  def save_model(self, name):
    """
    Saves the trained model to a file

    Args:
        name (str): The path of the file to write
    """
    self.model.save(name)

  def load_model(self, name):
    """
    Loads a trained model from a file

    Args:
        name (str): The path of the file to read
    """
    self.model = tf.keras.models.load_model(name)

  def plot_cdis(self, cdis):
    """
    Plots the distribution of CDIs

    Args:
        cdis (list): The CDIs to plot; entries equal to 0 are left out
    """

    # Get only the non-zero CDIs
    calculated_cdis = [cdi for cdi in cdis if cdi != 0]

    # Plot the distribution of CDIs
    plt.figure(figsize=(10, 6))
    plt.hist(calculated_cdis, bins=50, edgecolor='k', alpha=0.7)
    plt.axvline(x=80, color='r', linestyle='--', linewidth=2, label='DI = 80')
    plt.axvline(x=100, color='g', linestyle='--', linewidth=2, label='DI = 100')
    plt.axvline(x=120, color='b', linestyle='--', linewidth=2, label='DI = 120')
    plt.title('Distribution of CDIs')
    plt.xlabel('CDI Values')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# One environment per dataset; the agent receives both of them.
train_env = Matching(dataframes=dataframes_train)
test_env = Matching(dataframes=dataframes_test)
agent = Agent(train_env=train_env, test_env=test_env)

In [None]:
# Train, plot the CDIs collected by the training environment, then save the model.
episodes = 1000
cdis = agent.train(episodes=episodes)
agent.plot_cdis(cdis=cdis)
agent.save_model(name="../data/model.h5")

In [None]:
# Reload the saved model, run it on the test environment and plot the CDIs.
agent.load_model(name="../data/model.h5")
runs = 500
cdis = agent.evaluate(num_runs=runs)
agent.plot_cdis(cdis=cdis)