# Tratamento dos dados

In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv

import numpy as np
import pandas as pd
import btgsolutions_dataservices as btg

# Pull variables from a local .env file into the process environment before
# the API key is read.
load_dotenv()

# Credential for BTG Solutions data services; None when the variable is unset.
BTG_API_KEY = os.environ.get("BTG_API_KEY")

# Shared bulk-data client used by the download helpers in this notebook.
bulk = btg.BulkData(api_key=BTG_API_KEY)

def _filter_session(df: pd.DataFrame, date: str, time_col: str) -> pd.DataFrame:
    df = df.copy()
    df[time_col] = pd.to_datetime(df[time_col], utc=True)
    start = pd.to_datetime(f"{date} 13:00:00").tz_localize("UTC")
    end = pd.to_datetime(f"{date} 20:50:00").tz_localize("UTC")
    return df[df[time_col].between(start, end)]


def get_trade_data(ticker: str, year: str, month: str, day: str) -> pd.DataFrame:
    """Download one day of trades for *ticker*, restricted to the session window.

    Args:
        ticker: Instrument symbol passed to the bulk-data client.
        year: Four-digit year as a string.
        month: Two-digit month as a string.
        day: Two-digit day as a string.

    Returns:
        A frame with the columns ``symbol``, ``rpt_seq``,
        ``network_received_time`` (UTC), ``md_entry_px``, ``md_entry_size``
        and ``aggressor``, with repeated deliveries of the same message
        removed.
    """
    date = f"{year}-{month}-{day}"
    cols = ["symbol", "rpt_seq", "network_received_time", "md_entry_px",
            "md_entry_size", "aggressor"]
    df = bulk.get_data(ticker, date, data_type="trades")[cols]
    df = _filter_session(df, date, "network_received_time")
    # De-duplicate per message, not per (price, size, side): without rpt_seq in
    # the key every later genuine trade that happens to repeat an earlier
    # price/size/aggressor combination would be thrown away.
    df = df.drop_duplicates(
        subset=["rpt_seq", "md_entry_px", "md_entry_size", "aggressor"]
    )
    return df


def get_book_data(ticker: str, year: str, month: str, day: str) -> pd.DataFrame:
    """Download one day of top-of-book updates for *ticker*.

    Only ``INCREMENTAL`` messages inside the session window are kept, rows
    whose best bid or best ask equals 0 are discarded, and the result is
    reduced to the rows where the best bid/ask pair actually changed.

    Args:
        ticker: Instrument symbol passed to the bulk-data client.
        year: Four-digit year as a string.
        month: Two-digit month as a string.
        day: Two-digit day as a string.

    Returns:
        A frame with the columns ``symbol``, ``rpt_seq``,
        ``network_received_time`` (UTC), ``en_b_px_1`` and ``en_s_px_1``,
        ordered by time and sequence number.
    """
    date = f"{year}-{month}-{day}"
    cols = ["symbol", "rpt_seq", "network_received_time", "msg_type", "en_b_px_1", "en_s_px_1"]
    df = bulk.get_data(ticker, date, data_type="books")[cols]
    df = df[df["msg_type"] == "INCREMENTAL"]
    df = _filter_session(df, date, "network_received_time")
    df = df[(df["en_b_px_1"] != 0) & (df["en_s_px_1"] != 0)]

    # Consecutive comparison below needs a well-defined chronological order.
    df = df.sort_values(["network_received_time", "rpt_seq"], kind="stable")

    # Drop only *consecutive* repeats of the same quote. A global
    # drop_duplicates would also remove a later return to a previously seen
    # bid/ask pair, leaving downstream forward-fills with a stale quote.
    quotes = df[["en_b_px_1", "en_s_px_1"]]
    previous = quotes.shift()
    unchanged = (quotes.eq(previous) | (quotes.isna() & previous.isna())).all(axis=1)
    df = df[~unchanged]

    return df.drop(columns=["msg_type"])


def merge_book_trade(book_df: pd.DataFrame, trade_df: pd.DataFrame) -> pd.DataFrame:
    """Combine book updates and trades into one chronological event table.

    The two frames are outer-joined on ``rpt_seq``; ``symbol`` and
    ``network_received_time`` are taken from the trade side when present and
    from the book side otherwise. Rows are sorted by time then ``rpt_seq``,
    tagged with an ``event_type`` ("trade", "book" or "unknown"), and every
    row before the first one carrying a book quote is dropped so that the
    forward-filled best bid/ask columns never start empty.

    Args:
        book_df: Book updates with ``symbol``, ``rpt_seq``,
            ``network_received_time`` and the ``en_b_px_1`` / ``en_s_px_1``
            quote columns.
        trade_df: Trades with ``symbol``, ``rpt_seq``,
            ``network_received_time`` and the ``md_entry_*`` / ``aggressor``
            columns.

    Returns:
        The merged frame with the known columns first (in a fixed order)
        followed by any extra columns.
    """
    trades = trade_df.rename(columns={"symbol": "symbol_trade",
                                      "network_received_time": "network_received_time_trade"})
    book = book_df.rename(columns={"symbol": "symbol_book",
                                   "network_received_time": "network_received_time_book"})

    df = pd.merge(trades, book, on="rpt_seq", how="outer")

    # Collapse the per-side columns, preferring the trade side.
    df["symbol"] = df.pop("symbol_trade").combine_first(df.pop("symbol_book"))
    df["network_received_time"] = df.pop("network_received_time_trade").combine_first(
        df.pop("network_received_time_book")
    )
    df["network_received_time"] = pd.to_datetime(df["network_received_time"], utc=True)

    df = df.sort_values(["network_received_time", "rpt_seq"]).reset_index(drop=True)

    quote_cols = [c for c in ("en_b_px_1", "en_s_px_1") if c in df.columns]
    if "md_entry_px" in df.columns:
        has_trade = df["md_entry_px"].notna().to_numpy()
    else:
        has_trade = np.zeros(len(df), dtype=bool)
    if quote_cols:
        has_book = df[quote_cols].notna().any(axis=1).to_numpy()
    else:
        has_book = np.zeros(len(df), dtype=bool)

    # A row that is both a trade and a book update is labelled "trade".
    df["event_type"] = np.select(
        [has_trade, has_book],
        ["trade", "book"],
        default="unknown"
    )

    # Trim leading rows that have no quote to forward-fill from. The test is
    # on quote presence rather than on the "book" label, so a trade row that
    # already carries its own bid/ask is not discarded.
    if has_book.any():
        first_quote_pos = int(np.argmax(has_book))
        df = df.iloc[first_quote_pos:].reset_index(drop=True)

    for col in quote_cols:
        df[col] = df[col].ffill()

    numeric_cols = [
        "md_entry_px", "md_entry_size", "aggressor",
        "en_b_px_1", "en_s_px_1"
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    preferred_order = [
        "symbol",
        "rpt_seq",
        "network_received_time",
        "event_type",
        "md_entry_px",
        "md_entry_size",
        "aggressor",
        "en_s_px_1",
        "en_b_px_1",
    ]
    ordered_cols = [c for c in preferred_order if c in df.columns]
    remaining = [c for c in df.columns if c not in ordered_cols]

    return df[ordered_cols + remaining]


def save_df(df: pd.DataFrame, ticker: str, year: str, month: str, day: str, out_dir: str = "train_data", index: "int | None" = None) -> None:
    """Write *df* to ``<out_dir>/[<index>_]<ticker>_<day><month><year>.csv``.

    Args:
        df: Frame to persist; written without its index column.
        ticker: Instrument symbol used in the file name.
        year: Year component of the file name.
        month: Month component of the file name.
        day: Day component of the file name.
        out_dir: Destination directory; created (including missing parents)
            when it does not exist yet.
        index: Optional numeric prefix for the file name.
    """
    target_dir = Path(out_dir)
    # parents=True so a nested destination such as "data/train" also works.
    target_dir.mkdir(parents=True, exist_ok=True)

    prefix = f"{index}_" if index is not None else ""
    out = target_dir / f"{prefix}{ticker}_{day}{month}{year}.csv"

    df.to_csv(out, index=False)
    print(f"✔ salvo {out} ({len(df)} linhas)")


def prepare_data(ticker: str, year: str, month: str, day: str, out_dir: str, index: "int | None" = None) -> bool:
    """Download, merge and save one ticker/day; report whether it worked.

    This is a best-effort wrapper: any exception raised by the download,
    merge or save step is printed and swallowed so that a batch loop can
    carry on with the next ticker/day.

    Args:
        ticker: Instrument symbol.
        year: Four-digit year as a string.
        month: Two-digit month as a string.
        day: Two-digit day as a string.
        out_dir: Directory the CSV is written to.
        index: Optional numeric prefix for the output file name.

    Returns:
        ``True`` when the CSV was written, ``False`` when any step failed.
        (Earlier versions returned ``None`` in both cases, which made
        failures invisible to callers.)
    """
    try:
        book_df = get_book_data(ticker, year, month, day)
        trade_df = get_trade_data(ticker, year, month, day)
        df = merge_book_trade(book_df, trade_df)
        save_df(df, ticker, year, month, day, out_dir, index)
    except Exception as e:
        print(f"Erro {ticker} {year}-{month}-{day}: {e}")
        return False
    return True


Get training data for PETR4

In [None]:
ticker = "PETR4"
year = "2025"
month = "03"

# To process a single day, call prepare_data directly with a two-digit day
# string, e.g. "05", and out_dir="train_data".

# Try every day number 1..31; days without data are reported by prepare_data
# and skipped.
for day in range(1, 32):
    print(f"Processing {ticker} for {year}-{month}-{day}")
    try:
        prepare_data(ticker, year, month, f"{day:02d}", out_dir="train_data")
    except Exception as e:
        print(f"Error processing {ticker} on {year}-{month}-{day}: {e}")

Get test data

In [None]:
import requests
import numpy as np

# Endpoint that get_auth_token() POSTs to in order to obtain an access token.
auth_url = "https://dataservices.btgpactualsolutions.com/api/v2/authenticate"
auth_headers = {
    "Content-Type": "application/json",
}
# JSON body sent with the authentication request.
# NOTE(review): "client_id" is the literal value "string" -- looks like a
# placeholder; confirm what the API expects here.
auth_body = {
    "api_key": BTG_API_KEY,
    "client_id": "string"
}

# Base URL for the top-N liquidity endpoint; the requested N is appended to
# the trailing "?n=" by get_top_n_liquidiyty_tickers().
liquidity_url = "https://dataservices.btgpactualsolutions.com/api/v1/marketdata/analytics/top-n-liquidity?n="
# Header template; the "Authorization" value is filled in with the real token
# by get_top_n_liquidiyty_tickers().
liquidity_headers = {
    "Accept": "application/json",
    "Authorization": "Bearer "
}

def get_auth_token():
    """Authenticate against the data-services API and return the access token.

    Returns:
        The value of the ``AccessToken`` field of the JSON response.

    Raises:
        RuntimeError: If the server answers with a non-200 status, or if the
            response carries no ``AccessToken``.
        requests.exceptions.RequestException: On connection problems or when
            the request exceeds the timeout.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.post(auth_url, json=auth_body, headers=auth_headers, timeout=30)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to authenticate: {response.status_code} {response.text}")

    token = response.json().get("AccessToken")
    if not token:
        # Fail here rather than sending "Bearer None" on the next request.
        raise RuntimeError("Failed to authenticate: response contained no AccessToken")
    return token
    
def get_top_n_liquidiyty_tickers(token, n=15):
    """Fetch the top-*n* liquidity ranking from the data-services API.

    Args:
        token: Bearer token, as returned by the authentication endpoint.
        n: Number of instruments to request.

    Returns:
        The ``result`` field of the JSON response (``None`` if absent).

    Raises:
        RuntimeError: If the server answers with a non-200 status.
        requests.exceptions.RequestException: On connection problems or when
            the request exceeds the timeout.
    """
    url = liquidity_url + str(n)
    # Build per-call headers instead of writing the token into the shared
    # module-level dict, so the template is never left holding a credential.
    headers = {**liquidity_headers, "Authorization": f"Bearer {token}"}
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch liquidity data: {response.status_code} {response.text}")
    return response.json().get("result")


# Correctly spelled alias; the original (misspelled) name is kept for callers.
get_top_n_liquidity_tickers = get_top_n_liquidiyty_tickers

def filter_top_liquidity_tickers(data):
    """Pick up to 15 ticker symbols spread across a liquidity ranking.

    With 15 or fewer entries every ticker is returned. Otherwise the result
    is the first five entries, five entries drawn at random (without
    replacement, via ``np.random``) from the middle of the list, and the last
    five entries, in that order.

    Args:
        data: Sequence of mappings, each with a ``'ticker'`` key.

    Returns:
        A list of ticker symbols. Both branches return symbols; previously
        the sampled branch returned the raw entries instead.
    """
    if len(data) <= 15:
        return [item['ticker'] for item in data]

    middle = data[5:-5]
    # Sample positions rather than the entries themselves so NumPy never has
    # to coerce arbitrary records into an array.
    picked_positions = np.random.choice(len(middle), 5, replace=False)

    selected = list(data[:5])
    selected.extend(middle[int(pos)] for pos in picked_positions)
    selected.extend(data[-5:])
    return [item['ticker'] for item in selected]

In [None]:
# Authenticate once, then request the 200 most liquid instruments.
token = get_auth_token()
all_tickers = get_top_n_liquidiyty_tickers(token, 200)
# Bare expression: the notebook displays the value as the cell output.
all_tickers

In [None]:

# Fixed set of 15 symbols used to build the test data set below.
# NOTE(review): presumably picked from the liquidity ranking fetched above --
# confirm how this list was chosen.
tickers = [
 'VALE3',
 'PETR4',
 'ITUB4',
 'BBAS3',
 'BBDC4',
 'ETHE11',
 'RECV3',
 'BIAU39',
 'ENEV3',
 'MDIA3',
 'CMIG4',
 'AMBP3',
 'TUPY3',
 'XPML11',
 'P2LT34',
]
 

In [None]:
year = "2025"
month = "04"

# Counter lives outside the day loop so it is always defined and reflects
# every day processed, not just the last one.
count = 0
for day in range(7, 8):
    day_str = str(day).zfill(2)
    for index, ticker in enumerate(tickers, start=1):
        print(f"Processing {ticker} for {year}-{month}-{day}")
        try:
            # Run the pipeline steps directly: prepare_data() swallows its own
            # exceptions, so wrapping it in try/except would count failed
            # tickers as successes.
            book_df = get_book_data(ticker, year, month, day_str)
            trade_df = get_trade_data(ticker, year, month, day_str)
            merged_df = merge_book_trade(book_df, trade_df)
            save_df(merged_df, ticker, year, month, day_str, "test_data", index)
        except Exception as e:
            print(f"Error processing {ticker} on {year}-{month}-{day}: {e}")
        else:
            count += 1

print(f"{count} deram certo")
