In [4]:
import sys
import subprocess
import pkg_resources
import os
import pandas as pd
# Make the sibling ``scripts`` folder importable. The relative path is resolved
# against the current working directory, so the notebook has to be started from
# its own folder for this to point at the right place.
sys.path.append(os.path.abspath('../scripts'))

# Project-local helpers (presumably defined in ../scripts/myFunctions.py -- TODO confirm).
from myFunctions import install_packages, save_table 
# Judging by the recorded output, this verifies that the listed third-party
# packages are present (and presumably installs the missing ones -- TODO confirm).
install_packages()


Installing required packages: ['numpy', 'pandas', 'scikit-learn', 'joblib', 'pyarrow', 'fastparquet', 'plotly', 'matplotlib', 'MetaTrader5', 'tabulate']
numpy is already installed.
pandas is already installed.
scikit-learn is already installed.
joblib is already installed.
pyarrow is already installed.
fastparquet is already installed.
plotly is already installed.
matplotlib is already installed.
MetaTrader5 is already installed.
tabulate is already installed.
All packages are verified.


In [29]:
# Project-relative data folders. os.path.join already emits the separator of
# the running platform, so no manual "/" -> "\" rewriting is needed: on Windows
# the result is unchanged, and on POSIX the paths stay valid.
input_target = os.path.join('..', 'data', 'target')
input_features = os.path.join('..', 'data', 'features')
output_dir = os.path.join('..', 'data', 'processed_data')


In [30]:
# Read the feature table and the two target tables (daily and per-timestamp)
# from their parquet files, in that order.
df_features, df_daily_target, df_timestamp_target = (
    pd.read_parquet(os.path.join(folder, filename))
    for folder, filename in (
        (input_features, 'features.parquet'),
        (input_target, 'daily_target.parquet'),
        (input_target, 'timestamp_target.parquet'),
    )
)
# Two independent working copies of the features: one for the daily dataset,
# one for the per-timestamp dataset.
df_daily, df_timestamp = df_features.copy(), df_features.copy()


In [31]:
# Shapes of both inputs before the join, one per line.
print(df_timestamp.shape, df_timestamp_target.shape, sep='\n')
# Attach the three target columns to the features, matching rows on 'time'.
df_timestamp = df_timestamp.merge(
    df_timestamp_target.loc[
        :, ['time', 'close_price_target', 'open_price_target', 'behavior_target']
    ],
    how='inner',
    on='time',
)
# Shape after the join, to compare against the inputs above.
print(df_timestamp.shape)

(22356, 169)
(22356, 6)
(22356, 172)


In [32]:
# Count the missing values in each column of the per-timestamp target table.
df_timestamp_target.isna().sum()

time                  0
open_BGI$             0
close_BGI$            0
close_price_target    1
open_price_target     1
behavior_target       0
dtype: int64

In [33]:
# Keep only the rows that have no missing value in any column.
df_timestamp = df_timestamp.dropna()

In [34]:
# Preview the first rows of the daily working copy; at this point it is still
# an unmodified copy of the feature table (one row per timestamp).
df_daily.head()

Unnamed: 0,time,open_AGFS,high_AGFS,low_AGFS,close_AGFS,tick_volume_AGFS,spread_AGFS,real_volume_AGFS,open_BGI$,high_BGI$,...,EMA55_DOL$,EMA9_IVVB11,EMA21_IVVB11,EMA55_IVVB11,EMA9_IBOV,EMA21_IBOV,EMA55_IBOV,EMA9_CCM$,EMA21_CCM$,EMA55_CCM$
96,2022-06-02 09:00:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,320.52,321.21,...,5564.905769,216.192547,215.888976,215.465756,111438.76577,111480.083554,111455.854537,121.403192,121.543033,122.110865
97,2022-06-02 09:15:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.73,319.73,...,5565.471777,216.204037,215.921797,215.493764,111423.012616,111469.166867,111452.431161,121.600554,121.62003,122.120834
98,2022-06-02 09:30:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.73,319.73,...,5565.502035,216.21323,215.951633,215.520773,111410.410093,111459.242606,111449.130048,121.700443,121.663663,122.12009
99,2022-06-02 09:45:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.98,319.98,...,5565.63432,216.220584,215.978758,215.546817,111400.328074,111450.220551,111445.946832,121.672354,121.654239,122.100087
100,2022-06-02 10:00:00,1764.0,1778.0,1763.0,1777.0,9986.0,0.0,6702403.0,319.39,320.57,...,5566.071201,215.994467,215.897961,215.530502,111577.062459,111526.018683,111475.877302,121.665883,121.652945,122.083655


In [35]:
def pivot_data(df, target=None):
    """
    Pivot intraday rows into one wide row per calendar day.

    Every value column is spread across the distinct times of day found in
    ``df['time']``, giving columns named ``<column>_<HHMM>``. The input
    DataFrame is not modified.

    Parameters:
    df (pd.DataFrame): Input DataFrame with a 'time' column containing datetime
        values (or values that ``pd.to_datetime`` can parse). Each combination
        of date and time of day must occur at most once.
    target (str, optional): Keyword; columns whose name contains it are left out
        of the pivot. Default is None, which pivots every column.

    Returns:
    pd.DataFrame: Pivoted DataFrame with a 'date' column followed by one column
        per (variable, time of day) pair, suffixed with '_hhmm'.

    Raises:
    ValueError: Propagated from ``DataFrame.pivot`` when two rows share the same
        date and time of day.

    Example:
    >>> data = {
    ...     'time': ['2022-06-02 09:00:00', '2022-06-02 09:15:00', '2022-06-02 09:30:00'],
    ...     'open_BGI$': [313.94, 313.17, 312.85],
    ...     'close_BGI$': [313.75, 312.73, 313.00],
    ... }
    >>> df = pd.DataFrame(data)
    >>> df['time'] = pd.to_datetime(df['time'])
    >>> pivoted = pivot_data(df)
    >>> print(pivoted)
    date       open_BGI$_0900  open_BGI$_0915  open_BGI$_0930  close_BGI$_0900  close_BGI$_0915  close_BGI$_0930
    2022-06-02        313.94          313.17          312.85           313.75          312.73          313.00
    """
    timestamps = pd.to_datetime(df['time'])
    # assign() returns a new frame, so the helper columns never leak into the
    # caller's DataFrame.
    work = df.assign(
        date=timestamps.dt.date,
        hour=timestamps.dt.strftime('%H%M'),
    )

    # Helper/key columns are never pivoted; columns matching the optional
    # `target` keyword are skipped as documented.
    columns_to_pivot = [
        col for col in work.columns
        if col not in ('time', 'date', 'hour')
        and (not target or target not in str(col))
    ]
    pivoted = work.pivot(index='date', columns='hour', values=columns_to_pivot)
    # Flatten the (variable, HHMM) MultiIndex into "variable_HHMM" names.
    pivoted.columns = [f"{col}_{hour}" for col, hour in pivoted.columns]

    return pivoted.reset_index()


In [36]:
# List the column names of the daily frame before it is pivoted.
df_daily.columns

Index(['time', 'open_AGFS', 'high_AGFS', 'low_AGFS', 'close_AGFS',
       'tick_volume_AGFS', 'spread_AGFS', 'real_volume_AGFS', 'open_BGI$',
       'high_BGI$',
       ...
       'EMA55_DOL$', 'EMA9_IVVB11', 'EMA21_IVVB11', 'EMA55_IVVB11',
       'EMA9_IBOV', 'EMA21_IBOV', 'EMA55_IBOV', 'EMA9_CCM$', 'EMA21_CCM$',
       'EMA55_CCM$'],
      dtype='object', length=169)

In [37]:
# Pivot from the untouched feature table rather than from df_daily itself.
# Up to this cell df_daily is just a copy of df_features, so the result is the
# same, but the cell can now be re-run without failing on a missing 'time' column.
df_daily = pivot_data(df_features.copy())


In [38]:
# List the column names after pivoting: 'date' plus one "<variable>_<HHMM>"
# column per variable and time of day.
df_daily.columns

Index(['date', 'open_AGFS_0900', 'open_AGFS_0915', 'open_AGFS_0930',
       'open_AGFS_0945', 'open_AGFS_1000', 'open_AGFS_1015', 'open_AGFS_1030',
       'open_AGFS_1045', 'open_AGFS_1100',
       ...
       'EMA55_CCM$_1530', 'EMA55_CCM$_1545', 'EMA55_CCM$_1600',
       'EMA55_CCM$_1615', 'EMA55_CCM$_1630', 'EMA55_CCM$_1645',
       'EMA55_CCM$_1700', 'EMA55_CCM$_1715', 'EMA55_CCM$_1730',
       'EMA55_CCM$_1745'],
      dtype='object', length=6049)

In [39]:
# Move the current index into a regular column (the later merge uses 'day'
# as its right-hand key).
df_daily_target = df_daily_target.reset_index()

In [40]:
# Inspect the daily target columns after the index reset; per the recorded
# output, 'day' (presumably the former index) is now an ordinary column.
df_daily_target.columns

Index(['day', 'open_BGI$', 'close_BGI$', 'close_price_target',
       'open_price_target', 'behavior_target'],
      dtype='object')

In [41]:
# Attach the daily targets to the pivoted features, matching the pivot's
# 'date' column against the target table's 'day' column.
df_daily = df_daily.merge(
    df_daily_target.loc[
        :, ['day', 'close_price_target', 'open_price_target', 'behavior_target']
    ],
    how='inner',
    left_on='date',
    right_on='day',
)

In [None]:
# Keep only the days that have no missing value in any column.
df_daily = df_daily.dropna()

Unnamed: 0,date,open_AGFS_0900,open_AGFS_0915,open_AGFS_0930,open_AGFS_0945,open_AGFS_1000,open_AGFS_1015,open_AGFS_1030,open_AGFS_1045,open_AGFS_1100,...,EMA55_CCM$_1630,EMA55_CCM$_1645,EMA55_CCM$_1700,EMA55_CCM$_1715,EMA55_CCM$_1730,EMA55_CCM$_1745,day,close_price_target,open_price_target,behavior_target
0,2022-06-02,1767.0,1767.0,1767.0,1767.0,1764.0,1776.0,1778.0,1778.0,1766.0,...,121.396625,121.369960,121.344247,121.319452,121.295543,121.272488,2022-06-02,332.23,325.07,1
1,2022-06-03,1778.0,1778.0,1778.0,1778.0,1779.0,1766.0,1767.0,1762.0,1768.0,...,120.530506,120.508703,120.487678,120.467403,120.447853,120.429001,2022-06-03,331.88,331.88,0
2,2022-06-06,1753.0,1753.0,1753.0,1753.0,1752.0,1752.0,1753.0,1752.0,1750.0,...,120.854125,120.851834,120.849626,120.847497,120.845443,120.843463,2022-06-06,330.89,331.49,0
3,2022-06-07,1736.0,1736.0,1736.0,1736.0,1735.0,1723.0,1720.0,1720.0,1716.0,...,122.418111,122.503892,122.586611,122.666375,122.743290,122.817458,2022-06-07,330.80,331.88,0
4,2022-06-08,1712.0,1712.0,1712.0,1712.0,1713.0,1704.0,1707.0,1715.0,1716.0,...,124.106374,124.088289,124.070850,124.054034,124.037818,124.022182,2022-06-08,332.82,331.34,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616,2024-11-14,1618.0,1618.0,1618.0,1618.0,1618.0,1628.0,1616.0,1620.0,1624.0,...,74.525913,74.526059,74.526200,74.526336,74.526466,74.526593,2024-11-14,351.26,346.77,1
617,2024-11-18,1617.0,1617.0,1617.0,1617.0,1615.0,1609.0,1613.0,1609.0,1613.0,...,74.314056,74.293196,74.273082,74.253687,74.234983,74.216948,2024-11-18,347.02,351.82,0
618,2024-11-19,1601.0,1601.0,1601.0,1601.0,1601.0,1603.0,1600.0,1603.0,1606.0,...,73.356817,73.343359,73.330382,73.317869,73.305802,73.294166,2024-11-19,350.96,350.09,1
619,2024-11-21,1604.0,1604.0,1604.0,1604.0,1603.0,1594.0,1593.0,1596.0,1594.0,...,72.771635,72.757291,72.743459,72.730122,72.717260,72.704858,2024-11-21,357.34,352.74,1


In [43]:
# Export the first six rows of each dataset as example tables
# (daily first, then per-timestamp).
save_table(
    df_daily.head(6),
    title='Exemplo do Target diário para o fechamento, abertura e comportamento do mercado',
)
save_table(
    df_timestamp.head(6),
    title='Exemplo do Target timestamp para o fechamento, abertura e comportamento do mercado',
)

Table saved as CSV: ..\results\tables\csv\Tabela_2_Exemplo do Target diário para o fechamento, abertura e comportamento do mercado.csv
Table saved as CSV: ..\results\tables\csv\Tabela_3_Exemplo do Target timestamp para o fechamento, abertura e comportamento do mercado.csv


In [45]:
# Make sure the destination folder exists before writing.
os.makedirs(output_dir, exist_ok=True)
# os.path.join already uses the platform's separator; forcing backslashes would
# produce invalid paths outside Windows and changes nothing on Windows.
df_daily_path = os.path.join(output_dir, 'df_daily.parquet')
df_timestamp_path = os.path.join(output_dir, 'df_timestamp.parquet')

# Save both processed DataFrames
df_daily.to_parquet(df_daily_path)
df_timestamp.to_parquet(df_timestamp_path)

# Print the destination paths as confirmation
print(f"Data saved to: {df_daily_path}")
print(f"Data saved to: {df_timestamp_path}")

Data saved to: ..\data\processed_data\df_daily.parquet
Data saved to: ..\data\processed_data\df_timestamp.parquet
