In [1]:
import sys
import subprocess
import pkg_resources
import os
import pandas as pd
sys.path.append(os.path.abspath('../scripts'))

from myFunctions import install_packages, save_table 
install_packages()


Installing required packages: ['numpy', 'pandas', 'scikit-learn', 'joblib', 'pyarrow', 'fastparquet', 'plotly', 'matplotlib', 'seaborn', 'MetaTrader5', 'tabulate', 'optuna', 'torch', 'tqdm', 'shap', 'kaleido', 'statsmodels', 'tqdm']
numpy is already installed.
pandas is already installed.
scikit-learn is already installed.
joblib is already installed.
pyarrow is already installed.
fastparquet is already installed.
plotly is already installed.
matplotlib is already installed.
seaborn is already installed.
MetaTrader5 is already installed.
tabulate is already installed.
optuna is already installed.
torch is already installed.
tqdm is already installed.
shap is already installed.
kaleido is already installed.
statsmodels is already installed.
tqdm is already installed.
All packages are verified.


In [2]:
def pivot_data(df, target=None):
    """
    Pivot a DataFrame by hour for each day, excluding columns containing the target keyword.

    Parameters:
    df (pd.DataFrame): Input DataFrame with a 'time' column containing datetime values.
    target (str, optional): Keyword to exclude columns containing this value from the pivot. Default is None.

    Returns:
    pd.DataFrame: Pivoted DataFrame with columns for each hour of the day and variables suffixed with '_hhmm'.

    Example:
    >>> data = {
    ...     'time': ['2022-06-02 09:00:00', '2022-06-02 09:15:00', '2022-06-02 09:30:00'],
    ...     'open_BGI$': [313.94, 313.17, 312.85],
    ...     'close_BGI$': [313.75, 312.73, 313.00],
    ... }
    >>> df = pd.DataFrame(data)
    >>> df['time'] = pd.to_datetime(df['time'])
    >>> print(pivoted)
    date       open_BGI$_0900  open_BGI$_0915  open_BGI$_0930  close_BGI$_0900  close_BGI$_0915  close_BGI$_0930
    2022-06-02        313.94          313.17          312.85           313.75          312.73          313.00
    """
    df['date'] = pd.to_datetime(df['time']).dt.date
    df['hour'] = pd.to_datetime(df['time']).dt.strftime('%H%M')

    columns_to_pivot = [col for col in df.columns if col not in ['time', 'date', 'hour']]
    pivoted = df.pivot(index='date', columns='hour', values=columns_to_pivot)
    pivoted.columns = [f"{col}_{hour}" for col, hour in pivoted.columns]

    pivoted.reset_index(inplace=True)
    return pivoted

In [3]:
input_target = os.path.join('..', 'data', 'target')
input_features = os.path.join('..', 'data', 'features')
output_dir = os.path.join('..', 'data', 'processed')


In [4]:
input_target

'..\\data\\target'

In [5]:
df_features = pd.read_parquet(os.path.join(input_features, 'features.parquet'))
D1_df_target = pd.read_parquet(os.path.join(input_target, 'D1_df_target.parquet'))
M15_df_target = pd.read_parquet(os.path.join(input_target, 'M15_df_target.parquet'))
D1_df = df_features.copy()
M15_df = df_features.copy()


In [6]:
df_features

Unnamed: 0,time,open_AGFS,high_AGFS,low_AGFS,close_AGFS,tick_volume_AGFS,spread_AGFS,real_volume_AGFS,open_BGI$,high_BGI$,...,EMA55_CCM$,EMA9_GOLD11,EMA21_GOLD11,EMA55_GOLD11,EMA9_IBOV,EMA21_IBOV,EMA55_IBOV,EMA9_DOL$,EMA21_DOL$,EMA55_DOL$
96,2022-06-02 09:00:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,320.52,321.21,...,122.110865,9.447520,9.432993,9.380451,111438.765770,111480.083554,111455.854537,5597.257017,5589.642700,5564.905769
97,2022-06-02 09:15:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.73,319.73,...,122.120834,9.448016,9.434539,9.382935,111423.012616,111469.166867,111452.431161,5593.956413,5588.834637,5565.471777
98,2022-06-02 09:30:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.73,319.73,...,122.120090,9.448413,9.435945,9.385330,111410.410093,111459.242606,111449.130048,5588.428931,5586.787761,5565.502035
99,2022-06-02 09:45:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.98,319.98,...,122.100087,9.448730,9.437222,9.387640,111400.328074,111450.220551,111445.946832,5584.584345,5585.189419,5565.634320
100,2022-06-02 10:00:00,1764.0,1778.0,1763.0,1777.0,9986.0,0.0,6702403.0,319.39,320.57,...,122.083655,9.446984,9.437475,9.389510,111577.062459,111526.018683,111475.877302,5583.240876,5584.523744,5566.071201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59647,2024-11-22 16:45:00,1627.0,1627.0,1626.0,1627.0,10782.0,0.0,5112900.0,356.77,357.34,...,72.310560,16.519841,16.493843,16.385959,128578.593827,128362.972960,127974.196554,5845.655706,5845.807123,5843.096395
59648,2024-11-22 17:00:00,1627.0,1627.0,1624.0,1624.0,12015.0,0.0,5752400.0,356.77,357.34,...,72.303754,16.517873,16.495312,16.390389,128564.675061,128376.248146,127993.296677,5846.037165,5845.966748,5843.255917
59649,2024-11-22 17:15:00,1623.0,1624.0,1619.0,1624.0,15587.0,0.0,22163000.0,356.77,357.34,...,72.297191,16.514298,16.495738,16.394303,128567.340049,128394.589223,128014.178938,5846.040732,5845.974771,5843.355884
59650,2024-11-22 17:30:00,1624.0,1631.0,1624.0,1630.0,20327.0,0.0,13137100.0,356.77,357.34,...,72.290863,16.509438,16.495216,16.397721,128628.072039,128437.899294,128044.779691,5845.340186,5845.662337,5843.326674


In [7]:
print(M15_df_target.shape)
print(M15_df_target.shape)
M15_df = M15_df.merge(
    M15_df_target[['time', 'close_price_target', 'open_price_target', 'behavior_target']],
    on='time',
    how='inner'
)
print(M15_df.shape)

(22356, 6)
(22356, 6)
(22356, 172)


In [8]:
M15_df.dropna(inplace=True)

In [9]:
M15_df

Unnamed: 0,time,open_AGFS,high_AGFS,low_AGFS,close_AGFS,tick_volume_AGFS,spread_AGFS,real_volume_AGFS,open_BGI$,high_BGI$,...,EMA55_GOLD11,EMA9_IBOV,EMA21_IBOV,EMA55_IBOV,EMA9_DOL$,EMA21_DOL$,EMA55_DOL$,close_price_target,open_price_target,behavior_target
0,2022-06-02 09:00:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,320.52,321.21,...,9.380451,111438.765770,111480.083554,111455.854537,5597.257017,5589.642700,5564.905769,319.29,319.73,0
1,2022-06-02 09:15:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.73,319.73,...,9.382935,111423.012616,111469.166867,111452.431161,5593.956413,5588.834637,5565.471777,319.29,319.73,0
2,2022-06-02 09:30:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.73,319.73,...,9.385330,111410.410093,111459.242606,111449.130048,5588.428931,5586.787761,5565.502035,319.34,319.98,1
3,2022-06-02 09:45:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.98,319.98,...,9.387640,111400.328074,111450.220551,111445.946832,5584.584345,5585.189419,5565.634320,320.13,319.39,1
4,2022-06-02 10:00:00,1764.0,1778.0,1763.0,1777.0,9986.0,0.0,6702403.0,319.39,320.57,...,9.389510,111577.062459,111526.018683,111475.877302,5583.240876,5584.523744,5566.071201,320.52,320.47,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22350,2024-11-22 16:30:00,1626.0,1628.0,1626.0,1627.0,11591.0,0.0,6756700.0,356.77,357.34,...,16.380994,128571.242283,128338.470256,127950.722352,5845.178883,5845.631535,5842.930965,357.34,356.77,0
22351,2024-11-22 16:45:00,1627.0,1627.0,1626.0,1627.0,10782.0,0.0,5112900.0,356.77,357.34,...,16.385959,128578.593827,128362.972960,127974.196554,5845.655706,5845.807123,5843.096395,357.34,356.77,0
22352,2024-11-22 17:00:00,1627.0,1627.0,1624.0,1624.0,12015.0,0.0,5752400.0,356.77,357.34,...,16.390389,128564.675061,128376.248146,127993.296677,5846.037165,5845.966748,5843.255917,357.34,356.77,0
22353,2024-11-22 17:15:00,1623.0,1624.0,1619.0,1624.0,15587.0,0.0,22163000.0,356.77,357.34,...,16.394303,128567.340049,128394.589223,128014.178938,5846.040732,5845.974771,5843.355884,357.34,356.77,0


In [10]:
D1_df =  pivot_data(D1_df.copy())


In [11]:
D1_df_target.reset_index(inplace=True)

In [12]:
D1_df = D1_df.merge(
    D1_df_target[['day', 'close_price_target', 'open_price_target', 'behavior_target']],
    left_on='date', right_on='day', how='inner'
)

In [13]:
D1_df.dropna(inplace=True)

In [14]:
save_table(D1_df.head(6), title = 'Exemplo do Target D1 para o fechamento, abertura e comportamento do mercado')
save_table(M15_df.head(6), title = 'Exemplo do Target M15 para o fechamento, abertura e comportamento do mercado')

Table saved as CSV: ..\results\tables\csv\Tabela_2_Exemplo do Target D1 para o fechamento, abertura e comportamento do mercado.csv
Table saved as CSV: ..\results\tables\csv\Tabela_3_Exemplo do Target M15 para o fechamento, abertura e comportamento do mercado.csv


In [15]:
os.makedirs(output_dir, exist_ok=True)
D1_df_path = os.path.join(output_dir, 'D1_df.parquet').replace("/", "\\")
M15_df_path = os.path.join(output_dir, 'M15_df.parquet').replace("/", "\\")

# Salvando os DataFrames
D1_df.to_parquet(D1_df_path)
M15_df.to_parquet(M15_df_path)

# Print para confirmar os caminhos
print(f"Data saved to: {D1_df_path}")
print(f"Data saved to: {M15_df_path}")

Data saved to: ..\data\processed\D1_df.parquet
Data saved to: ..\data\processed\M15_df.parquet


In [17]:
M15_df.head()

Unnamed: 0,time,open_AGFS,high_AGFS,low_AGFS,close_AGFS,tick_volume_AGFS,spread_AGFS,real_volume_AGFS,open_BGI$,high_BGI$,...,EMA55_GOLD11,EMA9_IBOV,EMA21_IBOV,EMA55_IBOV,EMA9_DOL$,EMA21_DOL$,EMA55_DOL$,close_price_target,open_price_target,behavior_target
0,2022-06-02 09:00:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,320.52,321.21,...,9.380451,111438.76577,111480.083554,111455.854537,5597.257017,5589.6427,5564.905769,319.29,319.73,0
1,2022-06-02 09:15:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.73,319.73,...,9.382935,111423.012616,111469.166867,111452.431161,5593.956413,5588.834637,5565.471777,319.29,319.73,0
2,2022-06-02 09:30:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.73,319.73,...,9.38533,111410.410093,111459.242606,111449.130048,5588.428931,5586.787761,5565.502035,319.34,319.98,1
3,2022-06-02 09:45:00,1767.0,1767.0,1763.0,1763.0,0.0,0.0,0.0,319.98,319.98,...,9.38764,111400.328074,111450.220551,111445.946832,5584.584345,5585.189419,5565.63432,320.13,319.39,1
4,2022-06-02 10:00:00,1764.0,1778.0,1763.0,1777.0,9986.0,0.0,6702403.0,319.39,320.57,...,9.38951,111577.062459,111526.018683,111475.877302,5583.240876,5584.523744,5566.071201,320.52,320.47,1
