In [2]:
import sys
import subprocess
import pkg_resources

def install_packages(required_packages=None):
    """
    Make sure the notebook's third-party dependencies are installed.

    Each requested distribution is looked up through ``importlib.metadata``;
    any distribution that cannot be found is installed with ``pip`` using the
    interpreter that is running this kernel (``sys.executable``), so the
    package lands in the kernel's own environment.

    Parameters:
    required_packages (iterable of str, optional): Distribution names to
        verify. When omitted, the notebook's default dependency list is used.

    Returns:
    None. Progress is reported with ``print``.

    Raises:
    subprocess.CalledProcessError: If ``pip install`` exits with a non-zero
        status for one of the missing packages.

    Notes:
    This function does not use ``pkg_resources`` (deprecated by setuptools);
    the module-level ``import pkg_resources`` is not needed by it.
    """
    # Function-scope imports keep this cell runnable without touching the
    # notebook's top-level import list.
    import importlib
    from importlib import metadata

    if required_packages is None:
        required_packages = [
            "numpy",
            "pandas",
            "scikit-learn",
            "joblib",
            "pyarrow",
            "fastparquet",
            "plotly",
            "matplotlib"
        ]

    installed_something = False

    for package in required_packages:
        try:
            # The metadata lookup normalises the distribution name, so
            # spelling variants in case or '-' / '_' resolve to the same
            # installed distribution.
            metadata.version(package)
        except metadata.PackageNotFoundError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            installed_something = True
        else:
            print(f"{package} is already installed.")

    if installed_something:
        # Packages installed while the kernel is running may be invisible to
        # the import system until its finder caches are refreshed.
        importlib.invalidate_caches()

    print("All packages are verified")

# Verify (and, if needed, install) the dependencies before importing them.
install_packages()

# Standard library
import os
import warnings

# Third-party
import numpy as np
import pandas as pd

# Notebook-wide settings: silence FutureWarning messages and lift the limit
# on how many DataFrame columns are displayed.
warnings.simplefilter('ignore', FutureWarning)
pd.set_option('display.max_columns', None)


numpy is already installed.
pandas is already installed.
scikit-learn is already installed.
joblib is already installed.
pyarrow is already installed.
fastparquet is already installed.
plotly is already installed.
matplotlib is already installed.
All packages are verified


In [4]:
# Relative locations used by this notebook: features are read from
# `input_dir`, and results are written under `output_dir`.
# Both values end with a separator because later cells append the file name
# directly to them.
input_dir = '..//data//features//'
output_dir = '..//data//target//'

In [5]:
# Load the feature table produced upstream and report its (rows, columns)
# shape. The printed label 'df carregado' is Portuguese for "df loaded".
df = pd.read_parquet(f'{input_dir}features.parquet')
print('df carregado', df.shape)


df carregado (22356, 85)


In [11]:
# List the available columns; generate_targets looks for `open_<asset>` and
# `close_<asset>` among them.
df.columns

Index(['time', 'open_BGI$', 'high_BGI$', 'low_BGI$', 'close_BGI$',
       'tick_volume_BGI$', 'spread_BGI$', 'real_volume_BGI$', 'open_CCM$',
       'high_CCM$', 'low_CCM$', 'close_CCM$', 'tick_volume_CCM$',
       'spread_CCM$', 'real_volume_CCM$', 'open_GOLD11', 'high_GOLD11',
       'low_GOLD11', 'close_GOLD11', 'tick_volume_GOLD11', 'spread_GOLD11',
       'real_volume_GOLD11', 'open_IBOV', 'high_IBOV', 'low_IBOV',
       'close_IBOV', 'tick_volume_IBOV', 'spread_IBOV', 'real_volume_IBOV',
       'open_ICF$', 'high_ICF$', 'low_ICF$', 'close_ICF$', 'tick_volume_ICF$',
       'spread_ICF$', 'real_volume_ICF$', 'open_IVVB11', 'high_IVVB11',
       'low_IVVB11', 'close_IVVB11', 'tick_volume_IVVB11', 'spread_IVVB11',
       'real_volume_IVVB11', 'OBV_ICF$', 'OBV_IVVB11', 'OBV_CCM$',
       'OBV_GOLD11', 'OBV_BGI$', 'OBV_IBOV', 'RSI_ICF$', 'RSI_IVVB11',
       'RSI_CCM$', 'RSI_GOLD11', 'RSI_BGI$', 'RSI_IBOV', 'ATR_ICF$',
       'ATR_IVVB11', 'ATR_CCM$', 'ATR_GOLD11', 'ATR_BGI$', 'ATR_IBO

In [21]:
def generate_targets(df, asset):
    """
    Generate next-day targets from the opening and closing prices of one asset.

    The intraday rows are ordered by timestamp and grouped by calendar day.
    For each day the first available (non-null) opening price and the last
    available (non-null) closing price are kept; the targets are the following
    day's values of those two columns.

    The input DataFrame is not modified: the helper ``day`` column is built on
    an internal copy that holds only the two price columns.

    Parameters:
    df (pd.DataFrame): DataFrame containing asset data. Must have a ``time``
        column convertible by ``pd.to_datetime`` plus ``open_<asset>`` and
        ``close_<asset>`` columns. Rows may be in any order.
    asset (str): Name of the asset for which to calculate the targets.

    Returns:
    pd.DataFrame: Indexed by ``day`` (``datetime.date``), with columns
        ``open_<asset>``, ``close_<asset>``, ``close_price_target`` (next day's
        close), ``open_price_target`` (next day's open) and ``behavior_target``
        (1 when the next day's close is strictly greater than this day's
        close, otherwise 0).

    Raises:
    KeyError: If the open/close columns for ``asset`` are missing from ``df``.

    Notes:
    The last day has no following day, so its two price targets are NaN and
    its ``behavior_target`` is 0 (a comparison against NaN is False). Drop
    that row before using the targets as labels.

    Example:
    Given the following data for asset 'BGI$':

    | time                | open_BGI$ | close_BGI$ |
    |---------------------|-----------|------------|
    | 2024-12-01 09:00:00 | 323.57    | 322.07     |
    | 2024-12-01 17:45:00 | 313.94    | 288.87     |
    | 2024-12-02 09:00:00 | 318.39    | 287.36     |

    Calling `generate_targets(df, asset='BGI$')` will return:

    | day        | open_BGI$ | close_BGI$ | close_price_target | open_price_target | behavior_target |
    |------------|-----------|------------|--------------------|-------------------|-----------------|
    | 2024-12-01 | 323.57    | 288.87     | 287.36             | 318.39            | 0               |
    | 2024-12-02 | 318.39    | 287.36     | NaN                | NaN               | 0               |
    """
    close_col = f'close_{asset}'
    open_col = f'open_{asset}'

    if close_col not in df.columns or open_col not in df.columns:
        raise KeyError(f'Columns for {asset} ({open_col}, {close_col}) not found in the DataFrame')

    timestamps = pd.to_datetime(df['time'])

    # Work on a two-column copy so the caller's frame is left untouched, and
    # order it chronologically so "first"/"last" really mean the day's open
    # and close. The stable sort keeps the original order of equal timestamps.
    prices = (
        df[[open_col, close_col]]
        .assign(day=timestamps.dt.date, _timestamp=timestamps)
        .sort_values('_timestamp', kind='stable')
    )

    per_day = prices.groupby('day')
    target_df = pd.concat(
        [per_day[open_col].first(), per_day[close_col].last()],
        axis=1,
    )

    # shift(-1) pulls the next day's value onto the current row.
    target_df['close_price_target'] = target_df[close_col].shift(-1)
    target_df['open_price_target'] = target_df[open_col].shift(-1)
    target_df['behavior_target'] = (target_df['close_price_target'] > target_df[close_col]).astype(int)

    return target_df

In [22]:
# Build the per-day target table for the BGI$ asset.
target_df = generate_targets(df, asset='BGI$')

In [23]:
# Preview the first rows of the generated targets.
target_df.head()

Unnamed: 0_level_0,open_BGI$,close_BGI$,close_price_target,open_price_target,behavior_target
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-06-02,313.94,318.78,325.41,318.39,1
2022-06-03,318.39,325.41,325.07,325.07,0
2022-06-06,325.07,325.07,324.1,324.68,0
2022-06-07,324.68,324.1,324.0,325.07,0
2022-06-08,325.07,324.0,325.99,324.54,1


In [None]:
# Persist the per-day targets. The feature frame `df` does not contain the
# target columns, so writing it here would leave target.parquet without them.
os.makedirs(output_dir, exist_ok=True)
target_df.to_parquet(f'{output_dir}target.parquet')