In [2]:
import sys
import subprocess
import pkg_resources

def install_packages():
    required_packages = [
        "numpy",
        "pandas",
        "scikit-learn",
        "joblib",
        "pyarrow",
        "fastparquet",
        "plotly",
        "kaleido",
        "matplotlib"
    ]
    
    installed_packages = {pkg.key for pkg in pkg_resources.working_set}

    for package in required_packages:
        if package.lower() not in installed_packages:
            print(f"Instalando {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        else:
            print(f"{package} já está instalado.")
    
    print("Todos os pacotes estão verificados.")

install_packages()

numpy já está instalado.
pandas já está instalado.
scikit-learn já está instalado.
joblib já está instalado.
pyarrow já está instalado.
fastparquet já está instalado.
plotly já está instalado.
kaleido já está instalado.
matplotlib já está instalado.
Todos os pacotes estão verificados.


In [3]:
import pandas as pd
from datetime import datetime
import os
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import matplotlib.pyplot as plt
from pandas.plotting import table


LOADING DATA FROM BASE\ATIVOS

In [4]:
directory_path = '..//bases//ativos//'


In [5]:
BGI = pd.read_parquet(f'{directory_path}BGI$_data_20220601.parquet')

In [6]:
BGI.head()

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume
0,2022-06-01 09:00:00,313.1,316.52,313.1,315.66,26,1,28
1,2022-06-01 09:15:00,315.76,315.96,313.2,313.2,59,1,91
2,2022-06-01 09:30:00,313.05,313.25,311.99,312.49,27,1,41
3,2022-06-01 10:00:00,313.3,313.95,313.1,313.95,23,1,30
4,2022-06-01 10:30:00,313.5,313.5,313.25,313.25,3,1,4


In [7]:
BGI.time.max()
BGI.time.min()

Timestamp('2022-06-01 09:00:00')

In [None]:
import os
import pandas as pd

def save_table(df, title, output_path='../results/tables/'):
    """
    Saves the DataFrame as a CSV file. If a file with the exact title exists,
    it will overwrite the existing file. Otherwise, it will create a new file
    with the next available number.

    Args:
        df (pd.DataFrame): DataFrame to be saved.
        title (str): Title to be used in the CSV filename.
        output_path (str): Path where the file will be saved.
    """
    os.makedirs(os.path.join(output_path, 'csv'), exist_ok=True)
    csv_path = os.path.join(output_path, 'csv')
    existing_files = [f for f in os.listdir(csv_path) if title in f and f.endswith('.csv')]

    # Check if we have a file that contains the title in its name
    if existing_files:
        # If a match exists, overwrite that file
        file_name_csv = existing_files[0]
        csv_output_path = os.path.join(csv_path, file_name_csv)
    else:
        # If no match, find the next available number
        num = len([f for f in os.listdir(csv_path) if f.startswith('Tabela_')])
        num += 1
        file_name_csv = f"Tabela_{num}_{title}.csv"
        csv_output_path = os.path.join(csv_path, file_name_csv)

    # Save the DataFrame to the generated path
    df.to_csv(csv_output_path, index=False)
    print(f"Tabela saved as CSV: {csv_output_path}")
    
def check_data(directory_path):
    """
    Loads all .parquet files from the specified directory, processes them by:
    - Extracting the ticker symbol from the filename (everything before the first '_').
    - Creating a 'date' column with unique dates and dropping duplicates.
    - Storing the ticker, first date, last date, and the number of columns in the DataFrame for each processed file.

    Args:
        directory_path (str): The path to the directory containing the .parquet files.

    Returns:
        pd.DataFrame: A DataFrame containing the summary information (ticker, first date, last date, and shape).
    """
    parquet_files = [f for f in os.listdir(directory_path) if f.endswith('.parquet')]
    
    summary_data = [] 
    
    for parquet_file in parquet_files:
        ticker = parquet_file.split('_')[0]  # Extracts the name before the first '_'
        file_path = os.path.join(directory_path, parquet_file)
        df = pd.read_parquet(file_path)
        df['date'] = df['time'].dt.strftime('%Y-%m-%d')
        first_date = df['date'].min()
        last_date = df['date'].max()

        summary_data.append({
            'ticker': ticker,
            'first_date': first_date,
            'last_date': last_date,
            'rows': df.shape[0],
            'columns': df.shape[1],
            'unique_dates': len(df.date.unique())
        })
    
    data_info_df = pd.DataFrame(summary_data)
    save_table(data_info_df, title='Visualizaçao das séries de dados escolhidas')
    return data_info_df

def ts_elegant_inputer(df, interval_minutes=15, min_time='09:00:00', max_time='17:45:00'):
    """
    Fills missing timestamps within the specified date ranges by recursively filling the missing data points.
    The function ensures that all time intervals are present for each date, filling in missing timestamps 
    by copying the previous row's values, while setting `tick_volume` and `real_volume` to zero.
    
    Args:
        df (pd.DataFrame): DataFrame containing the time series data with a 'time' column (timestamp).
        interval_minutes (int, optional): Time interval in minutes (default is 15).
        min_time (str, optional): Minimum time of the range in 'HH:MM:SS' format (default is '09:00:00').
        max_time (str, optional): Maximum time of the range in 'HH:MM:SS' format (default is '17:45:00').
    
    Returns:
        pd.DataFrame: The DataFrame with missing timestamps filled and new rows inserted where necessary.
    """
    df['time'] = pd.to_datetime(df['time'])
    df['date'] = df['time'].dt.strftime('%Y-%m-%d')
    
    min_time = pd.to_datetime(min_time, format='%H:%M:%S').time()
    max_time = pd.to_datetime(max_time, format='%H:%M:%S').time()
    
    def generate_time_intervals(date):
        date_start = pd.to_datetime(f"{date} {min_time}")
        date_end = pd.to_datetime(f"{date} {max_time}")
        time_intervals = pd.date_range(date_start, date_end, freq=f'{interval_minutes}T')
        return time_intervals
    
    result_data = []

    for date in df['date'].unique():
        expected_times = generate_time_intervals(date)
        
        date_df = df[df['date'] == date]
        date_df = date_df.sort_values(by='time')
        
        last_row = None
        expected_idx = 0
        
        for expected_time in expected_times:
            # Check if the current expected time exists in the DataFrame
            if expected_idx < len(date_df) and date_df.iloc[expected_idx]['time'] == expected_time:
                # If the timestamp exists, add it to the result
                result_data.append(date_df.iloc[expected_idx].to_dict())
                last_row = date_df.iloc[expected_idx]
                expected_idx += 1
            else:
                # If the timestamp doesn't exist, fill it by copying the previous row's data
                if last_row is not None:
                    new_row = last_row.copy()
                    new_row['time'] = expected_time
                    new_row['tick_volume'] = 0
                    new_row['real_volume'] = 0
                    result_data.append(new_row.to_dict())
                    last_row = new_row
    
    result_df = pd.DataFrame(result_data)
    
    return result_df


In [19]:
check_data(directory_path)


Tabela saved as CSV: ../results/tables/csv\Tabela_1_Visualizaçao das séries de dados escolhidas.csv


Unnamed: 0,ticker,first_date,last_date,rows,columns,unique_dates
0,BGI$,2022-06-01,2024-11-22,17680,9,622
1,CCM$,2022-06-01,2024-11-22,18615,9,622
2,GOLD11,2022-06-01,2024-11-22,18122,9,622
3,IBOV,2022-06-01,2024-11-22,18128,9,622
4,ICF$,2022-06-01,2024-11-22,15490,9,622
5,IVVB11,2022-06-01,2024-11-22,18128,9,622


In [42]:
filled_df = ts_elegant_inputer(BGI)


  time_intervals = pd.date_range(date_start, date_end, freq=f'{interval_minutes}T')


In [20]:
filled_df.head()

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume,date
0,2022-06-01 09:00:00,313.1,316.52,313.1,315.66,26,1,28,2022-06-01
1,2022-06-01 09:15:00,315.76,315.96,313.2,313.2,59,1,91,2022-06-01
2,2022-06-01 09:30:00,313.05,313.25,311.99,312.49,27,1,41,2022-06-01
3,2022-06-01 09:45:00,313.05,313.25,311.99,312.49,0,1,0,2022-06-01
4,2022-06-01 10:00:00,313.3,313.95,313.1,313.95,23,1,30,2022-06-01


In [None]:
CCM = pd.read_parquet(f'{directory_path}CCM$_data_20220601.parquet')

In [None]:
CCM.time.max()
CCM.time.max()
CCM['dia'] = CCM['time'].dt.strftime('%Y-%m-%d')
CCM.shape
ccm_aux = CCM['dia'].drop_duplicates()
ccm_aux.shape