In [1]:
# B3 Data Visualization Dashboard

# This notebook contains visualizations and analyses of B3 data stored in the AWS Glue Data Catalog.

In [2]:
# Installation of required libraries into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh install may pull different
# releases than the ones this notebook was last run with — consider pinning.
%pip install PyAthena pandas plotly boto3 sqlalchemy

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Import libraries
import os
from urllib.parse import quote_plus

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sqlalchemy.engine import create_engine

In [4]:
# Connection settings (adjust as needed).
# AWS_REGION and S3_STAGING_DIR can be overridden through environment
# variables of the same name; the literals below are only fallbacks.
# NOTE(review): the fallback bucket name appears to embed an AWS account ID —
# consider requiring the environment variable before sharing this notebook.
AWS_REGION = os.environ.get('AWS_REGION', 'us-east-1')
S3_STAGING_DIR = os.environ.get(
    'S3_STAGING_DIR',
    's3://861115334572-athena/b3-visualization/',
)

# Glue/Athena database and table that every query in this notebook reads from.
DATABASE = 'default'
TABLE = 'b3_tbl_refined'

In [5]:
# SQL for the participation time series: one row per (code, reference_date)
# for the five selected codes, skipping rows with missing date, part or ticker.
df_query = '''
SELECT code, reference_date, part, ticker
FROM {table}
WHERE reference_date IS NOT NULL AND part IS NOT NULL AND ticker IS NOT NULL
AND code IN ('ITUB4', 'BPAC11', 'PETR4', 'VALE3', 'BBAS3')
ORDER BY reference_date, ticker
'''.format(table=TABLE)

In [6]:
# Connection to Athena through SQLAlchemy (PyAthena "awsathena+rest" dialect).
# The URL carries no credentials (empty user-info part); authentication is
# presumably resolved by PyAthena/boto3 from the environment — confirm.
# S3_STAGING_DIR is percent-encoded because it is placed in the URL query
# string and contains reserved characters (':' and '/'); an override holding
# '&', '+', '#' or spaces would otherwise corrupt the URL.
conn_str = (
    f"awsathena+rest://@athena.{AWS_REGION}.amazonaws.com:443/{DATABASE}"
    f"?s3_staging_dir={quote_plus(S3_STAGING_DIR)}"
)

# `conn` is a SQLAlchemy Engine (not a single open connection); the name is
# kept because the query cells pass it to pd.read_sql.
conn = create_engine(conn_str)

In [7]:
# Run the participation query on Athena and load the result set into `df`
print('Executing Athena query...')
df = pd.read_sql(sql=df_query, con=conn)

Executing Athena query...


In [8]:
# Make sure reference_date is a datetime column (it is parsed only when the
# driver did not already return a datetime dtype).
if not pd.api.types.is_datetime64_any_dtype(df['reference_date'].dtype):
    df['reference_date'] = pd.to_datetime(df['reference_date'])

# Quick look at the first rows
print('Sample of the data:', df.head(), sep='\n')

Sample of the data:
     code reference_date    part        ticker
0   BBAS3     2025-06-20   2.889        BRASIL
1  BPAC11     2025-06-20   2.507    BTGP BANCO
2   ITUB4     2025-06-20   8.166  ITAUUNIBANCO
3   PETR4     2025-06-20   6.765     PETROBRAS
4   VALE3     2025-06-20  10.199          VALE


In [9]:
# Line chart: participation over time, one coloured line per ticker
fig = px.line(
    data_frame=df,
    x='reference_date',
    y='part',
    color='ticker',
    labels={
        'reference_date': 'Reference Date',
        'part': 'Participation',
        'ticker': 'Ticker',
    },
    title='Temporal Evolution of Participation by Ticker',
)
# Axis titles are set explicitly and take precedence over the `labels`
# mapping on the axes; `labels` still applies to legend and hover text.
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Participation (%)')
fig.show()

In [10]:
# --- DASHBOARD: Time series with statistical bands ---
# Prints a section banner; the cells below load the 7-day mean/std columns
# and draw one figure per ticker.
print("\n--- Dashboard: Time series with statistical bands ---")


--- Dashboard: Time series with statistical bands ---


In [11]:
# Query the precomputed 7-day mean and standard deviation of participation
# for the five selected codes, skipping rows where any needed column is NULL.
band_query = (
    "\n"
    "SELECT code, reference_date, ticker, mean_part_7_days, std_part_7_days\n"
    f"FROM {TABLE}\n"
    "WHERE reference_date IS NOT NULL AND mean_part_7_days IS NOT NULL "
    "AND std_part_7_days IS NOT NULL AND ticker IS NOT NULL\n"
    "AND code IN ('ITUB4', 'BPAC11', 'PETR4', 'VALE3', 'BBAS3')\n"
    "ORDER BY reference_date, ticker\n"
)

band_df = pd.read_sql(sql=band_query, con=conn)

In [12]:
# Convert reference_date to datetime (skipped when it already has a
# datetime dtype)
if not pd.api.types.is_datetime64_any_dtype(band_df['reference_date'].dtype):
    band_df['reference_date'] = pd.to_datetime(band_df['reference_date'])

In [13]:
# Band limits: 7-day mean plus / minus one 7-day standard deviation
band_df['upper'] = band_df['mean_part_7_days'].add(band_df['std_part_7_days'])
band_df['lower'] = band_df['mean_part_7_days'].sub(band_df['std_part_7_days'])

In [14]:
# Plotly chart: one figure per ticker, 7-day mean line over a shaded band
tickers = band_df['ticker'].unique()
for ticker in tickers:
    ticker_rows = band_df[band_df['ticker'] == ticker]
    dates = ticker_rows['reference_date']

    # Closed polygon for the shaded area: the upper limit walked forward in
    # time, then the lower limit walked backwards. The band is mean ± 1 std
    # (not a statistical confidence interval, despite the trace name); the
    # name is never shown because the legend entry and hover are disabled.
    band_trace = go.Scatter(
        x=pd.concat([dates, dates[::-1]]),
        y=pd.concat([ticker_rows['upper'], ticker_rows['lower'][::-1]]),
        fill='toself',
        fillcolor='rgba(0,100,80,0.2)',
        line=dict(color='rgba(255,255,255,0)'),  # fully transparent outline
        hoverinfo="skip",
        showlegend=False,
        name='Confidence Interval',
    )

    # The 7-day mean itself, drawn on top of the band
    mean_trace = go.Scatter(
        x=dates,
        y=ticker_rows['mean_part_7_days'],
        line=dict(color='rgb(0,100,80)'),
        name=f'7-day Mean ({ticker})',
    )

    fig = go.Figure(data=[band_trace, mean_trace])
    fig.update_layout(
        title=f'Time Series with Statistical Bands - {ticker}',
        xaxis_title='Date',
        yaxis_title='Average Participation (7 days)',
        legend_title='Legend',
    )
    fig.show()

In [15]:
# --- Boxplot of part by ticker ---
print("\n--- Dashboard: Boxplot of part by ticker ---")

# Only rows whose reference_date equals yesterday (Athena's current_date
# minus one day) are selected; all tickers are included, not just the five
# codes used in the time-series cells.
boxplot_query = f'''
SELECT part, ticker, code
FROM {TABLE}
WHERE part IS NOT NULL AND ticker IS NOT NULL
AND cast(reference_date as date) = date_add('day', -1, current_date)
'''

boxplot_df = pd.read_sql(boxplot_query, conn)

if boxplot_df.empty:
    # The query returns nothing when the table has no rows dated yesterday
    # (presumably after a non-trading day or before the load has run —
    # confirm). Say so explicitly instead of rendering a blank chart.
    print("No rows found for yesterday's reference_date; boxplot skipped.")
else:
    # Boxplot chart with plotly
    fig = px.box(
        boxplot_df,
        x='ticker',
        y='part',
        title='Distribution of Participation (part) by Ticker',
        labels={'ticker': 'Ticker', 'part': 'Participation'}
    )
    fig.update_layout(xaxis_title='Ticker', yaxis_title='Participation (%)')
    fig.show()


--- Dashboard: Boxplot of part by ticker ---


In [16]:
# --- Scatter plot mean_part_7_days vs std_part_7_days ---
print("\n--- Dashboard: Scatter plot mean_part_7_days vs std_part_7_days ---")

# Only rows whose reference_date equals yesterday (Athena's current_date
# minus one day) are selected, for every ticker in the table.
scatter_query = f'''
SELECT mean_part_7_days, std_part_7_days, ticker
FROM {TABLE}
WHERE mean_part_7_days IS NOT NULL AND std_part_7_days IS NOT NULL AND ticker IS NOT NULL
AND cast(reference_date as date) = date_add('day', -1, current_date)
'''

scatter_df = pd.read_sql(scatter_query, conn)

if scatter_df.empty:
    # The query returns nothing when the table has no rows dated yesterday
    # (presumably after a non-trading day or before the load has run —
    # confirm). Say so explicitly instead of rendering a blank chart.
    print("No rows found for yesterday's reference_date; scatter plot skipped.")
else:
    # One point per returned row: x = 7-day mean, y = 7-day standard deviation
    fig = px.scatter(
        scatter_df,
        x='mean_part_7_days',
        y='std_part_7_days',
        color='ticker',
        title='Scatter: Mean vs Volatility of Participation (by Ticker)',
        labels={'mean_part_7_days': 'Mean (7 days)', 'std_part_7_days': 'Standard Deviation (7 days)', 'ticker': 'Ticker'},
        hover_data=['ticker']
    )
    fig.update_layout(xaxis_title='Mean Participation (7 days)', yaxis_title='Volatility (Standard Deviation 7 days)')
    fig.show()


--- Dashboard: Scatter plot mean_part_7_days vs std_part_7_days ---
