In [2]:
from pathlib import Path

import pandas as pd

# StockCode values that identify fee lines
stockcode_fees = ['C2', 'DOT', 'POST','AMAZONFEE']

# Load the silver-layer data and append the line value (Quantity x UnitPrice)
df_clean = (
    pd.read_csv('data/silver/dados_limpos.csv')
    .assign(total_value=lambda d: d['Quantity'] * d['UnitPrice'])
)

# =========================================
#               FACT TABLES
# (the Country and Date dimensions are built first because the
#  fact tables need their surrogate keys)
# =========================================

# --------- Dim Country -----------
# One row per distinct country, with a 1-based surrogate key
dim_country = (
    df_clean[['Country']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(CountryID=lambda d: d.index + 1)
)

# --------- Dim Date -----------
# One row per distinct InvoiceDate value, with a 1-based surrogate key
# and the calendar attributes derived from the parsed timestamp.
dim_date = (
    df_clean[['InvoiceDate']]
    .drop_duplicates(subset=['InvoiceDate'])
    .reset_index(drop=True)
    .assign(
        InvoiceDate=lambda d: pd.to_datetime(d['InvoiceDate']),
        DateID=lambda d: d.index + 1,
        Year=lambda d: d['InvoiceDate'].dt.year,
        Month=lambda d: d['InvoiceDate'].dt.month,
        Day=lambda d: d['InvoiceDate'].dt.day,
        Weekday=lambda d: d['InvoiceDate'].dt.day_name(),
        Hour=lambda d: d['InvoiceDate'].dt.hour,
    )
)

# Lookup tables used to swap natural keys for surrogate IDs
country_map = {
    country: country_id
    for country, country_id in zip(dim_country['Country'], dim_country['CountryID'])
}
date_map = {
    invoice_date: date_id
    for invoice_date, date_id in zip(dim_date['InvoiceDate'], dim_date['DateID'])
}

# --------- Fact Sales -----------
# Regular invoices only (number not starting with 'C' or 'A'), positive
# quantity and price, and no fee stock codes.
fact_sales = (
    df_clean.loc[
        ~df_clean['InvoiceNo'].astype(str).str.startswith(('C', 'A'))
        & df_clean['Quantity'].gt(0)
        & df_clean['UnitPrice'].gt(0)
        & ~df_clean['StockCode'].isin(stockcode_fees)
    ]
    .assign(
        # Make sure CustomerID is always a string; missing ids become 'Unknown'
        CustomerID=lambda d: d['CustomerID'].fillna('Unknown').astype(str),
        CountryID=lambda d: d['Country'].map(country_map),
        DateID=lambda d: pd.to_datetime(d['InvoiceDate']).map(date_map),
    )
    .loc[:, [
        'InvoiceNo', 'StockCode', 'CustomerID', 'DateID',
        'Quantity', 'UnitPrice', 'total_value', 'CountryID'
    ]]
)

# --------- Fact Fees -----------
# Fee lines (StockCode in stockcode_fees) on regular invoices, i.e. invoices
# whose number does not start with 'C' or 'A'.
fact_fees = df_clean[
    (df_clean['StockCode'].isin(stockcode_fees)) &
    (~df_clean['InvoiceNo'].astype(str).str.startswith(('C', 'A')))].copy()
# Missing customers get the same 'Unknown' key that fact_sales and dim_customer
# use; a bare astype(str) would turn NaN into the literal string 'nan'.
fact_fees['CustomerID'] = fact_fees['CustomerID'].fillna('Unknown').astype(str)
fact_fees['CountryID'] = fact_fees['Country'].map(country_map)
fact_fees['DateID'] = pd.to_datetime(fact_fees['InvoiceDate']).map(date_map)
fact_fees = fact_fees[[
    'InvoiceNo', 'StockCode', 'CustomerID', 'DateID',
    'Quantity', 'UnitPrice', 'total_value', 'CountryID'
]]

# --------- Fact Cancellations -----------
# Lines on invoices whose number starts with 'C' or 'A', excluding the
# C2 / DOT / POST stock codes.
# NOTE(review): this literal list omits 'AMAZONFEE' (which is in
# stockcode_fees), so AMAZONFEE lines on C/A invoices are kept here, while
# C2/DOT/POST lines on C/A invoices end up in no fact table -- confirm this
# is intended.
fact_cancellations = df_clean[
    (df_clean['InvoiceNo'].astype(str).str.startswith(('C', 'A'))) &
    (~df_clean['StockCode'].isin(['C2', 'DOT', 'POST']))
].copy()
# Missing customers get the same 'Unknown' key that fact_sales and dim_customer
# use; a bare astype(str) would turn NaN into the literal string 'nan'.
fact_cancellations['CustomerID'] = fact_cancellations['CustomerID'].fillna('Unknown').astype(str)
fact_cancellations['CountryID'] = fact_cancellations['Country'].map(country_map)
fact_cancellations['DateID'] = pd.to_datetime(fact_cancellations['InvoiceDate']).map(date_map)
fact_cancellations = fact_cancellations[[
    'InvoiceNo', 'StockCode', 'CustomerID', 'DateID',
    'Quantity', 'UnitPrice', 'total_value', 'CountryID'
]]

# =========================================
#            DIMENSION TABLES
# =========================================

# --------- Dim Customer -----------
# One row per CustomerID seen in fact_sales, keeping the CountryID of its
# first occurrence.
# The index is reset so the frame is labelled 0..n-1: without it the rows keep
# their df_clean labels, and appending at label len(dim_customer) below could
# overwrite an existing customer whose label happens to equal that number.
dim_customer = (
    fact_sales[['CustomerID', 'CountryID']]
    .drop_duplicates(subset=['CustomerID'])
    .reset_index(drop=True)
)
# Ensure the 'Unknown' customer exists
if 'Unknown' not in dim_customer['CustomerID'].values:
    # Use the CountryID of the 'Desconhecido' country when that country exists,
    # otherwise leave the country unset.
    unknown_country_ids = dim_country.loc[
        dim_country['Country'] == 'Desconhecido', 'CountryID'
    ]
    unknown_country_id = (
        unknown_country_ids.iloc[0] if not unknown_country_ids.empty else None
    )
    dim_customer.loc[len(dim_customer)] = ['Unknown', unknown_country_id]

# --------- Dim Product -----------
# One row per StockCode, keeping the first description found for it
dim_product = (
    df_clean.loc[:, ['StockCode', 'Description']]
    .drop_duplicates(subset='StockCode')
    .rename(columns={'Description': 'ProductDescription'})
)

# --------- Dim Date -----------
# Already built earlier as dim_date; only put the key column first
dim_date = dim_date.loc[
    :, ['DateID', 'InvoiceDate', 'Year', 'Month', 'Day', 'Weekday', 'Hour']
]

# --------- Dim Country -----------
# Same here: key column first
dim_country = dim_country.loc[:, ['CountryID', 'Country']]



In [3]:
# Tag each fact table with its transaction type (sale / fee / cancellation)
for _frame, _label in (
    (fact_sales, 'Sale'),
    (fact_fees, 'Fee'),
    (fact_cancellations, 'Cancellation'),
):
    _frame['TransactionType'] = _label

# Stack the three fact tables into a single one with a fresh 0..n-1 index
fact_all = pd.concat([fact_sales, fact_fees, fact_cancellations], ignore_index=True)

In [4]:
# Save the final tables (Gold layer) as Parquet for analytical use
gold_path = "data/gold/"

# to_parquet does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
Path(gold_path).mkdir(parents=True, exist_ok=True)

dim_country.to_parquet(f'{gold_path}dim_country.parquet', index=False)
dim_date.to_parquet(f'{gold_path}dim_date.parquet', index=False)
dim_customer.to_parquet(f'{gold_path}dim_customer.parquet', index=False)
dim_product.to_parquet(f'{gold_path}dim_product.parquet', index=False)
fact_all.to_parquet(f'{gold_path}fact_all.parquet', index=False)

print("Tabelas salvas na camada Gold.")

Tabelas salvas na camada Gold.


In [5]:
# Sum of total_value (Quantity x UnitPrice) over the sales fact table
display(fact_sales.loc[:, 'total_value'].sum())

np.float64(10350459.744)