In [4]:
import pandas as pd
import unicodedata
import re
import hashlib
from sqlalchemy import create_engine, inspect, text
import os
import time
import csv
import io
import numpy as np

In [5]:
engine = create_engine(
    "postgresql://bi_bancos2:1q2w3e@db:5432/viagens_db"
)

In [6]:
df = pd.read_sql("SELECT * FROM viagem", engine)

In [9]:
df.columns

Index(['ide_pro_via', 'nmr_ppt_pcd', 'situacao', 'via_urg', 'jus_urg_via',
       'cod_org_sup', 'nom_org_sup', 'cod_org_sol', 'nom_org_sol', 'cpf_vjt',
       'nome_vjt', 'cargo', 'funcao', 'dat_ini', 'dat_fim', 'destinos',
       'motivo', 'val_dia', 'val_pas', 'val_dev', 'val_out_gas', 'tot_gas',
       'mes_ida', 'mes_vol'],
      dtype='str')

In [10]:
df.columns = (
    df.columns
      .str.normalize('NFKD')
      .str.encode('ascii', errors='ignore')
      .str.decode('utf-8')
      .str.lower()
      .str.replace(r'[^\w]+', '_', regex=True)
      .str.strip('_')
)

In [4]:
df['dat_ini'] = pd.to_datetime(df['dat_ini'])
df['dat_fim'] = pd.to_datetime(df['dat_fim'])

In [5]:
df['drc_vgm_dia'] = (df['dat_fim'] - df['dat_ini']).dt.days + 1

In [6]:
df['cst_med_dia'] = (
    df['tot_gst']
    .where(df['drc_vgm_dia'] > 0)
    / df['drc_vgm_dia']
).round(2)

In [11]:
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://bi_bancos2:1q2w3e@db:5432/viagens_db')
engine = create_engine(DATABASE_URL)

In [9]:
def limpar_tabelas(engine):
    with engine.connect() as conn:
        conn.execute(text("TRUNCATE TABLE dw_gold.fat_vgm, gold.dim_tmp, gold.dim_org_sup CASCADE;"))
        conn.commit()
        print("Tabelas limpas para nova carga.")

limpar_tabelas(engine)

Tabelas limpas para nova carga.


In [11]:
mapa_dias = {
    0: 'Segunda-feira',
    1: 'Terça-feira',
    2: 'Quarta-feira',
    3: 'Quinta-feira',
    4: 'Sexta-feira',
    5: 'Sábado',
    6: 'Domingo'
}

dim_tmp = (
    df[['dat_ini']]
    .drop_duplicates()
    .assign(
        ano=lambda x: x['dat_ini'].dt.year,
        mes_num=lambda x: x['dat_ini'].dt.month,
        mes_nom=lambda x: x['dat_ini'].dt.month_name(),
        dia=lambda x: x['dat_ini'].dt.day,
        dia_smn_nom=lambda x: x['dat_ini'].dt.weekday.map(mapa_dias)
    )
    .reset_index(drop=True)
)

dim_tmp = dim_tmp[[
    'dat_ini',
    'ano',
    'mes_num',
    'mes_nom',
    'dia_smn_nom'
]]

In [12]:
dim_org_sup = df[['cod_org_sup', 'nom_org_sup']].drop_duplicates().copy()

dim_org_sup['cod_org_sup'] = dim_org_sup['cod_org_sup'].astype('Int64')

dim_org_sup = dim_org_sup[[
    'cod_org_sup',
    'nom_org_sup'
]]

In [13]:
dim_org_sol = (
    df[['cod_org_sol', 'nom_org_sol', 'cod_org_sup']]
    .drop_duplicates()
    .reset_index(drop=True)
)

dim_org_sol['cod_org_sol'] = (
    dim_org_sol['cod_org_sol'].astype('Int64')
)
dim_org_sol['cod_org_sup'] = (
    dim_org_sol['cod_org_sup'].astype('Int64')
)

dim_org_sol = dim_org_sol[[
    'cod_org_sol',
    'nom_org_sol',
    'cod_org_sup'
]]

In [14]:
dim_vjt = (
    df[['cpf_vjt', 'nom', 'cargo', 'dsc_fun']]
    .fillna('')
    .drop_duplicates()
    .reset_index(drop=True)
)

dim_vjt = dim_vjt[[
    'cpf_vjt',
    'nom',
    'cargo',
    'dsc_fun'
]]


In [15]:
df = df.copy()

df['motivo'] = (
    df['motivo']
    .fillna('Sem motivo')
    .astype(str)
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.replace(r'[\x00-\x1F\x7F-\x9F]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

df.loc[df['motivo'] == '', 'motivo'] = 'Sem motivo'

dim_mtv = (
    df[['motivo']]
    .drop_duplicates()
    .reset_index(drop=True)
)


In [16]:
fat_vgm = (
    df
    .merge(dim_tmp[['dat_ini', 'srk_tmp']], on='dat_ini', how='left')
    .merge(dim_orgao_superior[['cod_org_sup', 'srk_org_sup']], on='cod_org_sup', how='left')
    .merge(dim_orgao_solicitante[['cod_org_sol', 'srk_org_sol']], on='cod_org_sol', how='left')
    .merge(dim_viajante[['cpf_vjt', 'srk_vjt']], on='cpf_vjt', how='left')
    .merge(dim_motivo[['mtv', 'srk_mtv']], on='mtv', how='left')
)

fat_vgm = fat_vgm[[
    'srk_tmp',
    'srk_org_sup',
    'srk_org_sol',
    'srk_vjt',
    'srk_mtv',
    'vlr_dia',
    'vlr_psg',
    'vlr_out',
    'vlr_dvl',
    'tot_gst',
    'drc_vgm_dia',
    'cst_med_dia'
]].reset_index(drop=True)


assert fat_vgm.isna().sum().sum() == 0, "❌ Existem valores NULL na tabela fato"


In [17]:
fat_viagens.isna().sum()

fat_viagem_id           0
tempo_id                0
orgao_superior_id       0
orgao_solicitante_id    0
viajante_id             0
motivo_id               0
valor_diarias           0
valor_passagens         0
valor_outros_gastos     0
valor_devolucao         0
total_gasto             0
duracao_viagem_dias     0
custo_medio_diario      0
dtype: int64

In [None]:
df_dim_tmp.to_sql('dim_tmp', engine, schema='dw_gold', if_exists='append', index=False)
df_dim_org_sup.to_sql('dim_org_sup', engine, schema='dw_gold', if_exists='append', index=False)
df_dim_org_sol.to_sql('dim_org_sol', engine, schema='dw_gold', if_exists='append', index=False)
df_dim_vjt.to_sql('dim_vjt', engine, schema='dw_gold', if_exists='append', index=False)
df_dim_mtv.to_sql('dim_mtv', engine, schema='dw_gold', if_exists='append', index=False)

df_fato.to_sql('fat_vgm', engine, schema='dw_gold', if_exists='append', index=False)
