In [1]:
import pandas as pd
import unicodedata
import re
import hashlib
from sqlalchemy import create_engine, inspect, text
import os
import time
import csv
import io
import numpy as np

In [2]:
engine = create_engine(
    "postgresql://bi_bancos2:1q2w3e@db:5432/viagens_db"
)

In [3]:
df = pd.read_sql("SELECT * FROM viagem", engine)

In [4]:
df.columns

Index(['ide_pro_via', 'nmr_ppt_pcd', 'situacao', 'via_urg', 'jus_urg_via',
       'cod_org_sup', 'nom_org_sup', 'cod_org_sol', 'nom_org_sol', 'cpf_vjt',
       'nom_vjt', 'cargo', 'funcao', 'dat_ini', 'dat_fim', 'destinos',
       'motivo', 'vlr_dia', 'vlr_psg', 'vlr_dvl', 'vlr_out', 'tot_gas',
       'mes_ida', 'mes_vol'],
      dtype='str')

In [5]:
df.columns = (
    df.columns
      .str.normalize('NFKD')
      .str.encode('ascii', errors='ignore')
      .str.decode('utf-8')
      .str.lower()
      .str.replace(r'[^\w]+', '_', regex=True)
      .str.strip('_')
)

In [6]:
df['dat_ini'] = pd.to_datetime(df['dat_ini'])
df['dat_fim'] = pd.to_datetime(df['dat_fim'])

In [7]:
df['drc_vgm_dia'] = (df['dat_fim'] - df['dat_ini']).dt.days + 1

In [8]:
df['cst_med_dia'] = (
    df['tot_gas']
    .where(df['drc_vgm_dia'] > 0)
    / df['drc_vgm_dia']
).round(2)

In [9]:
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://bi_bancos2:1q2w3e@db:5432/viagens_db')
engine = create_engine(DATABASE_URL)

In [10]:
def limpar_tabelas(engine):
    with engine.connect() as conn:
        conn.execute(text("TRUNCATE TABLE dw_gold.fat_vgm, dw_gold.dim_tmp, dw_gold.dim_org_sup CASCADE;"))
        conn.commit()
        print("Tabelas limpas para nova carga.")

limpar_tabelas(engine)

Tabelas limpas para nova carga.


In [11]:
mapa_dias = {
    0: 'Segunda-feira',
    1: 'Terça-feira',
    2: 'Quarta-feira',
    3: 'Quinta-feira',
    4: 'Sexta-feira',
    5: 'Sábado',
    6: 'Domingo'
}

dim_tmp = (
    df[['dat_ini']]
    .drop_duplicates()
    .assign(
        ano=lambda x: x['dat_ini'].dt.year,
        mes_num=lambda x: x['dat_ini'].dt.month,
        mes_nom=lambda x: x['dat_ini'].dt.month_name(),
        dia=lambda x: x['dat_ini'].dt.day,
        dia_smn_nom=lambda x: x['dat_ini'].dt.weekday.map(mapa_dias)
    )
    .reset_index(drop=True)
)

dim_tmp = dim_tmp[[
    'dat_ini',
    'ano',
    'mes_num',
    'mes_nom',
    'dia_smn_nom'
]]

In [12]:
dim_org_sup = df[['cod_org_sup', 'nom_org_sup']].drop_duplicates().copy()

dim_org_sup['cod_org_sup'] = dim_org_sup['cod_org_sup'].astype('Int64')

dim_org_sup = dim_org_sup[[
    'cod_org_sup',
    'nom_org_sup'
]]

In [13]:
dim_org_sol = (
    df[['cod_org_sol', 'nom_org_sol', 'cod_org_sup']]
    .drop_duplicates()
    .reset_index(drop=True)
)

dim_org_sol['cod_org_sol'] = (
    dim_org_sol['cod_org_sol'].astype('Int64')
)
dim_org_sol['cod_org_sup'] = (
    dim_org_sol['cod_org_sup'].astype('Int64')
)

dim_org_sol = dim_org_sol[[
    'cod_org_sol',
    'nom_org_sol',
    'cod_org_sup'
]]

In [14]:
dim_vjt = (
    df[['cpf_vjt', 'nom_vjt', 'cargo', 'funcao']]
    .fillna('')
    .drop_duplicates()
    .reset_index(drop=True)
)

dim_vjt = dim_vjt[[
    'cpf_vjt',
    'nom_vjt',
    'cargo',
    'funcao'
]]


In [15]:
df = df.copy()

df['motivo'] = (
    df['motivo']
    .fillna('Sem motivo')
    .astype(str)
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.replace(r'[\x00-\x1F\x7F-\x9F]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

df.loc[df['motivo'] == '', 'motivo'] = 'Sem motivo'

dim_mtv = (
    df[['motivo']]
    .drop_duplicates()
    .reset_index(drop=True)
)


In [16]:
dim_tmp.to_sql('dim_tmp', engine, schema='dw_gold', if_exists='append', index=False)
dim_org_sup.to_sql('dim_org_sup', engine, schema='dw_gold', if_exists='append', index=False)
dim_org_sol.to_sql('dim_org_sol', engine, schema='dw_gold', if_exists='append', index=False)
dim_vjt.to_sql('dim_vjt', engine, schema='dw_gold', if_exists='append', index=False)
dim_mtv.to_sql('dim_mtv', engine, schema='dw_gold', if_exists='append', index=False)

544

In [17]:
dim_tmp_db = pd.read_sql("""
    SELECT srk_tmp, dat_ini
    FROM dw_gold.dim_tmp
""", engine)

dim_org_sup_db = pd.read_sql("""
    SELECT srk_org_sup, cod_org_sup
    FROM dw_gold.dim_org_sup
""", engine)

dim_org_sol_db = pd.read_sql("""
    SELECT srk_org_sol, cod_org_sol
    FROM dw_gold.dim_org_sol
""", engine)

dim_vjt_db = pd.read_sql("""
    SELECT srk_vjt, cpf_vjt, nom_vjt, cargo, funcao
    FROM dw_gold.dim_vjt
""", engine)

dim_mtv_db = pd.read_sql("""
    SELECT srk_mtv, motivo
    FROM dw_gold.dim_mtv
""", engine)

In [18]:
print(df['dat_ini'].dtype)
print(dim_tmp_db['dat_ini'].dtype)


datetime64[s]
object


In [19]:
df['dat_ini'] = pd.to_datetime(df['dat_ini'])
dim_tmp_db['dat_ini'] = pd.to_datetime(dim_tmp_db['dat_ini'])


In [20]:
fat_vgm = (
    df
    .merge(dim_tmp_db, on='dat_ini', how='left')
    .merge(dim_org_sup_db, on='cod_org_sup', how='left')
    .merge(dim_org_sol_db, on='cod_org_sol', how='left')
    .merge(
        dim_vjt_db,
        on=['cpf_vjt', 'nom_vjt', 'cargo', 'funcao'],
        how='left'
    )
    .merge(dim_mtv_db, on='motivo', how='left')
)

assert fat_vgm.isna().sum().sum() == 0, "Existem valores NULL na tabela fato"
fat_vgm.isna().sum()

ide_pro_via    0
nmr_ppt_pcd    0
situacao       0
via_urg        0
jus_urg_via    0
cod_org_sup    0
nom_org_sup    0
cod_org_sol    0
nom_org_sol    0
cpf_vjt        0
nom_vjt        0
cargo          0
funcao         0
dat_ini        0
dat_fim        0
destinos       0
motivo         0
vlr_dia        0
vlr_psg        0
vlr_dvl        0
vlr_out        0
tot_gas        0
mes_ida        0
mes_vol        0
drc_vgm_dia    0
cst_med_dia    0
srk_tmp        0
srk_org_sup    0
srk_org_sol    0
srk_vjt        0
srk_mtv        0
dtype: int64

In [21]:
fat_vgm['tot_gas'] = (
    fat_vgm['vlr_dia'] +
    fat_vgm['vlr_psg'] +
    fat_vgm['vlr_out'] -
    fat_vgm['vlr_dvl']
)

fat_vgm['drc_vgm_dia'] = (
    (fat_vgm['dat_fim'] - fat_vgm['dat_ini']).dt.days + 1
)

fat_vgm['cst_med_dia'] = (
    fat_vgm['tot_gas'] / fat_vgm['drc_vgm_dia']
)

fat_vgm = fat_vgm[[
    'srk_tmp',
    'srk_org_sup',
    'srk_org_sol',
    'srk_vjt',
    'srk_mtv',
    'vlr_dia',
    'vlr_psg',
    'vlr_out',
    'vlr_dvl',
    'tot_gas',
    'drc_vgm_dia',
    'cst_med_dia'
]]

assert fat_vgm.isna().sum().sum() == 0, \
       "Existem valores NULL na tabela fato"

In [22]:
fat_vgm.to_sql(
    'fat_vgm',
    engine,
    schema='dw_gold',
    if_exists='append',
    index=False
)

531