# Extract congeneres sales data
Extracts congeneres data from the ANP page, downloads the ZIP file if available, and saves it to BigQuery.

In [None]:
import re
import os
import requests
import zipfile
import pandas as pd
from bs4 import BeautifulSoup
from io import BytesIO
from datetime import date
from google.cloud import bigquery
from dotenv import load_dotenv

load_dotenv()

In [None]:
# ANP page that is requested and then searched for the panel link and the
# data-file link in the following cell.
URL = "https://www.gov.br/anp/pt-br/centrais-de-conteudo/paineis-dinamicos-da-anp/paineis-dinamicos-do-abastecimento/painel-dinamico-da-logistica-do-abastecimento-nacional-de-combustiveis"

In [None]:
# Download the ANP page and parse it into `page`.
# A failed request now stops the run here: previously the failure was only
# printed, `page` stayed undefined and the next statement died with a
# NameError that hid the real cause.
# NOTE(review): verify=False disables TLS certificate validation; kept as in
# the original, but confirm it is still required for this host.
response = requests.get(URL, verify=False, timeout=60)
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve data: {response.status_code}")
page = BeautifulSoup(response.content, "html.parser")


# Find the anchor whose text matches "Consulte aqu...Logística"
# (case-insensitive) and record its text and href in `panel`;
# `panel` is None when no such anchor with an href exists.
pattern = re.compile(r"Consulte\s+aqu.*Logística", re.IGNORECASE)
a_tag = page.find('a', string=pattern)
if a_tag and 'href' in a_tag.attrs:
    link = a_tag['href']
    text = a_tag.get_text(strip=True)

    panel = {
        'text': text,
        'link': link,
    }
else:
    panel = None


# Find the "Veja também a base dados do painel" anchor and build `data` with
# its text, href and the update date read from the node that follows it.
# `data` is None when the anchor (or its href) is missing.
data_link = page.find('a', string='Veja também a base dados do painel')
if data_link and 'href' in data_link.attrs:
    link = data_link['href']
    text = data_link.get_text(strip=True)

    # Default placeholder; replaced only when the sibling text really
    # contains an "em <date>" segment. Indexing the split result blindly
    # raised IndexError for siblings without "em " (e.g. bare whitespace).
    li_text = "Data não disponível"
    updated_data = data_link.next_sibling
    if updated_data:
        parts = updated_data.get_text(strip=True).split("em ")
        if len(parts) > 1:
            # [0:-1] drops the final character of the segment
            # (presumably trailing punctuation — verify against the page).
            li_text = parts[1][0:-1].strip()

    data = {
        'text': text,
        'link': link,
        'updated_date': li_text
    }
else:
    data = None


# Download the ZIP referenced by `data['link']` into an in-memory buffer.
# Without a link there is nothing to process, so fail here with a clear
# message instead of printing and letting the next cell hit a NameError on
# `zip_bytes`.
if not (data and data.get('link')):
    raise RuntimeError("No data link found.")

file_link_to_upload = data.get('link')

# NOTE(review): verify=False disables TLS certificate validation; kept as in
# the original, but confirm it is still required for this host.
response = requests.get(file_link_to_upload, verify=False, timeout=60)
response.raise_for_status()
zip_bytes = BytesIO(response.content)
file_name = "logistics.zip"


In [None]:
"""
Read the congeneres sales data from a zip file and convert it to a DataFrame.
"""

# Load the congeneres sales CSV from the downloaded archive into `df`.
# NOTE(review): the "╥" in the member name looks like the archive's
# non-UTF-8 file name as decoded by zipfile (cp437), not a typo — keep the
# literal byte-for-byte and confirm against zf.namelist() if the match fails.
target_member = "DADOS ABERTOS - LOGISTICA 03 - VENDAS CONG╥NERES DE DISTRIBUIDORES.csv"

with zipfile.ZipFile(zip_bytes) as zf:
    for file_info in zf.infolist():
        # Compare names first so only the wanted member is ever opened.
        if file_info.filename != target_member:
            continue
        file_name = file_info.filename
        with zf.open(file_info) as file:
            df = pd.read_csv(file, sep=";", encoding="latin1")
        break
    else:
        # No member matched: stop here rather than leaving `df` undefined
        # for the following cells.
        raise FileNotFoundError(
            f"'{target_member}' not found in archive; members: {zf.namelist()}"
        )

In [None]:
"""
Process the DataFrame to rename columns and convert data types.
"""

# Source CSV headers mapped to the destination column names.
column_map = {
    "Período": "veco_dat_venda",
    "Produto": "veco_txt_produto",
    "UF Origem": "veco_txt_origem",
    "UF Destino": "veco_txt_destino",
    "Vendedor": "veco_txt_vendedor",
    "Comprador": "veco_txt_comprador",
    "Qtd  Produto Líquido": "veco_qtd_volume_1000m3",
}
df = df.rename(columns=column_map)

# Parse the "year/month" period strings and keep only the date part.
sale_dates = pd.to_datetime(df["veco_dat_venda"], format="%Y/%m")
df["veco_dat_venda"] = sale_dates.dt.date

# Volume is stored as float.
df["veco_qtd_volume_1000m3"] = df["veco_qtd_volume_1000m3"].astype(float)

# Stamp every row with the load time in the America/Sao_Paulo timezone.
df["veco_dat_criacao"] = pd.Timestamp.now(tz="America/Sao_Paulo")


In [None]:
"""
    insert data into BigQuery with date-based partitioning
"""

# Load `df` into today's partition of <project>.rw_ext_anp.venda_congeneres.
# The project id comes from the environment; without it the table id would
# silently become "None.rw_ext_anp.venda_congeneres", so fail early instead.
project_id = os.getenv("GOOGLE_PROJECT_ID")
if not project_id:
    raise RuntimeError(
        "GOOGLE_PROJECT_ID is not set; cannot build the BigQuery table id."
    )

client = bigquery.Client()
bq_dataset = "rw_ext_anp"
table_name = "venda_congeneres"

table_id = f"{project_id}.{bq_dataset}.{table_name}"

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# The "$YYYYMMDD" suffix targets a single date partition; combined with
# WRITE_TRUNCATE the load replaces what that target already holds.
# NOTE(review): confirm the destination table is day-partitioned.
partition_key = date.today().strftime('%Y%m%d')

partitioned_table_id = f"{table_id}${partition_key}"
print(f"Inserting data for partition: {partition_key}")

try:
    job = client.load_table_from_dataframe(
        df, partitioned_table_id, job_config=job_config
    )
    job.result()  # block until the load job finishes (raises on failure)
except Exception as e:
    # Log the failure, then re-raise so the run is not reported as
    # successful when nothing was loaded.
    print(f"  Error inserting data for {partition_key}: {str(e)}")
    raise
else:
    print(f"  Data for {partition_key} inserted successfully.")
    print("Data insertion completed!")