# Imports

In [None]:
import requests
from bs4 import BeautifulSoup
from io import BytesIO
from google.cloud import bigquery
import os
from datetime import date, datetime
import pandas as pd


True

In [70]:
"""
    parameters received from papermill

    these are the parameters that are passed to the script
    the values below are the defaults
    they can be overridden by the user when running the script
"""

# Root of the ANP section of the gov.br portal.
BASE_URL = "https://www.gov.br/anp/pt-br"
# Page that lists the biodiesel sales files.
B100_SALES = BASE_URL + "/assuntos/distribuicao-e-revenda/comercializacao-de-biodiesel"
# Relative path used for the raw B100 sales data.
RAW_PATH = "data/raw/b100_sales"

# Bucket name is read from the environment; None when the variable is unset.
bucket_name = os.environ.get("GOOGLE_BUCKET_NAME")

In [71]:
"""
Get file download URL
"""

# Fetch the listing page and build `data_by_year`: a dict mapping each year
# heading (as text) to a list of {'text', 'link', 'updated_date'} entries, one
# per linked file found under that heading.
#
# NOTE(review): verify=False disables TLS certificate validation, which exposes
# the request to tampering. It is kept because the original relied on it;
# confirm whether the site's certificate chain can be validated and drop it.
response = requests.get(B100_SALES, verify=False, timeout=60)
# BeautifulSoup() never returns None, so a failed request has to be detected on
# the HTTP response itself, before parsing.
if response.status_code != 200:
    raise Exception(f"Failed to retrieve data from the URL: {response.status_code}")
soup = BeautifulSoup(response.content, "html.parser")

title_text = 'Volumes Comercializados de Biodiesel'
title_h3 = soup.find('h3', string=title_text)
if title_h3 is None:
    raise Exception(f"Section heading not found on the page: {title_text!r}")

# The first year heading is the third sibling element after the section title
# (the two elements in between are skipped).
year = title_h3
for _ in range(3):
    year = year.find_next_sibling()
    if year is None:
        raise Exception("Unexpected page layout: year heading not found after the section title.")

data_by_year = {}

current_year = year.get_text(strip=True)
data_by_year[current_year] = []

update_marker = 'Atualizado em '

# Walk the following siblings: a <ul> contributes links to the current year, an
# all-digit <h3> starts a new year, anything else ends the section.
next_elem = year.find_next_sibling()
while next_elem is not None:
    if next_elem.name == 'ul':
        for li in next_elem.find_all('li'):
            a_tag = li.find('a')
            if a_tag is None or 'href' not in a_tag.attrs:
                print("Elemento <li> não contém um link.")
                continue

            link = a_tag['href']
            text = a_tag.get_text(strip=True)
            li_text = li.get_text(strip=True)

            start = li_text.find(update_marker)
            if start != -1:
                # Keep what follows the marker and drop the final character
                # (presumably a closing parenthesis; confirm against the page).
                li_text = li_text[start + len(update_marker):-1].strip()

            # When the marker is absent, 'updated_date' holds the full <li> text.
            data_by_year[current_year].append({'text': text, 'link': link, 'updated_date': li_text})
    elif next_elem.name == 'h3' and next_elem.get_text(strip=True).isdigit():
        current_year = next_elem.get_text(strip=True)
        data_by_year.setdefault(current_year, [])
    else:
        break
    next_elem = next_elem.find_next_sibling()




In [72]:
# Download the first file listed under the current calendar year and expose
# its bytes as an in-memory buffer (`file_buffer`).
current_year = str(datetime.now().year)

# Fail with an explicit message when the page lists no file for this year,
# instead of an opaque KeyError/IndexError.
year_files = data_by_year.get(current_year)
if not year_files:
    raise Exception(
        f"No download link found for year {current_year}; "
        f"years available on the page: {sorted(data_by_year)}"
    )
download_url = year_files[0]['link']
file_name = download_url.split("/")[-1]

# NOTE(review): verify=False disables TLS certificate validation; kept from the
# original, confirm whether it is still required.
response = requests.get(download_url, verify=False, timeout=120)
if response.status_code != 200:
    raise Exception(f"Falha ao baixar o arquivo: {response.status_code}")

file_bytes = response.content
file_buffer = BytesIO(file_bytes)



In [73]:
"""
    - read the data from the file
    - convert it to a pandas dataframe
    - rename columns to lowercase and snake_case

    B100_BRONZE_COLUMNS_MAPPING is a dictionary that maps the columns in the Excel file to the columns in the DataFrame
"""
# Spreadsheet header -> DataFrame column name, one mapping per file layout.
_LAYOUT_2023 = {
    "Mês/Ano": "vb100_dat_compra",
    "Raiz\nCNPJ": "vb100_txt_cnpj",
    "Razão Social": "vb100_txt_razao_social",
    "Quantidade\nde Produto\n(m³)": "vb100_qtd_volume",
}
_LAYOUT_2024 = {
    "Data": "vb100_dat_compra",
    "Raiz de CNPJ do Distribuidor": "vb100_txt_cnpj",
    "Razão Social do Distribuidor": "vb100_txt_razao_social",
    "Razão Social do Produtor": "vb100_txt_produtor",
    "CNPJ do Produtor": "vb100_txt_produtor_cnpj",
    "Volume (m3)": "vb100_qtd_volume",
}
B100_BRONZE_COLUMNS_MAPPING = {"2023": _LAYOUT_2023, "2024": _LAYOUT_2024}

# Years without their own entry use the 2024 layout.
if current_year in B100_BRONZE_COLUMNS_MAPPING:
    map_to_use = B100_BRONZE_COLUMNS_MAPPING[current_year]
else:
    map_to_use = B100_BRONZE_COLUMNS_MAPPING["2024"]


def _is_named_column(column_name):
    """Return True when the header does not contain 'Unnamed'."""
    return 'Unnamed' not in column_name


# header=2: the third row of the sheet holds the column names.
df = pd.read_excel(file_buffer, header=2, usecols=_is_named_column)
df = df.rename(columns=map_to_use)

In [None]:
"""
    change columns data types
"""

# Purchase period is parsed with the year+month format, then reduced to a
# plain date object.
df['vb100_dat_compra'] = pd.to_datetime(df['vb100_dat_compra'], format='%Y%m').dt.date
# CNPJ root as text, left-padded with zeros to 8 characters.
df["vb100_txt_cnpj"] = df["vb100_txt_cnpj"].astype(str).str.zfill(8)
df['vb100_qtd_volume'] = df['vb100_qtd_volume'].astype(float)
# Load timestamp, timezone-aware (America/Sao_Paulo).
df['vb100_dat_criacao'] = pd.Timestamp.now(tz='America/Sao_Paulo')

if int(current_year) >= 2024:
    # Producer CNPJ as text, left-padded with zeros to 14 characters.
    df["vb100_txt_produtor_cnpj"] = df["vb100_txt_produtor_cnpj"].astype(str).str.zfill(14)
else:
    # Years before 2024 get the producer columns created empty.
    df["vb100_txt_produtor"] = None
    df["vb100_txt_produtor_cnpj"] = None

datetime64[us, America/Sao_Paulo]


In [None]:
"""
    insert data into BigQuery with date-based partitioning
"""

# Load `df` into today's partition of the BigQuery table, replacing whatever
# that partition already holds. Any load failure is reported and re-raised so
# the run is marked as failed instead of silently "completing".
client = bigquery.Client()
# Fall back to the client's default project when the env var is unset, so the
# table id never starts with the literal string "None".
project_id = os.getenv("GOOGLE_CLOUD_PROJECT") or client.project
bq_dataset = "rw_ext_anp"
table_name = "venda_b100"

table_id = f"{project_id}.{bq_dataset}.{table_name}"

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# "<table>$YYYYMMDD" addresses a single partition of the table.
# NOTE(review): date.today() uses the machine's local timezone, while
# vb100_dat_criacao uses America/Sao_Paulo; confirm they agree where this runs.
partition_key = date.today().strftime('%Y%m%d')

partitioned_table_id = f"{table_id}${partition_key}"
print(f"Inserting data for partition: {partition_key}")

try:
    job = client.load_table_from_dataframe(
        df, partitioned_table_id, job_config=job_config
    )
    # Block until the load job finishes; raises if the job failed.
    job.result()
except Exception as e:
    print(f"  Error inserting data for {partition_key}: {str(e)}")
    raise
else:
    print(f"  Data for {partition_key} inserted successfully.")

print("Data insertion completed!")


Inserting data for partition: 20250703
  Data for 20250703 inserted successfully.
Data insertion completed!
