# Imports

In [6]:
import requests
from bs4 import BeautifulSoup
from io import BytesIO
from google.cloud import bigquery
import os
from datetime import date, datetime
import pandas as pd


In [7]:
"""
    parameters received from papermill

    these are the parameters that are passed to the script
    above is the default values
    they can be overridden by the user when running the script
"""

BASE_URL = "https://www.gov.br/anp/pt-br"
B100_SALES = f"{BASE_URL}/assuntos/distribuicao-e-revenda/comercializacao-de-biodiesel"
RAW_PATH = "data/raw/b100_sales"

bucket_name = os.getenv("GOOGLE_BUCKET_NAME")

In [8]:
"""
Get file download URL
"""

response = requests.get(B100_SALES, verify=False)
soup = BeautifulSoup(response.content, "html.parser")
if soup is None:
    raise Exception("Failed to retrieve data from the URL.")
title_text = 'Volumes Comercializados de Biodiesel'
title_h3 = soup.find('h3', string=title_text)
jump_p = title_h3.find_next_sibling()
jump_p = jump_p.find_next_sibling()
year = jump_p.find_next_sibling()

data_by_year = {}

current_year = year.get_text(strip=True)
data_by_year[current_year] = []

next_elem = year.find_next_sibling()
while next_elem:
    if next_elem.name == 'ul':
        for li in next_elem.find_all('li'):
            a_tag = li.find('a')
            if a_tag and 'href' in a_tag.attrs:
                link = a_tag['href']
                text = a_tag.get_text(strip=True)
                li_text = li.get_text(strip=True)


                start = li_text.find('Atualizado em ')
                if start != -1:
                    li_text = li_text[start + len('Atualizado em '):-1].strip()

                data_by_year[current_year].append({'text': text, 'link': link, 'updated_date': li_text})
            else:
                print("Elemento <li> nÃ£o contÃ©m um link.")
        next_elem = next_elem.find_next_sibling()
    else:
        if next_elem.name == 'h3' and next_elem.get_text(strip=True).isdigit():
            current_year = next_elem.get_text(strip=True)
            if current_year not in data_by_year:
                data_by_year[current_year] = []
            next_elem = next_elem.find_next_sibling()
            continue
        else:
            break




In [9]:
current_years = {
	"2023",
	"2024",
	"2025"
}
all_dataframes = []
for year in current_years:
    if year in data_by_year and data_by_year[year]:
        print(f"Processing year: {year}")

        download_url = data_by_year[year][0]['link']
        response = requests.get(download_url, verify=False)

        if response.status_code != 200:
            print(f"Failed to download file for year {year}: {response.status_code}")
            continue

        file_bytes = response.content
        file_buffer = BytesIO(file_bytes)

        B100_BRONZE_COLUMNS_MAPPING = {
            "2023": {
                "Mês/Ano": "vb100_dat_compra",
                "Raiz\nCNPJ": "vb100_num_cnpj",
                "Razão Social": "vb100_txt_razao_social",
                "Quantidade\nde Produto\n(m³)": "vb100_qtd_volume"
            },
            "2024": {
                "Data": "vb100_dat_compra",
                "Raiz de CNPJ do Distribuidor": "vb100_num_cnpj",
                "Razão Social do Distribuidor": "vb100_txt_razao_social",
                "Razão Social do Produtor": "vb100_nom_produtor",
                "CNPJ do Produtor": "vb100_num_produtor_cnpj",
                "Volume (m3)": "vb100_qtd_volume"
            },
            "2025": {
                "Data": "vb100_dat_compra",
                "Raiz de CNPJ do Distribuidor": "vb100_num_cnpj",
                "Razão Social do Distribuidor": "vb100_txt_razao_social",
                "Razão Social do Produtor": "vb100_nom_produtor",
                "CNPJ do Produtor": "vb100_num_produtor_cnpj",
                "Volume (m3)": "vb100_qtd_volume"
            }
        }

        B100_BRONZE_COLUMNS_MAPPING = {
            "2023": {
                "Mês/Ano": "data_compra",
                "Raiz\nCNPJ": "raiz_cnpj_distribuidor",
                "Razão Social": "razao_social_distribuidor",
                "Quantidade\nde Produto\n(m³)": "qtd_de_produto_m3"
            },
            "2024": {
                "Data": "data_compra",
                "Raiz de CNPJ do Distribuidor": "raiz_cnpj_distribuidor",
                "Razão Social do Distribuidor": "razao_social_distribuidor",
                "Razão Social do Produtor": "razao_social_produtor",
                "CNPJ do Produtor": "cnpj_produtor",
                "Volume (m3)": "volume_m3"
            },
            "2025": {
                "Data": "data_compra",
                "Raiz de CNPJ do Distribuidor": "raiz_cnpj_distribuidor",
                "Razão Social do Distribuidor": "razao_social_distribuidor",
                "Razão Social do Produtor": "razao_social_produtor",
                "CNPJ do Produtor": "cnpj_produtor",
                "Volume (m3)": "volume_m3"
            }
        }

        map_to_use = B100_BRONZE_COLUMNS_MAPPING.get(year, B100_BRONZE_COLUMNS_MAPPING["2024"])

        try:
            df_year = pd.read_excel(file_buffer, header=2, usecols=lambda x: 'Unnamed' not in x)
            df_year = df_year.rename(columns=map_to_use)

            df_year['data_compra'] = pd.to_datetime(df_year['data_compra'], format='%Y%m')
            df_year["raiz_cnpj_distribuidor"] = df_year["raiz_cnpj_distribuidor"].astype(str).str.zfill(8)
            df_year['data_compra'] = df_year['data_compra'].dt.date
            df_year['volume_m3'] = df_year['volume_m3'].astype(float)
            df_year['data_criacao'] = pd.Timestamp.now(tz='America/Sao_Paulo')

            if int(year) < 2024:
                df_year["razao_social_produtor"] = None
                df_year["cnpj_produtor"] = None
            else:
                df_year["cnpj_produtor"] = df_year["cnpj_produtor"].astype(str).str.zfill(14)

            all_dataframes.append(df_year)
            print(f"Successfully processed {len(df_year)} rows for year {year}")

        except Exception as e:
            print(f"Error processing data for year {year}: {str(e)}")
    else:
        print(f"No data found for year {year}")

if all_dataframes:
    df = pd.concat(all_dataframes, ignore_index=True)
    print(f"Combined dataset has {len(df)} total rows")
else:
    raise Exception("No data was processed for any year")

Processing year: 2024




Error processing data for year 2024: 'raiz_cnpj_distribuidor'
Processing year: 2025




Successfully processed 2878 rows for year 2025
Processing year: 2023
Error processing data for year 2023: 'volume_m3'
Combined dataset has 2878 total rows




In [10]:
"""
    change columns data types
"""

df['data_compra'] = pd.to_datetime(df['data_compra'], format='%Y%m')
df["raiz_cnpj_distribuidor"] = df["raiz_cnpj_distribuidor"].astype(str).str.zfill(8)
df['data_compra'] = df['data_compra'].dt.date
df['volume_m3'] = df['volume_m3'].astype(float)
df['data_criacao'] = pd.Timestamp.now(tz='America/Sao_Paulo')

if int(current_year) < 2024:
    df["razao_social_produtor"] = None
    df["cnpj_produtor"] = None
else:
    df["cnpj_produtor"] = df["cnpj_produtor"].astype(str).str.zfill(14)

In [11]:
"""
    insert data into BigQuery with date-based partitioning
"""

client = bigquery.Client()
project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
bq_dataset = "rw_ext_anp"
table_name = "venda_b100"

table_id = f"{project_id}.{bq_dataset}.{table_name}"

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)

partition_key = date.today().strftime('%Y%m%d')

partitioned_table_id = f"{table_id}${partition_key}"
print(f"Inserting data for partition: {partition_key}")
print(f"Total rows to insert: {len(df)}")

try:
    job = client.load_table_from_dataframe(
        df, partitioned_table_id, job_config=job_config
    )
    job.result()
    print(f"Data for {partition_key} inserted successfully.")

    df['year'] = pd.to_datetime(df['vb100_dat_compra']).dt.year
    summary = df.groupby('year').size()
    print("Data summary by year:")
    for year, count in summary.items():
        print(f"  {year}: {count} rows")

except Exception as e:
    print(f"Error inserting data for {partition_key}: {str(e)}")

print("Data insertion completed!")

Inserting data for partition: 20250716
Total rows to insert: 2878
Error inserting data for 20250716: 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/None/datasets/rw_ext_anp/tables/venda_b100$20250716?prettyPrint=false: Invalid resource name projects/None; Project id: None
Data insertion completed!
