In [13]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os
import requests

In [7]:
# Configure Chrome for scraping, then start the browser session.
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",  # run without a visible window (optional)
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "start-maximized",
    "disable-infobars",
    "--disable-blink-features=AutomationControlled",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

# Start the WebDriver with the flags above.
driver = webdriver.Chrome(options=options)

In [8]:
url = "https://www1.upme.gov.co/Entornoinstitucional/Biblioteca-juridica/Paginas/Resoluciones-UPME-Energia-electrica.aspx"
driver.get(url)

# Wait for the result anchors themselves rather than for <body>: the body is
# normally present as soon as driver.get() returns, so waiting on it does not
# guarantee that the links scraped afterwards are already in the DOM.
# Raises selenium's TimeoutException if no link shows up within 10 seconds.
WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.ms-srch-item-link"))
)

# Snapshot the rendered HTML and parse it with BeautifulSoup.
page_source = driver.page_source
soup = BeautifulSoup(page_source, "html.parser")

In [10]:
# Collect the href of every search-result anchor on the loaded page.
pdf_links = []
for anchor in driver.find_elements(By.CSS_SELECTOR, "a.ms-srch-item-link"):
    pdf_links.append(anchor.get_attribute("href"))
pdf_links

['https://www1.upme.gov.co/Normatividad/1198_2024.pdf',
 'https://www1.upme.gov.co/Normatividad/977_2024.pdf',
 'https://www1.upme.gov.co/Normatividad/979_2024.pdf',
 'https://www1.upme.gov.co/Normatividad/727_2024.pdf',
 'https://www1.upme.gov.co/Normatividad/712_2024.pdf',
 'https://www1.upme.gov.co/Normatividad/705_2024.pdf',
 'https://www1.upme.gov.co/Normatividad/501_2024.pdf',
 'https://www1.upme.gov.co/Normatividad/457_2024.pdf']

In [11]:
# Shut down the WebDriver session; the links were already collected in pdf_links.
driver.quit()

In [19]:
# Inspect the last 26 characters of the first link; for this URL that is the
# folder name plus the file name.
pdf_links[0][-26:]

'Normatividad/1198_2024.pdf'

In [12]:
# Directory where downloaded documents are saved (relative to the notebook's
# working directory); it is created here if it does not exist yet.
DOWNLOAD_DIR = "../data"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

In [33]:
def download_document(url, filename, download_dir="../data"):
    """Download a document and save it locally.

    Args:
        url: Address of the document to fetch.
        filename: Name for the saved file. Every "/" is replaced with "-" so
            the name cannot point into a sub-directory.
        download_dir: Directory the file is written to; created when missing.

    Returns:
        The path of the saved file, or None when the request or the write
        failed (the error is printed instead of raised, so a batch of
        downloads keeps going).
    """
    filename = filename.replace("/", "-")
    filepath = os.path.join(download_dir, filename)
    try:
        response = requests.get(url, timeout=15)
        # Turn 4xx/5xx answers into exceptions so error pages are never saved.
        response.raise_for_status()

        os.makedirs(download_dir, exist_ok=True)
        with open(filepath, "wb") as file:
            file.write(response.content)
    # Only network and filesystem failures are expected here; anything else is
    # a programming error and should surface instead of being printed away.
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
        return None
    return filepath

In [None]:
from urllib.parse import urlparse

for doc in pdf_links:
    # Build the name from the URL path ("Normatividad/977_2024.pdf") instead of
    # a fixed-width slice: the last 26 characters only line up for 4-digit
    # resolution numbers and keep a stray leading "/" for shorter ones.
    filename = urlparse(doc).path.lstrip("/")
    download_document(doc, filename)

In [35]:
# Spanish three-letter month abbreviations mapped to their English
# counterparts, listed from December back to January.
month_index_translation = dict(
    [
        ("DIC", "DEC"),
        ("NOV", "NOV"),
        ("OCT", "OCT"),
        ("SEP", "SEP"),
        ("AGO", "AUG"),
        ("JUL", "JUL"),
        ("JUN", "JUN"),
        ("MAY", "MAY"),
        ("ABR", "APR"),
        ("MAR", "MAR"),
        ("FEB", "FEB"),
        ("ENE", "JAN"),
    ]
)

In [36]:
import unicodedata

def remove_accents(text):
    """Return ``text`` NFKD-decomposed with every combining mark dropped."""
    decomposed = unicodedata.normalize("NFKD", text)
    kept_chars = [char for char in decomposed if not unicodedata.combining(char)]
    return "".join(kept_chars)

In [52]:
import PyPDF2
import re
from datetime import datetime

def extract_text_from_pdf(pdf_path):
    """Extract all the text of a PDF as a single accent-free string.

    Pages that yield no text are skipped; the remaining pages are passed
    through ``remove_accents`` and joined with newlines.
    """
    with open(pdf_path, "rb") as pdf_file:
        raw_pages = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
    return "\n".join(remove_accents(page_text) for page_text in raw_pages if page_text)

def extract_metadata(text):
    """Extract the resolution number, date and concept from a resolution's text.

    Args:
        text: Text of the resolution. It may or may not already have had its
            accents removed; the heading is recognised either way.

    Returns:
        A dict with the keys:
            "name": first "RESOLUCION No. <digits> de <yyyy>" heading found,
                with accents removed, or None.
            "resolution_date": first valid dd-mm-yyyy date in the text,
                reformatted as yyyy-mm-dd, or None.
            "concept": text between the first pair of curly quotes, with
                accents removed, or None.
            "process_date": today's date as yyyy-mm-dd.
    """
    resolution_number = None
    date = None
    concept = None

    # Resolution number (example: RESOLUCIÓN No. 000457 de 2024). The "O" is
    # matched accented (precomposed or followed by a combining acute) or plain,
    # because callers pass text whose accents have already been stripped.
    res_match = re.search(
        r"RESOLUCI(?:Ó|O\u0301?)N\s+No\.\s+\d+\s+de\s+\d{4}", text, re.IGNORECASE
    )
    if res_match:
        resolution_number = remove_accents(res_match.group())

    # Date (example: 19-06-2024). A date-shaped token such as 99-99-2024 is
    # skipped instead of raising, and the next candidate is tried.
    for date_match in re.finditer(r"\d{2}-\d{2}-\d{4}", text):
        try:
            parsed_date = datetime.strptime(date_match.group(), "%d-%m-%Y")
        except ValueError:
            continue
        date = parsed_date.strftime("%Y-%m-%d")
        break

    # Concept: heuristic, the first passage enclosed in curly quotes.
    concept_match = re.search(r"“([^”]+)”", text)
    if concept_match:
        concept = remove_accents(concept_match.group(1))

    return {
        "name": resolution_number,
        "resolution_date": date,
        "concept": concept,
        "process_date": datetime.now().strftime("%Y-%m-%d"),
    }

In [54]:
# Parse every downloaded PDF into a metadata record that also carries its text.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order, which would make the order of the saved records vary between runs.
documents = sorted(f for f in os.listdir(DOWNLOAD_DIR) if f.endswith(".pdf"))
resolutions = []
for document in documents:
    pdf_path = os.path.join(DOWNLOAD_DIR, document)
    pdf_text = extract_text_from_pdf(pdf_path)
    metadata = extract_metadata(pdf_text)
    metadata["full_text"] = pdf_text
    resolutions.append(metadata)


In [57]:
# Save as JSON
import json

folder = "../data/processed"
file_name = "resolutions.json"
file_path = os.path.join(folder, file_name)
# Only "../data" is created earlier in the notebook; without this the open()
# below fails with FileNotFoundError on a fresh checkout.
os.makedirs(folder, exist_ok=True)
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(resolutions, f, indent=4)