In [1]:
import requests
from bs4 import BeautifulSoup
import os

In [2]:
headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
        }

url = "https://creg.gov.co/loader.php?lServicio=Documentos&lFuncion=infoCategoriaConsumo&tipo=RE"
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
response.status_code

200

In [20]:
documents = []
for link in soup.find_all("a", href=True):
    href = link["href"]
    if "ControlAdmin/BajarArchivo".lower() in href.lower():  # Filter relevant links
        doc_title = link.text.strip()
        documents.append({"title": doc_title, "url": href})

In [4]:
# Directory to save documents
DOWNLOAD_DIR = "../data"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

In [21]:
def download_document(url, filename):
    """Download a document and save it locally."""
    try:
        filename = filename.replace("\\", "")
        response = requests.get(url, timeout=15)
        response.raise_for_status()

        filepath = os.path.join(DOWNLOAD_DIR, filename)
        with open(filepath, "wb") as file:
            file.write(response.content)

    except Exception as e:
        print(f"Error downloading {url}: {e}")

In [22]:
for doc in documents:
    filename = f"{doc['url'][-12:]}.docx"
    download_document(doc["url"], filename)

Error downloading https://creg.analitica.com.co/AZDigital/ControlAdmin/BajarArchivo.php?ArId=391827: [Errno 22] Invalid argument: '../data\\?ArId=391827.docx'


In [16]:
month_index_translation = {
    "DIC": "DEC",
    "NOV": "NOV",
    "OCT": "OCT",
    "SEP": "SEP",
    "AGO": "AUG",
    "JUL": "JUL",
    "JUN": "JUN",
    "MAY": "MAY",
    "ABR": "APR",
    "MAR": "MAR",
    "FEB": "FEB",
    "ENE": "JAN",
}

In [32]:
import unicodedata

def remove_accents(text):
    return "".join(
        c for c in unicodedata.normalize("NFKD", text) if not unicodedata.combining(c)
    )

In [31]:
import docx
from datetime import datetime


filename = "ArId=1240700.docx"
doc = docx.Document(f"../data/{filename}")
resolution_metadata = {}
resolution_values_para = [0,1]

resolution_name = doc.paragraphs[resolution_values_para[0]].text
while not resolution_name.startswith("RESOLUCIÓN"):
    resolution_values_para[0] += 1
    resolution_name = doc.paragraphs[resolution_values_para[0]].text
    resolution_name = remove_accents(resolution_name)

resolution_date = doc.paragraphs[resolution_values_para[1]].text
while not resolution_date.startswith("("):
    resolution_values_para[1] += 1
    resolution_date = doc.paragraphs[resolution_values_para[1]].text

resolution_concept = doc.paragraphs[resolution_values_para[1]+1].text
while resolution_concept == "":
    resolution_values_para[1] += 1
    resolution_concept = doc.paragraphs[resolution_values_para[1]+1].text
    resolution_concept = remove_accents(resolution_concept)

print(resolution_name)
print(resolution_date)
print(resolution_concept)
resolution_date = resolution_date.replace("(", "").replace(")", "").replace(".", " ")
day, month, year = resolution_date.split()
month_en = month_index_translation.get(month, month)
date_english = f"{day} {month_en} {year}"
resolution_date = datetime.strptime(date_english, "%d %b %Y")
resolution_concept = doc.paragraphs[10].text
full_text = [p.text for p in doc.paragraphs]
full_text = [p for p in full_text if p]  # Remove empty paragraphs
full_text = [remove_accents(p) for p in full_text]
resolution_metadata = {
    "name": resolution_name,
    "resolution_date": resolution_date.strftime("%Y-%m-%d"),
    "concept": resolution_concept,
    "full_text": "\n".join(full_text),
    "process_date": datetime.now().strftime("%Y-%m-%d"),
}
resolution_metadata

RESOLUCIÓN No. 502 073 DE 2024
(11 JUL.2024)



{'name': 'RESOLUCIÓN No. 502 073 DE 2024',
 'resolution_date': '2024-07-11',
 'concept': '',
 'full_text': 'Ministerio de Minas y Energía\nCOMISIÓN DE REGULACIÓN DE ENERGÍA Y GAS\nRESOLUCIÓN No. 502 073 DE 2024\n(11 JUL.2024)\nPor la cual se resuelve el recurso de reposición interpuesto por la empresa PROVIGAS COLOMBIA S.A. E.S.P. contra la resolución CREG 502 013 de 2023.\nLA COMISIÓN DE REGULACIÓN DE ENERGÍA Y GAS\nEn ejercicio de sus atribuciones constitucionales y legales, en especial las conferidas por la Ley 142 de 1994 y, en desarrollo de los Decretos 2253 de 1994 y 1260 de 2013, y\nCONSIDERANDO QUE:\nLa Comisión de Regulación de Energía y Gas, en su sesión No. 1244 del 17 de febrero de 2023, aprobó expedir la Resolución CREG 502 012 de 2023, “Por el cual se aprueba el cargo de distribución por uso del sistema de distribución de Gas Natural Comprimido – GNC por redes de tubería para el mercado relevante especial conformado por las veredas de Buenos Aires, Caracol, Caucho, Guarum

In [17]:
# open a word document and get the content
import docx
from datetime import datetime

documents = [f for f in os.listdir("../data") if f.endswith(".docx")]
resolutions = []

for resolution in documents:
    try:
        print(resolution)
        doc = docx.Document(f"../data/{resolution}")
        resolution_metadata = {}
        resolution_values_para = [0,1]

        resolution_name = doc.paragraphs[resolution_values_para[0]].text
        while not resolution_name.startswith("RESOLUCIÓN"):
            resolution_values_para[0] += 1
            resolution_name = doc.paragraphs[resolution_values_para[0]].text

        resolution_date = doc.paragraphs[resolution_values_para[1]].text
        while not resolution_date.startswith("("):
            resolution_values_para[1] += 1
            resolution_date = doc.paragraphs[resolution_values_para[1]].text

        resolution_date = resolution_date.replace("(", "").replace(")", "").replace(".", " ")
        day, month, year = resolution_date.split()
        month_en = month_index_translation.get(month, month)
        date_english = f"{day} {month_en} {year}"
        resolution_date = datetime.strptime(date_english, "%d %b %Y")
        resolution_concept = doc.paragraphs[10].text
        full_text = [p.text for p in doc.paragraphs]
        full_text = [p for p in full_text if p]  # Remove empty paragraphs
        resolution_metadata = {
            "name": resolution_name,
            "resolution_date": resolution_date.strftime("%Y-%m-%d"),
            "concept": resolution_concept,
            "full_text": "\n".join(full_text),
            "process_date": datetime.now().strftime("%Y-%m-%d"),
        }
        resolutions.append(resolution_metadata)
        # delete the document
        os.remove(f"../data/{resolution}")
    except Exception as e:
        print(f"Error processing {doc}: {e}")
        # move doc to another folder
        os.rename(f"../data/{resolution}", f"../data/to_check/{resolution}")
        continue

ArId=1137299.docx
ArId=1146929.docx
Error processing <docx.document.Document object at 0x0000018E782087D0>: Package not found at '../data/ArId=1146929.docx'
ArId=1240700.docx
ArId=1247707.docx
ArId=1260044.docx
Error processing <docx.document.Document object at 0x0000018E77CC3690>: Package not found at '../data/ArId=1260044.docx'
ArId=1278579.docx
ArId=1285714.docx
Error processing <docx.document.Document object at 0x0000018E77CC35B0>: Package not found at '../data/ArId=1285714.docx'
ArId=1301602.docx
ArId=1307671.docx
ArId=1309129.docx
ArId=1309942.docx
ArId=1312179.docx
Error processing <docx.document.Document object at 0x0000018E78211180>: Package not found at '../data/ArId=1312179.docx'
ArId=1312315.docx
Error processing <docx.document.Document object at 0x0000018E78211180>: Package not found at '../data/ArId=1312315.docx'
ArId=1317266.docx
Error processing <docx.document.Document object at 0x0000018E78211180>: Package not found at '../data/ArId=1317266.docx'
ArId=1318022.docx
Erro

In [18]:
# save the list of dicctionaries
import json

file_name = f"resolutions_processed_{datetime.now().strftime("%Y-%m-%d")}.json"
folder = "../data/processed"
file_path = os.path.join(folder, file_name)

# Save as JSON
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(resolutions, f, indent=4)