In [45]:
import os
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError

# --- Load Environment Variables ---
# Pull variables from a .env file (when present) into the process environment.
load_dotenv()

# --- Database Configuration ---
# Read configuration from environment variables.
DB_USER = os.getenv("MYSQL_USER")
DB_PASSWORD = os.getenv("MYSQL_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_NAME = os.getenv("MYSQL_DATABASE")

# Fail fast with a readable message instead of silently building a URL that
# contains the literal string "None" for any unset variable.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ("MYSQL_USER", DB_USER),
        ("MYSQL_PASSWORD", DB_PASSWORD),
        ("DB_HOST", DB_HOST),
        ("MYSQL_DATABASE", DB_NAME),
    )
    if env_value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# --- Create Database Connection URL ---
# The format for the connection string is:
# dialect+driver://username:password@host:port/database
# We are using the 'mysqlconnector' driver.
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' inside the credentials cannot be mistaken for URL delimiters.
DATABASE_URL = (
    f"mysql+mysqlconnector://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}/{DB_NAME}"
)

engine = create_engine(DATABASE_URL)
print("SQLAlchemy engine created successfully.")

# `connection` is intentionally left open at module level: later cells pass it
# to pandas. The version query doubles as a connectivity check.
connection = engine.connect()
result = connection.execute(text("SELECT VERSION()"))
db_version = result.scalar()
print(db_version)

SQLAlchemy engine created successfully.
8.0.43


In [28]:
# Fetch every notice whose id_rubro is 1110 or 1130 into a DataFrame,
# reusing the module-level `connection`.
query = "SELECT * FROM avisos WHERE id_rubro in(1110,1130);"
df = pd.read_sql(sql=query, con=connection)

In [29]:
# Preview the first five rows of the query result.
df.head(n=5)

Unnamed: 0,id,aviso_id,seccion,sociedad,rubro,id_rubro,fecha_publicacion,detalle_aviso,crawled_at
0,321,A322,Segunda sección,TECNOLOGIA Y CABLEADOS,CONSTITUCION SA,1110,2011-01-03,"Por Esc. 255 del 21/12.10, Carlos Alberto Koga...",2025-09-20 02:36:44
1,323,A324,Segunda sección,BOMBAS DE HORMIGON,CONSTITUCION SA,1110,2011-01-03,"Por Esc. 247 del 16/12/10, Rodolfo Gontek, 27/...",2025-09-20 02:36:45
2,325,A326,Segunda sección,BHRISA,CONSTITUCION SA,1110,2011-01-03,"Agustín Rodolfo Spotorno, con DNI 22.501.187, ...",2025-09-20 02:36:45
3,344,A345,Segunda sección,BELFIL,CONSTITUCION SA,1110,2011-01-03,"Carlos Enrique Bilevich, dni: 17029874 y Gabri...",2025-09-20 02:36:49
4,367,A368,Segunda sección,GRUPO SAMIRA,CONSTITUCION SA,1110,2011-01-03,"1) Rosela Beatriz Diaz, 18-7-79, DNI 27368918,...",2025-09-20 02:36:54


In [40]:
# Print the first 100 notice bodies untruncated; the column-width limit is
# lifted only for the duration of this block.
with pd.option_context("display.max_colwidth", None):
    print(df.loc[:, "detalle_aviso"].iloc[:100])


0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [41]:
import langextract as lx
import textwrap

# Task description given to the extractor: role, goal, and the per-notice
# instruction, one sentence per line.
prompt = "\n".join(
    (
        "You are an expert legal assistant specialized in analyzing corporate documents from Argentina's Official Bulletin. ",
        "Your task is to extract structured information about newly incorporated companies (Sociedades Anónimas) from the provided text.",
        "For each notice, identify and extract the classes of information.",
    )
)

# Few-shot examples passed to the extractor. Each ExampleData pairs one notice
# text with the list of extractions expected from it. Both examples use the same
# set of extraction classes: company_name, incorporation_date, company_address,
# duration_years, capital_amount, company_object, fiscal_year_end,
# notary_public, partner (one per partner), director (one per director) and
# publication_details.
#
# NOTE(review): several extraction_text values below do not occur verbatim in
# their example text: both company_object strings use a "..." elision, and the
# second example's partner strings include CUIT and profession values that the
# abbreviated text omits. If the library aligns extraction_text against the
# source text, these will not align — TODO confirm against the LangExtract docs.
# NOTE(review): extractions are not listed in order of appearance in the text
# (e.g. company_name precedes partner here, but follows it in the second
# notice) — confirm whether ordering matters to the library.
examples = [
    # Example 1: notice laid out as numbered sections with explicit labels
    # ("Socios:", "Sede:", "Plazo:", ...).
    lx.data.ExampleData(
        text="Por escritura Nº 65, Fº 175, del 2/7/2025, Registro 1132 de Cap. Fed., se constituyó la sociedad: 1) CUMBRE INGENIERIA S.A. 2) Socios: Juan Ignacio SCHWERDT, nacido el 2/11/89, DNI 34.418.579, CUIT 20-34418579-5, Ing., domiciliado en Dr. Tomás de Anchorena 95, Pilar, Pcia. de Bs. As., y Juan Francisco SCHWERDT, nacido el 22/11/90, DNI 35.795.538, CUIT 20-35795538-7, empresario, domiciliado en Av. Santa Fe 3755, piso 3º, dpto. B, C.A.B.A., ambos argentinos, hijos de Juan Agustín Schwerdt y Zulma Gladys Holzmann, y solteros. 3) Sede: Av. Santa Fe 3755, piso 3º, dpto. B, C.A.B.A. 4) Plazo: 99 años. 5) Objeto: La sociedad tiene por objeto dedicarse, por cuenta propia o de terceros... (Full object text)... Las actividades que así lo requieran serán realizadas por profesionales con título habilitante en la materia. 6) Capital: $ 30.000.000, representado por 30.000.000 acciones... Juan Ignacio SCHWERDT suscribe 28.500.000 acciones; Juan Francisco SCHWERDT suscribe 1.500.000 acciones... 10) Directorio: Presidente: Juan Ignacio SCHWERDT; Director Suplente: Juan Francisco SCHWERDT. 11) Domicilio especial directores: Av. Santa Fe 3755, piso 3º, dpto. B, C.A.B.A. ... 13) Cierre ejercicio: 30/6. Autorizado según instrumento público Esc. Nº 65 de fecha 02/07/2025 Reg. Nº 1132 Roberto Wilkinson - Matrícula: 5241 C.E.C.B.A. e. 10/07/2025 N° 48233/25 v. 10/07/2025",
        extractions=[
            lx.data.Extraction(extraction_class="company_name", extraction_text="CUMBRE INGENIERIA S.A."),
            # Dates keep the notice's own spelling in extraction_text; the ISO form goes in attributes.
            lx.data.Extraction(extraction_class="incorporation_date", extraction_text="2/7/2025", attributes={"normalized_date": "2025-07-02"}),
            lx.data.Extraction(extraction_class="company_address", extraction_text="Av. Santa Fe 3755, piso 3º, dpto. B, C.A.B.A."),
            # Numeric quantities carry a parsed "value" attribute alongside the raw text.
            lx.data.Extraction(extraction_class="duration_years", extraction_text="99 años", attributes={"value": 99}),
            lx.data.Extraction(extraction_class="capital_amount", extraction_text="$ 30.000.000", attributes={"value": 30000000}),
            lx.data.Extraction(extraction_class="company_object", extraction_text="La sociedad tiene por objeto...en la materia."),
            lx.data.Extraction(extraction_class="fiscal_year_end", extraction_text="30/6"),
            lx.data.Extraction(
                extraction_class="notary_public",
                extraction_text="Roberto Wilkinson - Matrícula: 5241 C.E.C.B.A.",
                attributes={"name": "Roberto Wilkinson", "registration": "Matrícula: 5241 C.E.C.B.A."}
            ),
            # One "partner" extraction per partner; shares_subscribed comes from the
            # capital section of the notice, not from the partner's own sentence.
            lx.data.Extraction(
                extraction_class="partner",
                extraction_text="Juan Ignacio SCHWERDT, nacido el 2/11/89, DNI 34.418.579, CUIT 20-34418579-5, Ing., domiciliado en Dr. Tomás de Anchorena 95, Pilar, Pcia. de Bs. As.",
                attributes={
                    "full_name": "Juan Ignacio SCHWERDT", "dni": "34.418.579", "cuit": "20-34418579-5",
                    "address": "Dr. Tomás de Anchorena 95, Pilar, Pcia. de Bs. As.", "nationality": "argentino",
                    "birth_date": "1989-11-02", "marital_status": "soltero", "profession": "Ing.", "shares_subscribed": 28500000
                }
            ),
            lx.data.Extraction(
                extraction_class="partner",
                extraction_text="Juan Francisco SCHWERDT, nacido el 22/11/90, DNI 35.795.538, CUIT 20-35795538-7, empresario, domiciliado en Av. Santa Fe 3755, piso 3º, dpto. B, C.A.B.A.",
                attributes={
                    "full_name": "Juan Francisco SCHWERDT", "dni": "35.795.538", "cuit": "20-35795538-7",
                    "address": "Av. Santa Fe 3755, piso 3º, dpto. B, C.A.B.A.", "nationality": "argentino",
                    "birth_date": "1990-11-22", "marital_status": "soltero", "profession": "empresario", "shares_subscribed": 1500000
                }
            ),
            # One "director" extraction per board member, with role and special address as attributes.
            lx.data.Extraction(
                extraction_class="director",
                extraction_text="Presidente: Juan Ignacio SCHWERDT",
                attributes={"full_name": "Juan Ignacio SCHWERDT", "role": "Presidente", "special_address": "Av. Santa Fe 3755, piso 3º, dpto. B, C.A.B.A."}
            ),
            lx.data.Extraction(
                extraction_class="director",
                extraction_text="Director Suplente: Juan Francisco SCHWERDT",
                attributes={"full_name": "Juan Francisco SCHWERDT", "role": "Director Suplente", "special_address": "Av. Santa Fe 3755, piso 3º, dpto. B, C.A.B.A."}
            ),
            lx.data.Extraction(extraction_class="publication_details", extraction_text="e. 10/07/2025 N° 48233/25 v. 10/07/2025"),
        ]
    ),
    # Example 2: notice with numbered sections but no field labels, and partners
    # listed before the company name.
    lx.data.ExampleData(
        text="1) Francisco MAIOLI, nacido el 11/03/2000, DNI 42.564.527...Gabriel Alejandro MAIOLI, nacido el 16/03/1974, DNI 23.807.266...domiciliados en la Avenida Callao 1137 CABA; 2) 04.07.2025. 3) TEATRO ARGENTINO 1 S.A.; 4) Avenida Callao 1137 CABA; 5) La sociedad tiene por objeto...(Full object text)... 6) 99 años. 7) $ 30.000.000... Gabriel Alejandro MAIOLI, suscribe 24.000.000; Francisco MAIOLI, suscribe 6.000.000; 8) Directorio: ... Se designó presidente a Gabriel Alejandro Maioli; Director Suplente: Francisco Maioli...fijan domicilio especial en la sede social. ... 10) 30/06. Autorizado según instrumento público Esc. Nº 147 de fecha 04/07/2025 Reg. Nº 294 JUDITH LAURA RAQUEL CHEMAYA - Matrícula: 5613 C.E.C.B.A. e. 10/07/2025 N° 48293/25 v. 10/07/2025",
        extractions=[
            lx.data.Extraction(extraction_class="company_name", extraction_text="TEATRO ARGENTINO 1 S.A."),
            lx.data.Extraction(extraction_class="incorporation_date", extraction_text="04.07.2025", attributes={"normalized_date": "2025-07-04"}),
            lx.data.Extraction(extraction_class="company_address", extraction_text="Avenida Callao 1137 CABA"),
            lx.data.Extraction(extraction_class="duration_years", extraction_text="99 años", attributes={"value": 99}),
            lx.data.Extraction(extraction_class="capital_amount", extraction_text="$ 30.000.000", attributes={"value": 30000000}),
            lx.data.Extraction(extraction_class="company_object", extraction_text="La sociedad tiene por objeto...por este Estatuto."),
            lx.data.Extraction(extraction_class="fiscal_year_end", extraction_text="30/06"),
            lx.data.Extraction(
                extraction_class="notary_public",
                extraction_text="JUDITH LAURA RAQUEL CHEMAYA - Matrícula: 5613 C.E.C.B.A.",
                attributes={"name": "JUDITH LAURA RAQUEL CHEMAYA", "registration": "Matrícula: 5613 C.E.C.B.A."}
            ),
            lx.data.Extraction(
                extraction_class="partner",
                extraction_text="Francisco MAIOLI, nacido el 11/03/2000, DNI 42.564.527, CUIT 20-42564527-8, Licenciado en Economía",
                attributes={
                    "full_name": "Francisco MAIOLI", "dni": "42.564.527", "cuit": "20-42564527-8",
                    "address": "Avenida Callao 1137 CABA", "nationality": "argentino",
                    "birth_date": "2000-03-11", "marital_status": "soltero", "profession": "Licenciado en Economía", "shares_subscribed": 6000000
                }
            ),
            lx.data.Extraction(
                extraction_class="partner",
                extraction_text="Gabriel Alejandro MAIOLI, nacido el 16/03/1974, DNI 23.807.266, CUIT 20-23807266-3, empresario",
                attributes={
                    "full_name": "Gabriel Alejandro MAIOLI", "dni": "23.807.266", "cuit": "20-23807266-3",
                    "address": "Avenida Callao 1137 CABA", "nationality": "argentino",
                    "birth_date": "1974-03-16", "marital_status": "soltero", "profession": "empresario", "shares_subscribed": 24000000
                }
            ),
            # The notice says directors set their special address at the company seat,
            # so special_address repeats the company_address value.
            lx.data.Extraction(
                extraction_class="director",
                extraction_text="presidente a Gabriel Alejandro Maioli",
                attributes={"full_name": "Gabriel Alejandro Maioli", "role": "Presidente", "special_address": "Avenida Callao 1137 CABA"}
            ),
            lx.data.Extraction(
                extraction_class="director",
                extraction_text="Director Suplente: Francisco Maioli",
                attributes={"full_name": "Francisco Maioli", "role": "Director Suplente", "special_address": "Avenida Callao 1137 CABA"}
            ),
            lx.data.Extraction(extraction_class="publication_details", extraction_text="e. 10/07/2025 N° 48293/25 v. 10/07/2025"),
        ]
    )
]

In [46]:
# Run the extractor on a single notice (the row labelled 15 of `df`), guided by
# the module-level `prompt` and `examples`.
result = lx.extract(
    model_id="gemini-2.5-flash",
    examples=examples,
    prompt_description=prompt,
    text_or_documents=df.at[15, "detalle_aviso"],
)



In [49]:
# Persist the extraction result as a JSONL file in the current directory.
lx.io.save_annotated_documents([result], output_name="extraction_results.jsonl", output_dir=".")

# Generate the visualization from the file
html_content = lx.visualize("extraction_results.jsonl")

# In Jupyter/Colab the visualization may be a display object exposing the markup
# on `.data`; otherwise it is used as-is.
html_text = getattr(html_content, "data", html_content)

# Write explicitly as UTF-8: the notices contain accented characters, and the
# platform default encoding is not guaranteed to represent them.
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(html_text)


[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 1905.64 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 2.23k/2.23k [00:00<00:00, 4.43MB/s][0m

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m



