# Find all the types of documents possible in supercias
### We need to determine how many types of documents exist in supercias and how frequent each one is

In [None]:
import importlib.util
import subprocess
import sys

# pip distribution names this notebook depends on.
# NOTE(review): 'gridfs' is normally provided by the pymongo distribution;
# since pymongo is handled first it should already be importable here.
packages = ['pymongo', 'gridfs', 'pandas', 'pymupdf', 'pytesseract', 'pdf2image', 'openai', 'python-dotenv']
# Some distributions are imported under a module name that differs from the
# name given to pip. Probing the pip name for those always fails, which would
# trigger a reinstall on every run, so list the module names to probe instead.
import_names = {
    'pymupdf': ('pymupdf', 'fitz'),
    'python-dotenv': ('dotenv',),
}
for package in packages:
    candidates = import_names.get(package, (package,))
    if any(importlib.util.find_spec(name) is not None for name in candidates):
        continue
    # Install with the interpreter that runs this notebook so the package
    # lands in the same environment the imports below will use.
    subprocess.run([sys.executable, '-m', 'pip', 'install', package], check=False)
    # Make the freshly installed package visible to later probes/imports.
    importlib.invalidate_caches()

import os
from pymongo import MongoClient
import pandas as pd
from functions.pdf_operations import PDFOperations
from functions.prompt_openai import query_openai
# MongoDB host:port. Alternatives kept for switching networks:
# endpoint = '10.0.10.5:27017'       # use vpn
# endpoint = '192.168.229.55:27017'  # use house wifi
endpoint = '192.168.1.10:27017'  # use lan
database = 'supercias'
collection = 'clean_companies'

# Open the client and select the working database.
mongo_client = MongoClient(f'mongodb://{endpoint}')
db = mongo_client[database]
# `collection` starts as the collection name and is rebound to its handle here.
collection = db[collection]

# Directory where the PDFs will be stored.
pdfs_path = '../storage/pdfs/'
# PDF helper; extract_data_from_pdf later calls its extract_text().
pdf_op = PDFOperations(db, 'companies', pdfs_path)
# Cursor over every document of the companies collection.
companies_cursor = collection.find()

Let's separate the documents by the type they can belong to: Generales, Economicos and Juridicos

In [2]:
# Gather the document entries of every company, bucketed by category.
companies = collection.find()
Documentos_generales = []
Documentos_economicos = []
Documentos_juridicos = []

for company in companies:
    # each company record groups its documents under 'Documentos'
    documentos = company['Documentos']
    Documentos_generales.extend(documentos['Generales'])
    Documentos_economicos.extend(documentos['Economicos'])
    Documentos_juridicos.extend(documentos['Juridicos'])

Get all the different kinds of documentos generales into a DataFrame, from 2010 onwards

In [None]:
def proces_documents(documentos, label_fields):
    """Parse underscore-delimited document filenames into labelled records.

    Each filename has its final extension removed and is then split on '_';
    the resulting pieces are paired, in order, with ``label_fields``.

    Args:
        documentos: iterable of filename strings.
        label_fields: ordered list of labels for the underscore-separated
            pieces of each filename.

    Returns:
        A list with one dict per filename. Each dict maps the labels that
        had a matching piece to that piece, plus a 'filename' key holding
        the original, unmodified filename. Filenames with fewer pieces than
        labels simply omit the trailing labels.
    """
    # Split at most len(label_fields) - 1 times so that any surplus pieces
    # stay joined inside the last label instead of overrunning the labels.
    max_splits = len(label_fields) - 1
    docs = []
    for documento in documentos:
        # Strip only the last extension: splitting on the first dot would
        # cut off everything after a dot inside a field (e.g. "A. PEREZ").
        stem = documento.rsplit('.', 1)[0]
        fields = stem.split('_', max_splits) if label_fields else []
        new_doc = dict(zip(label_fields, fields))
        # add filename at the end
        new_doc['filename'] = documento
        docs.append(new_doc)
    return docs

doc_generales_fields = ['ruc', 'tipo_doc', 'documento', 'fecha', 'nombre', 'cargo', 'other']
docs = proces_documents(Documentos_generales, doc_generales_fields)
# Name the columns explicitly so every label column exists even when no
# filename carried that field (or when there are no documents at all).
df_generales = pd.DataFrame(docs, columns=doc_generales_fields + ['filename'])
# Unparseable dates become NaT instead of aborting the whole cell; NaT never
# satisfies the comparison below, so those rows are dropped by the filter.
df_generales['fecha'] = pd.to_datetime(df_generales['fecha'], errors='coerce')
# Keep the documents from 2010 onwards, including 2010-01-01 itself.
df_generales = df_generales[df_generales['fecha'] >= '2010-01-01']
df_generales.head()

In [None]:
# Example filename:
# 0791841030001_DocumentosGenerales_Formulario de actualización de datos_2022-08-04.pdf
# Show how many documents there are of each 'documento' type.
document_counts = df_generales['documento'].value_counts()
print(document_counts)

We define the schema used to scrape the Oficio Nombramiento Administradores documents

In [7]:

# Structure to extract from an 'Oficio Nombramiento Administradores' document.
# The repeated leaf entries are built by small helpers; the resulting dict has
# the same keys, values and key order as a hand-written literal.


def _string_field(description):
    """Return the schema entry for a plain string field."""
    return {"description": description, "type": "string"}


def _holder_fields(which):
    """Return the schema entry for the 'previous' or 'new' position holder."""
    return {
        "description": f"the {which} holder of the position",
        "type": "object",
        "name": _string_field(f"the name of the {which} holder"),
        "designated_date": _string_field(f"the date the {which} holder was designated"),
        "id_number": _string_field(
            f"the id number of the {which} holder, can be cedula, ruc, passport, etc"
        ),
        "id_type": _string_field(f"the type of id of the {which} holder"),
    }


schema = {
    "company": {
        "type": "object",
        "description": "the company that is being represented in the document",
        "ruc": _string_field("the ruc of the company"),
        "name": _string_field("the name of the company"),
        "region": _string_field("the region of the company"),
        "date": _string_field("the date of the document"),
        "address": _string_field("the address of the company"),
        "phone": _string_field("the phone of the company"),
        "notary": _string_field("the notary of the document"),
        "lawyer": _string_field("the lawyer of the document"),
    },
    # NOTE(review): the key is spelled "appoitment" (sic). It is kept as-is
    # because it is part of the data sent to the model and stored afterwards.
    "appoitment": {
        "description": "this is the appointment of the new holder",
        "type": "object",
        "position": _string_field("the position of the new holder"),
        "term_years": _string_field("the term in years of the new holder"),
        "appointment_date": _string_field("the date of the appointment"),
        "previous_holder": _holder_fields("previous"),
        "new_holder": _holder_fields("new"),
    },
}

Now we extract the text from every available Oficio Nombramiento Administradores

We need to define what data we want to extract from every Oficio Nombramiento Administradores

In [8]:
import concurrent.futures
import threading
import logging
import traceback

# Logger output: timestamp, emitting thread name, message.
logging.basicConfig(format='%(asctime)s - %(threadName)s - %(message)s', level=logging.INFO)

# Global lock available for serialising prints across threads (if needed).
print_lock = threading.Lock()

# MongoDB collection that receives the extracted
# 'Oficio Nombramiento Administradores' records.
# NOTE: this rebinds the module-level name `collection`.
collection = db['Oficio_Nombramiento_Administradores']

# Select every row whose document type is 'Oficio Nombramiento Administradores'
# (no date filter is applied at this point).
is_oficio_nombramiento = df_generales['documento'] == 'Oficio Nombramiento Administradores'
df_oficio_nombramiento_administradores = df_generales[is_oficio_nombramiento]


def _parse_model_response(raw):
    """Turn the model's reply into a dict without executing it as code."""
    import ast
    import json

    if isinstance(raw, dict):
        return dict(raw)
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):
        # Not strict JSON (e.g. a Python-style dict with single quotes):
        # literal_eval accepts literals only and never runs arbitrary code.
        parsed = ast.literal_eval(raw)
    if not isinstance(parsed, dict):
        raise ValueError(f'Expected a JSON object, got {type(parsed).__name__}')
    return parsed


def extract_data_from_pdf(row, collection):
    """Extract structured data from one PDF and store it in ``collection``.

    The text of the file named by ``row['filename']`` is read through the
    module-level ``pdf_op``, sent to ``query_openai`` together with the
    module-level ``schema``, and the parsed reply is inserted into
    ``collection`` after being tagged with the row's 'ruc', 'fecha',
    'nombre', 'cargo' and the filename.

    Nothing is inserted when the filename is already present in
    ``collection`` or when the extracted text has fewer than 100 or more
    than 16385 whitespace-separated words.

    Args:
        row: mapping (e.g. a DataFrame row) with the keys 'filename', 'ruc',
            'fecha', 'nombre' and 'cargo'.
        collection: object providing ``find_one`` and ``insert_one``.

    Returns:
        None. Any exception is caught and reported with ``print`` so that a
        single bad file does not stop a batch run.
    """
    # Bound before the try so the error message below can always be built,
    # even when reading row['filename'] is the step that fails.
    filename = '<unknown>'
    try:
        filename = row['filename']
        # skip files that were already processed
        if collection.find_one({'filename': filename}):
            return
        print(f'Extracting data from {filename}')
        text = pdf_op.extract_text(filename)
        # Size of the text, measured in whitespace-separated words.
        tokens = len(text.split())
        if tokens < 100:
            print(f'Text size: {tokens}')
            print(f'Not enough text in {filename}')
            return
        # NOTE(review): 16385 presumably mirrors the model's context limit;
        # it is compared against a word count, not real tokens - confirm.
        if tokens > 16385:
            print(f'Text size: {tokens}')
            print(f'Too much text in {filename}')
            return
        # query open AI
        raw_response = query_openai("extract the following data in the json format, with any N/A, from the following text: " + text, schema=schema)
        # The reply is external data: parse it instead of passing it to
        # eval(), which would execute it and also rejects JSON null/true/false.
        response = _parse_model_response(raw_response)
        response['ruc'] = row['ruc']
        response['fecha'] = row['fecha']
        response['nombre'] = row['nombre']
        response['cargo'] = row['cargo']
        response['filename'] = filename
        print(response)
        # add to the collection
        collection.insert_one(response)
        print('-----------------------------------')
    except Exception as e:
        print(f'Error extracting data from {filename}: {e}')


Run Sequentially

In [None]:

# Smoke test on a single known file before running over the whole set.
filename = '1790721450001_DocumentosGenerales_Oficio Nombramiento Administradores_2020-09-04_ALVERNIA DE CHACON IMELDA_PRESIDENTE.pdf'
# Keep the one-file subset under its own name: rebinding
# df_oficio_nombramiento_administradores here would leave only this row for
# any later cell (e.g. the multithreaded run).
df_single_file = df_oficio_nombramiento_administradores[df_oficio_nombramiento_administradores['filename'] == filename]
for _, row in df_single_file.iterrows():
    extract_data_from_pdf(row, collection)

Run multithreaded

In [None]:
# Multithreading implementation
def process_pdf_data_in_parallel(df, collection, max_workers=5):
    """Run extract_data_from_pdf over every row of ``df`` on a thread pool.

    Args:
        df: DataFrame whose rows are handed, one per task, to
            extract_data_from_pdf.
        collection: passed through unchanged to every task.
        max_workers: size of the thread pool.

    Returns:
        None. An exception raised by a task is printed together with a
        traceback and does not interrupt the remaining tasks.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for _, row in df.iterrows():
            # one task per row; the collection is handed over explicitly
            pending.append(pool.submit(extract_data_from_pdf, row, collection))

        for finished in concurrent.futures.as_completed(pending):
            try:
                # result() re-raises whatever the task raised
                finished.result()
            except Exception as exc:
                print(f"Exception in thread: {exc}")
                traceback.print_exc()


# Adjust the number of threads as needed
max_threads = 10  # Example: use 10 threads

# Execute the function with multithreading, passing the collection explicitly
# (currently disabled; uncomment the call below to run it)
#process_pdf_data_in_parallel(df_oficio_nombramiento_administradores, collection, max_threads)