In [None]:
!pip install boto3 aiohttp asyncio

# **Creación del dataset tabular**

In [3]:
import asyncio
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import time
from getpass import getpass
import asyncio
import aiohttp
import boto3
import time
from concurrent.futures import ThreadPoolExecutor

## Iniciar sesión con credenciales de AWS

In [7]:
# Ask for the credentials interactively so they are never written into the notebook.
aws_access_key_id = getpass("Access Key: ")
aws_secret_access_key = getpass("Secret Key: ")

# One shared session; both service clients below are created from it.
session = boto3.Session(
    region_name="us-east-1",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)
s3 = session.client(service_name="s3")
textract = session.client(service_name="textract")

Access Key: ··········
Secret Key: ··········


## Leer dataset de pdfs en el bucket de S3

In [8]:
def list_pdf_files(bucket_name, prefix='train/', s3_client=None):
    """List the PDF objects stored under ``prefix`` and derive a label for each.

    The label is the first folder found below ``prefix`` (for example
    ``train/invoice/a.pdf`` -> ``invoice``). Taking the first component of the
    full key would always yield the prefix folder itself, because every listed
    key starts with ``prefix``. PDFs stored directly under ``prefix`` fall back
    to the last folder name of the prefix.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix that restricts the listing. Defaults to ``'train/'``.
        s3_client: Optional S3 client exposing ``get_paginator``. When omitted,
            the notebook-level ``s3`` client is used.

    Returns:
        A list of ``(key, label)`` tuples, in listing order.
    """
    # Fall back to the notebook-level client only when none is injected.
    client = s3 if s3_client is None else s3_client
    paginator = client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

    # Label used for PDFs that sit directly under the prefix (no sub-folder).
    fallback_label = prefix.rstrip('/').split('/')[-1]

    pdf_files = []
    for page in page_iterator:
        # Pages without matching objects carry no 'Contents' entry.
        for obj in page.get('Contents', []):
            key = obj['Key']
            # Accept upper-case extensions such as '.PDF' as well.
            if not key.lower().endswith('.pdf'):
                continue
            relative_parts = key[len(prefix):].lstrip('/').split('/')
            label = relative_parts[0] if len(relative_parts) > 1 else fallback_label
            pdf_files.append((key, label))
    return pdf_files

# Name of the S3 bucket that is scanned for PDF documents.
bucket_name = "meli-challege-bucket"
# (key, label) tuples for every PDF returned by list_pdf_files.
pdf_files = list_pdf_files(bucket_name)
print("Cantidad de documentos encontrados:", len(pdf_files))

Cantidad de documentos encontrados: 185


## Inicia el análisis de texto de un PDF en S3 usando Textract y devuelve el job_id.

In [8]:
async def load_pdf_and_start_textract(file_key: str, label: str, bucket_name: str, textract_client=None):
    """Start an asynchronous Textract text-detection job for one S3 object.

    The document is handed to Textract by its S3 location, so it is not
    downloaded here. The blocking boto3 call runs in the event loop's default
    executor so other coroutines keep making progress.

    Args:
        file_key: S3 key of the document to analyse.
        label: Label associated with the document; returned unchanged.
        bucket_name: Bucket that contains ``file_key``.
        textract_client: Optional client exposing
            ``start_document_text_detection``. When omitted, the notebook-level
            ``textract`` client is used.

    Returns:
        A dict with the keys ``"job_id"``, ``"file_key"`` and ``"label"``.
    """
    client = textract if textract_client is None else textract_client

    def _start_job():
        response = client.start_document_text_detection(
            DocumentLocation={'S3Object': {'Bucket': bucket_name, 'Name': file_key}}
        )
        return response['JobId']

    # get_running_loop() is the supported way to reach the loop from a coroutine.
    loop = asyncio.get_running_loop()
    job_id = await loop.run_in_executor(None, _start_job)

    return {"job_id": job_id, "file_key": file_key, "label": label}

## Espera el resultado del job de Textract y devuelve el texto extraído

In [13]:
def get_textract_result(job_id, poll_interval=5, timeout=None, textract_client=None):
    """Wait for a Textract text-detection job and return its extracted text.

    Polls the job until it leaves the in-progress state, then walks every
    result page (following ``NextToken``) and joins the text of all ``LINE``
    blocks with single spaces.

    Args:
        job_id: Identifier returned when the Textract job was started.
        poll_interval: Seconds to sleep between status checks. Defaults to 5.
        timeout: Maximum number of seconds to wait for the job to finish.
            ``None`` (the default) waits indefinitely.
        textract_client: Optional client exposing
            ``get_document_text_detection``. When omitted, the notebook-level
            ``textract`` client is used.

    Returns:
        The extracted text, or ``""`` when the job reports ``FAILED``.

    Raises:
        TimeoutError: If ``timeout`` is set and the job is still running once
            it has elapsed.
    """
    client = textract if textract_client is None else textract_client
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        response = client.get_document_text_detection(JobId=job_id)
        status = response['JobStatus']
        # PARTIAL_SUCCESS is terminal too; without this branch the loop never ends.
        if status in ('SUCCEEDED', 'PARTIAL_SUCCESS'):
            break
        if status == 'FAILED':
            # Best effort: a failed document yields empty text instead of aborting the batch.
            return ""
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(f"Textract job {job_id} did not finish within {timeout} seconds")
        time.sleep(poll_interval)

    lines = []
    while True:
        lines.extend(
            block["Text"]
            for block in response.get('Blocks', [])
            if block["BlockType"] == "LINE"
        )
        # Results are paginated; keep requesting pages until no token is returned.
        next_token = response.get('NextToken')
        if not next_token:
            break
        response = client.get_document_text_detection(JobId=job_id, NextToken=next_token)
    return " ".join(lines)

## Procesa varios PDFs: lanza los jobs y obtiene el texto de cada uno.

In [14]:
async def process_all_files(file_list, bucket_name, max_workers=5):
    """Start a Textract job per document and collect the extracted text.

    All jobs are started first. Their results are then polled concurrently on
    a thread pool; awaiting each poll one after another would leave all but
    one worker idle.

    Args:
        file_list: Iterable of ``(file_key, label)`` pairs.
        bucket_name: Bucket that contains every ``file_key``.
        max_workers: Size of the thread pool used to poll results. Defaults to 5.

    Returns:
        A list of dicts with the keys ``"filename"``, ``"label"`` and
        ``"text"``, in the same order as ``file_list``.
    """
    loop = asyncio.get_running_loop()

    jobs = await asyncio.gather(*(
        load_pdf_and_start_textract(file_key, label, bucket_name)
        for file_key, label in file_list
    ))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit every poll up front; gather() returns the texts in job order.
        texts = await asyncio.gather(*(
            loop.run_in_executor(executor, get_textract_result, job['job_id'])
            for job in jobs
        ))

    return [
        {
            "filename": job["file_key"],
            "label": job["label"],
            "text": text
        }
        for job, text in zip(jobs, texts)
    ]

## Guardar resultados en un archivo csv

In [15]:
def save_results_to_csv(results, filename="dataset.csv"):
    """Write ``results`` to ``filename`` as CSV and print a confirmation.

    Args:
        results: Records accepted by ``pd.DataFrame``.
        filename: Destination path. Defaults to ``"dataset.csv"``.
    """
    # The index is omitted so the file holds only the record columns.
    pd.DataFrame(results).to_csv(filename, index=False)
    print("CSV guardado como", filename)


In [None]:
# Top-level await (supported by the notebook kernel): run the pipeline over every listed PDF.
results = await process_all_files(pdf_files, bucket_name)

In [None]:
# Write the collected rows to the function's default file, "dataset.csv".
save_results_to_csv(results)