## Generar instancias para entrenar el modelo Colombian-conflict-chatbot

In [None]:

!pip install pandas openai PyMuPDF python-dotenv openpyxl tiktoken


Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting openai
  Downloading openai-1.68.2-py3-none-any.whl.metadata (25 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.25.4-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.2.4-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Downloading anyio-4.9.0-py3-none-any.whl.metadata (4.7 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1

In [4]:

import os
import json
import pandas as pd
import fitz  # PyMuPDF for PDF text extraction
import openai
from dotenv import load_dotenv
from tiktoken import encoding_for_model
import time


In [5]:

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")


In [6]:

reports_df = pd.read_excel("../../resources/listado-informes.xlsx")


In [7]:

def extract_text(file_path):
    if file_path.lower().endswith('.pdf'):
        text = ""
        doc = fitz.open(file_path)
        for page in doc:
            text += page.get_text()
        return text.strip()
    elif file_path.lower().endswith('.csv'):
        df = pd.read_csv(file_path)
        return df.to_string(index=False).strip()
    else:
        return None


In [8]:

def load_prompts(ident, title):
    prompts = []
    for i in range(1, 4):
        prompt_path = f"prompts/basic_question{i}.txt"
        with open(prompt_path, "r", encoding="utf-8") as file:
            prompt = file.read().replace("{{report_name}}", f"{ident} {title}")
            prompts.append(prompt)
    return prompts


In [9]:
from openai import OpenAI

client = OpenAI(api_key=openai.api_key)
model_name = "gpt-4o"




# Count tokens
def count_tokens(text, model=model_name):
    encoding = encoding_for_model(model)
    return len(encoding.encode(text))

# Split text into chunks based on token limits
def split_text_into_chunks(text, max_tokens, model=model_name):
    words = text.split()
    chunks, current_chunk = [], []

    for word in words:
        current_chunk.append(word)
        if count_tokens(" ".join(current_chunk), model) >= max_tokens:
            chunks.append(" ".join(current_chunk[:-1]))
            current_chunk = [word]

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Generate instances with OpenAI's modern API
import re

def extract_json_from_response(response_text):
    try:
        # Extraer contenido JSON dentro de ```json ```
        json_match = re.search(r'```json\s*(.*?)\s*```', response_text, re.DOTALL)
        if json_match:
            json_content = json_match.group(1)
        else:
            json_content = response_text.strip()
        return json.loads(json_content)
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        print(f"Original response: {response_text}")
        return None

def generate_instances(text_chunks, prompts):
    combined_text = "\n\n".join(text_chunks)
    full_prompt = f"{combined_text}\n\n" + "\n\n".join(prompts) + "\n\n" \
                  "Please generate exactly three question-answer-context sets in JSON format. " \
                  "Return the JSON response directly, without explanations or additional text."

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You generate questions and answers from documents, returning JSON only."},
                {"role": "user", "content": full_prompt}
            ],
            temperature=0.3,
            max_tokens=3000
        )
        content = response.choices[0].message.content
        print(f"Generated content: {content}")
        instances = extract_json_from_response(content)
        return instances
    except Exception as e:
        print(f"Error generating instances: {e}")
        return None


In [11]:
# Main loop to process reports and generate dataset instances
reports_folder = "../../reports-pdf"
instances_generated = []

for idx, row in reports_df.head(10).iterrows():
    ident = row['ident']
    title = row['title']

    pdf_path = os.path.join(reports_folder, f"{ident}.pdf")

    if not os.path.exists(pdf_path):
        instances_generated.append({
            "report_id": ident,
            "status": "not-found",
            "created_by": model_name
        })
        continue

    text = extract_text(pdf_path)
    if not text:
        instances_generated.append({
            "report_id": ident,
            "status": "no-text",
            "created_by": model_name
        })
        continue

    max_tokens_per_chunk = 3000
    text_chunks = split_text_into_chunks(text, max_tokens_per_chunk)

    prompts = load_prompts(ident, title)
    instances = generate_instances(text_chunks, prompts)
    
    if instances and isinstance(instances, list):
        for instance in instances:
            if isinstance(instance, dict) and "question" in instance:
                instance.update({
                    "report_id": ident,
                    "status": "generated",
                    "created_by": model_name
                })
                instances_generated.append(instance)
            else:
                instances_generated.append({
                    "report_id": ident,
                    "status": "error",
                    "created_by": model_name
                })
    else:
        instances_generated.append({
            "report_id": ident,
            "status": "error",
            "created_by": model_name
        })
    print(f"Waiting 60 seconds to respect token rate limits...")
    time.sleep(60)



Generated content: ```json
[
    {
        "question": "¿Cuáles son los hallazgos principales del informe 058-CI-00233 Informe agroarte Colombia: Red Territorial de la Memoria sobre el conflicto armado colombiano?",
        "context": "Los argumentos conforme a los que estimamos necesario realizar actuaciones efectivas destinadas al reconocimiento y la protección de los lugares memoriales generados por la interacción de víctimas y colectivos sociales en los territorios entroncan con dos elementos esenciales de la realidad colombiana actual: - El valor de estos espacios para la consecución de una paz estable y duradera por el papel que potencialmente les corresponde en la construcción de memorias e identidades locales, - La ausencia, en el ordenamiento jurídico colombiano, de fórmulas articuladas que sean útiles para la defensa de su pervivencia en el tiempo.",
        "answer": "El informe destaca la importancia de los espacios de memoria para construir una paz duradera en Colombia, su

In [12]:
# Cell 9: Save generated instances to JSON file
with open("instances_generated.json", "w", encoding="utf-8") as f:
    json.dump(instances_generated, f, ensure_ascii=False, indent=4)


## Buscar el archivo con mas texto para saber cual es el mas largo y tenerlo como referencia

- el más grande tiene 162.000 palabras

Buscando los 10 archivos PDF/CSV más grandes...

Procesando los archivos...

Resultados:
+----+-------------------+--------+---------------+------------+
|    | Archivo           | Tipo   |   Tamaño (MB) |   Palabras |
+====+===================+========+===============+============+
|  0 | 365-CI-01208.pdf  | PDF    |        415.01 |      27571 |
+----+-------------------+--------+---------------+------------+
|  1 | 119-CI-00045.pdf  | PDF    |        358.58 |     339421 |
+----+-------------------+--------+---------------+------------+
|  2 | 748-CI-00863.pdf  | PDF    |        327.56 |      91304 |
+----+-------------------+--------+---------------+------------+
|  3 | 748-CI-00864.pdf  | PDF    |        327.56 |      91304 |
+----+-------------------+--------+---------------+------------+
|  4 | 365-CI-01260.pdf  | PDF    |        270.28 |     219309 |
+----+-------------------+--------+---------------+------------+
|  5 | 119-CI-00315.pdf  | PDF    |        258.05 |          0 |
+----+-------------------+--------+---------------+------------+
|  6 | 1308-CI-02024.pdf | PDF    |        210.18 |     168609 |
+----+-------------------+--------+---------------+------------+
|  7 | 365-CI-01192.pdf  | PDF    |        169.5  |     154781 |
+----+-------------------+--------+---------------+------------+
|  8 | 365-CI-01242.pdf  | PDF    |        152.88 |     103932 |
+----+-------------------+--------+---------------+------------+
|  9 | 748-CI-00872.pdf  | PDF    |        142.92 |      87690 |
+----+-------------------+--------+---------------+------------+

In [15]:
!pip install PyPDF2 tabulate



In [16]:
import os
import json
from PyPDF2 import PdfReader
from tabulate import tabulate

def get_largest_files(directory, extensions=('.pdf', '.csv'), top_n=10):
    """Obtiene los archivos más grandes con extensiones específicas"""
    files = []
    for entry in os.scandir(directory):
        if entry.is_file() and entry.name.lower().endswith(extensions):
            files.append((entry.path, entry.stat().st_size))
    
    # Ordenar por tamaño (de mayor a menor) y tomar los top_n
    files.sort(key=lambda x: x[1], reverse=True)
    return [file[0] for file in files[:top_n]]

def count_words_pdf(file_path):
    """Cuenta palabras en un archivo PDF"""
    try:
        reader = PdfReader(file_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""
        return len(text.split())
    except Exception as e:
        print(f"Error procesando PDF {file_path}: {str(e)}")
        return 0

def count_words_csv(file_path):
    """Cuenta palabras en un archivo CSV"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            return len(content.split())
    except Exception as e:
        print(f"Error procesando CSV {file_path}: {str(e)}")
        return 0

def process_files(file_paths):
    """Procesa los archivos y devuelve los resultados"""
    results = []
    for file_path in file_paths:
        size_mb = os.path.getsize(file_path) / (1024 * 1024)
        if file_path.lower().endswith('.pdf'):
            word_count = count_words_pdf(file_path)
        else:
            word_count = count_words_csv(file_path)
        
        results.append({
            "Archivo": os.path.basename(file_path),
            "Tipo": "PDF" if file_path.lower().endswith('.pdf') else "CSV",
            "Tamaño (MB)": f"{size_mb:.2f}",
            "Palabras": word_count
        })
    return results

def main():
    directory = input("Ingrese la ruta del directorio a analizar (deje vacío para usar el actual): ").strip()
    if not directory:
        directory = os.getcwd()
    
    if not os.path.isdir(directory):
        print("El directorio especificado no existe")
        return
    
    print("\nBuscando los 10 archivos PDF/CSV más grandes...")
    largest_files = get_largest_files(directory)
    
    if not largest_files:
        print("No se encontraron archivos PDF o CSV en el directorio")
        return
    
    print("\nProcesando los archivos...")
    results = process_files(largest_files)
    
    # Mostrar resultados en tabla
    print("\nResultados:")
    print(tabulate(results, headers="keys", tablefmt="grid", showindex=True))

if __name__ == "__main__":
    # Instalar dependencias si no están disponibles
    try:
        from PyPDF2 import PdfReader
        from tabulate import tabulate
    except ImportError:
        print("Instalando dependencias necesarias...")
        import subprocess
        subprocess.run(['pip', 'install', 'PyPDF2', 'tabulate'], check=True)
        from PyPDF2 import PdfReader
        from tabulate import tabulate
    
    main()


Buscando los 10 archivos PDF/CSV más grandes...

Procesando los archivos...

Resultados:
+----+-------------------+--------+---------------+------------+
|    | Archivo           | Tipo   |   Tamaño (MB) |   Palabras |
|  0 | 365-CI-01208.pdf  | PDF    |        415.01 |      27571 |
+----+-------------------+--------+---------------+------------+
|  1 | 119-CI-00045.pdf  | PDF    |        358.58 |     339421 |
+----+-------------------+--------+---------------+------------+
|  2 | 748-CI-00863.pdf  | PDF    |        327.56 |      91304 |
+----+-------------------+--------+---------------+------------+
|  3 | 748-CI-00864.pdf  | PDF    |        327.56 |      91304 |
+----+-------------------+--------+---------------+------------+
|  4 | 365-CI-01260.pdf  | PDF    |        270.28 |     219309 |
+----+-------------------+--------+---------------+------------+
|  5 | 119-CI-00315.pdf  | PDF    |        258.05 |          0 |
+----+-------------------+--------+---------------+------------+
