In [2]:
import json
import argparse
from pathlib import Path
from typing import List, Dict, Optional
import random


In [None]:
def load_chunks(file_path: Path) -> List[Dict]:
    """
    Load chunk records from a JSONL file (one JSON document per line).

    Loading is best-effort: blank lines and lines that are not valid JSON are
    reported on stdout and skipped, and an error while opening or reading the
    file is reported instead of raised, so the caller always gets a list back.

    Args:
        file_path: Location of the JSONL file. A plain string path is
            accepted as well as a ``Path``.

    Returns:
        The successfully parsed records, in file order. The list is empty when
        the file does not exist, and holds only the records parsed so far when
        reading fails part-way through.
    """
    # Coerce so that callers passing a plain string do not fail on .exists().
    file_path = Path(file_path)

    chunks = []
    if not file_path.exists():
        return chunks

    try:
        # 'utf-8-sig' reads plain UTF-8 unchanged but also strips a leading
        # byte-order mark, which json.loads would otherwise reject on the
        # first record.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            for line_num, line in enumerate(f, 1):
                if not line.strip():
                    print(f"‚ö†Ô∏è  L√≠nea {line_num} est√° vac√≠a")
                    continue
                try:
                    chunks.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # A single malformed record must not abort the whole load.
                    print(f"‚ö†Ô∏è  Error en l√≠nea {line_num} de {file_path}: {e}")
                    continue
    except Exception as e:
        # Deliberately broad: report the failure and return what was parsed.
        print(f"‚ùå Error leyendo {file_path}: {e}")

    return chunks
        
        

In [None]:
import re


def filter_chunk_quality(chunk: Dict, min_words: int = 50, max_words: int = 2000) -> Optional[str]:
    """
    Return the cleaned text of a chunk, or None when the chunk is unusable.

    The text is read from the chunk's 'content' key. Words are the
    whitespace-separated tokens of that text.

    Args:
        chunk: Chunk record; only its 'content' entry is read.
        min_words: Chunks with fewer words than this are rejected.
        max_words: Longer chunks are truncated to this many words.

    Returns:
        The text with every run of whitespace collapsed to a single space and
        cut to at most ``max_words`` words, or None when 'content' is missing,
        is not a string, is empty/whitespace-only, or has fewer than
        ``min_words`` words.
    """
    text = chunk.get('content')
    # Non-string content (None, numbers, nested structures) cannot be
    # word-counted; treat it as an unusable chunk instead of raising.
    if not isinstance(text, str):
        return None

    words = text.split()
    # No tokens means the text was empty or whitespace-only; this is rejected
    # even when min_words is 0.
    if not words:
        return None

    if len(words) < min_words:
        return None

    if len(words) > max_words:
        words = words[:max_words]

    # Joining the tokens with single spaces is what normalises the whitespace.
    return ' '.join(words)
    
    



In [None]:

def format_text_with_context(text: str, metadata: Dict, include_context: bool = True) -> str:
    """
    Prefix a chunk's text with a header built from its metadata.

    Each available metadata field becomes one "Label: value" line. The header
    lines are joined with newlines and separated from the text by one blank
    line.

    Args:
        text: Chunk text.
        metadata: Chunk metadata. May be None or empty, in which case the
            text is returned unchanged.
        include_context: When False, the text is returned unchanged.

    Returns:
        The text preceded by the context header, or the text alone when
        context is disabled or no metadata field yields a header line.
    """
    if not include_context or not metadata:
        return text

    # The document title may be stored under either key.
    title = metadata.get('doc_title') or metadata.get('title', '')

    # (label, value, keep_falsy) in output order. Descriptive fields are
    # dropped when empty. Numeric fields are dropped only when None, so that
    # a legitimate 0 (e.g. section level 0) is still shown.
    candidates = [
        ("Document", title, False),
        ("Source", metadata.get('source_type'), False),
        ("Category", metadata.get('category'), False),
        ("Section", metadata.get('section_title'), False),
        ("Section Level", metadata.get('section_level'), True),
        ("Page (real)", metadata.get('page_num_real'), True),
        ("Page (logical)", metadata.get('page_num_logical'), True),
        # Source-specific context (MITRE). A key that is present but holds
        # None or an empty value is skipped rather than rendered as "None".
        ("MITRE Tactic", metadata.get('tactic'), False),
        ("MITRE Technique", metadata.get('technique_id'), False),
    ]

    context_parts = [
        f"{label}: {value}"
        for label, value, keep_falsy in candidates
        if (value is not None if keep_falsy else value)
    ]

    if context_parts:
        context = "\n".join(context_parts)
        return f"{context}\n\n{text}"

    return text


In [11]:
# TEST CASE: load chunks from a real file
# Banner: rule, title, rule, one item per output line.
print(
    "=" * 70,
    "CASO DE PRUEBA: Cargar chunks desde archivo JSONL",
    "=" * 70,
    sep="\n",
)

# Input file taken from the project data.
# NOTE(review): the active path is absolute and machine-specific; the
# commented alternative below is relative. Confirm which one is intended.
# test_file = Path("data/chunks/MITRE/pages.chunks.jsonl")
test_file = Path(
    "/Users/marcosespana/Desktop/U/DatosTesis/data/chunks/AISecKG/pages.chunks.jsonl"
)


CASO DE PRUEBA: Cargar chunks desde archivo JSONL


In [12]:

# Report which file will be read and whether it exists on disk.
print(f"\nüìÇ Archivo: {test_file}")
print(f"   Existe: {test_file.exists()}\n")



üìÇ Archivo: /Users/marcosespana/Desktop/U/DatosTesis/data/chunks/AISecKG/pages.chunks.jsonl
   Existe: True



In [13]:

# Load the chunks from the test file
chunks = load_chunks(test_file)

# Show how many records were loaded
print(f"‚úÖ Total de chunks cargados: {len(chunks)}")

‚úÖ Total de chunks cargados: 39


In [21]:
# Inspect the first loaded chunk: its keys, a preview of its content and a
# few of its metadata fields. Nothing is printed when no chunks were loaded.
if chunks:
    print(f"\nüìÑ Primer chunk:")
    first_chunk = chunks[0]
    print(f"   Keys disponibles: {list(first_chunk.keys())}")

    # Content preview (first 200 characters), printed once under its label.
    # Falls back to the 'text' key when 'content' is missing or empty, and is
    # skipped when there is no string to slice.
    content = first_chunk.get('content') or first_chunk.get('text', '')
    if isinstance(content, str) and content:
        print(f"\n   Contenido:")
        print(f"   {content[:200]}...")

    if "metadata" in first_chunk:
        print(f"\n   Metadata:")
        metadata = first_chunk['metadata']
        # Only the first five fields, to keep the output short.
        for key, value in list(metadata.items())[:5]:
            print(f"     - {key}: {value}")
            
            


üìÑ Primer chunk:
   Keys disponibles: ['id', 'content', 'metadata']
   Analysis Point 4: What is your observation of the responding time of each type of scans, and what is the reason and implications? 3.2.4 IP ID (Idle) scanning 1. Find at least one accessible (open or c...

   Contenido:
   Analysis Point 4: What is your observation of the responding time of each type of scans, and what is the reason and implications? 3.2.4 IP ID (Idle) scanning 1. Find at least one accessible (open or c...

   Metadata:
     - category: NarrativeText
     - section_title: None
     - section_level: 0
     - page_num_real: None
     - page_num_logical: None


In [None]:
# NOTE(review): this entire cell is commented-out code. It looks like an
# earlier draft of the first-chunk inspection with an extra summary section
# and an "else" branch for the no-chunks case. Consider deleting it or
# restoring it. TODO confirm.
# if chunks:
#     print(f"\nüìÑ Primer chunk:")
#     first_chunk = chunks[0]
#     print(f"   Keys disponibles: {list(first_chunk.keys())}")
    
#     # Mostrar contenido (primeros 200 caracteres)
#     content = first_chunk.get('content', first_chunk.get('text', ''))
#     if content:
#         print(f"\n   Contenido (primeros 200 chars):")
#         print(f"   {content[:200]}...")
    
#     # Mostrar metadata si existe
#     if 'metadata' in first_chunk:
#         print(f"\n   Metadata:")
#         metadata = first_chunk['metadata']
#         for key, value in list(metadata.items())[:5]:  # Primeros 5 campos
#             print(f"     - {key}: {value}")
    
#     print(f"\nüìä Resumen:")
#     print(f"   - Total chunks: {len(chunks)}")
#     print(f"   - Estructura del primer chunk: {type(first_chunk)}")
#     print(f"   - Campos del chunk: {len(first_chunk)} campos")
# else:
#     print("‚ö†Ô∏è  No se cargaron chunks (archivo vac√≠o o no existe)")
