In [None]:
def get_gemini_client():
    """Configure the google.generativeai SDK with the key from the environment.

    Calls ``load_dotenv()`` first, then forwards the value of the
    ``GEMINI_API_KEY`` environment variable to ``genai.configure``.
    Despite its name, this function has no return value.
    """
    load_dotenv()
    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

In [None]:
def get_embedding(text, model="models/embedding-001"):
    """Return the embedding that ``genai.embed_content`` produces for ``text``.

    Args:
        text: Content to embed.
        model: Embedding model identifier forwarded to the API.

    Returns:
        The value stored under the ``'embedding'`` key of the API result.
    """
    request = {
        "model": model,
        "content": text,
        # Alternatives include "retrieval_query", "classification", etc.
        "task_type": "retrieval_document",
    }
    return genai.embed_content(**request)['embedding']

In [None]:
def get_json_context(path_schema):
    """Load a JSON schema file and return it as a prompt-ready string.

    Args:
        path_schema: Path to a UTF-8 encoded JSON file.

    Returns:
        The header ``**JSON SCHEMA TO FOLLOW:**``, a newline, and the file's
        content re-serialized with the ``json.dumps`` defaults (single line,
        non-ASCII characters escaped).
    """
    with open(path_schema, 'r', encoding='utf-8') as schema_file:
        schema = json.load(schema_file)
    # Re-serializing normalizes whitespace regardless of the file's layout.
    return "**JSON SCHEMA TO FOLLOW:**\n" + json.dumps(schema)

In [None]:
def chunk_text(texto):
    """Split ``texto`` into overlapping chunks.

    Builds a ``RecursiveCharacterTextSplitter`` with a chunk size of 400, an
    overlap of 50 and the separator list paragraph break, line break, period,
    space, empty string.

    Returns:
        Whatever ``split_text`` returns for ``texto``.
    """
    splitter_options = dict(
        chunk_size=400,
        chunk_overlap=50,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(texto)


In [None]:
def _parse_json_response(text):
    """Parse a model reply as JSON, tolerating extra text around the object.

    Returns the parsed value, or None when no valid JSON can be recovered.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    # The model may wrap the object in prose or a code fence: retry on the
    # span running from the first "{" to the last "}".
    match = re.search(r'\{.*\}', text, re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None


def process_chunks(max_chunks=2):
    """Fill the base JSON schema from the legal PDF by iterative LLM refinement.

    The first chunk is sent together with the full schema to create an initial
    JSON; every following chunk is sent with the current JSON (without the
    schema) so the model can refine it. The final JSON is written to
    ``./processedDocs/response.json``.

    Args:
        max_chunks: Maximum number of leading chunks to process. ``None``
            processes the whole document. Defaults to 2.

    Returns:
        The JSON value parsed from the last model reply that could be parsed.

    Raises:
        ValueError: If there are no chunks to process, or the reply for the
            first chunk contains no valid JSON.
    """
    # Read the schema.
    with open('../../JSONS/baseSchema.json', 'r', encoding='utf-8') as f:
        json_schema = json.load(f)

    # Read the initial prompt for the LLM.
    with open('jsonFillPrompt.txt', 'r', encoding='utf-8') as f:
        initial_instructions = f.read()

    # Read the refinement prompt.
    with open('jsonRefinePrompt.txt', 'r', encoding='utf-8') as f:
        refine_instructions = f.read()

    # Read the legal document, one line-terminated block of text per page.
    reader = PdfReader('../../Documentos/CODIGO PENAL DE LA NACION ARGENTINA.pdf')
    legal_doc = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    # Split into chunks.
    legal_doc_chunks = chunk_text(legal_doc)
    print(f"Total de chunks: {len(legal_doc_chunks)}")

    # Limit the run to the first max_chunks chunks.
    if max_chunks is not None:
        legal_doc_chunks = legal_doc_chunks[:max_chunks]
    if not legal_doc_chunks:
        raise ValueError("No chunks to process")
    print(f"Procesando solo los primeros {len(legal_doc_chunks)} chunks")

    # STEP 1: create the initial JSON from the first chunk + the full schema.
    first_chunk_prompt = (
        initial_instructions +
        "\n\n**JSON SCHEMA:**\n" +
        json.dumps(json_schema, indent=2) +
        "\n\n**DOCUMENT:**\n" +
        legal_doc_chunks[0]
    )

    print("Procesando chunk 1...")
    model = GenerativeModel("gemini-2.0-flash-exp")
    response = model.generate_content(first_chunk_prompt)
    refined_json_dict = _parse_json_response(response.text)
    if refined_json_dict is None:
        raise ValueError("First chunk did not return valid JSON")

    # STEP 2: refine with the remaining chunks (WITHOUT resending the schema).
    for i, chunk in enumerate(legal_doc_chunks[1:], start=2):
        print(f"Procesando chunk {i}/{len(legal_doc_chunks)}...")

        # Simplified prompt: instructions + current JSON + new chunk.
        refine_prompt = (
            refine_instructions +
            "\n\n**EXISTING JSON:**\n" +
            json.dumps(refined_json_dict, ensure_ascii=False) +
            "\n\n**NEW CHUNK:**\n" +
            chunk
        )

        response = model.generate_content(refine_prompt)
        updated_json = _parse_json_response(response.text)
        if updated_json is None:
            # Keep the previous JSON rather than losing the work done so far.
            print(f"Warning: Chunk {i} returned invalid JSON, skipping update")
        else:
            refined_json_dict = updated_json

    # Save the final result once, after every chunk has been processed.
    print("Guardando resultado...")
    os.makedirs('./processedDocs', exist_ok=True)
    with open('./processedDocs/response.json', 'w', encoding='utf-8') as f:
        json.dump(refined_json_dict, f, ensure_ascii=False, indent=2)

    return refined_json_dict

In [None]:
import json
from elasticsearch import Elasticsearch
# Connect to the local Elasticsearch node.
es = Elasticsearch("http://localhost:9200")

index_name = "lawai_legal_docs"

# Delete the index; 400/404 responses (e.g. the index does not exist) are
# ignored instead of raised.
es.indices.delete(index=index_name, ignore=[400, 404])

# Load the index mapping. The encoding is set explicitly so decoding does not
# depend on the platform's default encoding.
with open("../elasticSearch/lawai_mapping.json", encoding="utf-8") as f:
    mapping = json.load(f)["mapping"]

# es.indices.create(index="lawai", mappings=mapping)

In [77]:

import json
import os
import re

import google.generativeai as genai
import ollama
from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from google.generativeai import GenerativeModel
from langchain.text_splitter import RecursiveCharacterTextSplitter # División de texto
from pypdf import PdfReader


In [78]:
def get_embedding_ollama(text: str, model="nomic-embed-text"):
    """Return the Ollama embedding for ``text``.

    Args:
        text: Prompt passed to ``ollama.embeddings``.
        model: Name of the embedding model to use.

    Returns:
        The value stored under the ``"embedding"`` key of the response.
    """
    return ollama.embeddings(model=model, prompt=text)["embedding"]

In [79]:
# main: embed the processed document, index it in Elasticsearch and run a
# sample query against it.
if __name__ == "__main__":
    get_gemini_client()

    # Reuse the JSON saved by an earlier process_chunks() run instead of
    # calling the LLM again. The file is written as UTF-8 with non-ASCII
    # characters unescaped, so it must be read back as UTF-8.
    # refined_json_dict = process_chunks()
    with open('./processedDocs/response.json', encoding='utf-8') as f:
        refined_json_dict = json.load(f)

    # Elasticsearch: load the index definition.
    with open('../elasticSearch/lawai_mapping.json', encoding='utf-8') as f:
        lawai_mapping = json.load(f)

    es = Elasticsearch("http://localhost:9200")

    index_name = "lawai_legal_docs"

    if not es.indices.exists(index=index_name):
        es.indices.create(index=index_name, body=lawai_mapping["mapping"])
        print(f"Index '{index_name}' created successfully!")
    else:
        print("Index already exists.")

    # Generate the embedding for the full text.
    full_text = refined_json_dict["content"]["full_text"]
    # embedding = get_embedding(full_text)
    embedding = get_embedding_ollama(full_text)

    # Store the embedding inside the document itself.
    refined_json_dict.setdefault("analysis", {})
    refined_json_dict["analysis"]["embeddings"] = embedding

    # Index (insert) the document.
    es.index(
        index=index_name,
        id=refined_json_dict["document_id"],
        document=refined_json_dict
    )
    print("Document indexed successfully!")

    # Newly indexed documents only become searchable after a refresh; force
    # one so the query below can see the document that was just written.
    es.indices.refresh(index=index_name)

    # Query
    query = {
        "query": {
            "match": {
                "metadata.title": "Constitución"
            }
        }
    }

    results = es.search(index=index_name, body=query)
    for hit in results["hits"]["hits"]:
        print(hit["_source"]["metadata"]["title"])

Index already exists.
Document indexed successfully!
