# Retriever Experiments with Article-Based Chunking & Metadata

This notebook demonstrates splitting legal documents by Articles and including metadata (Article Number) in the chunks.

In [None]:
import sys
import os
import docx
import re

sys.path.append(os.getcwd())

from retriever import Retriever

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
!pip install python-docx



## 1. Chunking with Metadata
We extract the Article number and create a structured chunk with `text` and `metadata` fields.

In [2]:
def load_docx_text(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found")
    doc = docx.Document(file_path)
    return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])

def chunk_by_article_with_metadata(text):
    text = re.sub(r'\s+', ' ', text)
    pattern = r'Article (\d+)\.'
    matches = list(re.finditer(pattern, text))
    
    chunks = []
    for i, match in enumerate(matches):
        article_num = match.group(1)
        start_idx = match.start()
        if i + 1 < len(matches):
            end_idx = matches[i+1].start()
        else:
            end_idx = len(text)
            
        chunk_text = text[start_idx:end_idx].strip()
        
        if chunk_text:
            chunks.append({
                "text": chunk_text,
                "metadata": {
                    "article_number": article_num,
                }
            })
            
    return chunks

## 2. Generate Chunks

In [None]:
import json
import os
import getpass
import dotenv
from dotenv import load_dotenv
load_dotenv()
API_KEY = os.getenv("API_KEY")

class LLM:
    def __init__(self, provider, model):
        self.provider = provider
        self.model_name = model
        if provider and not model.startswith(f"{provider}/"):
            self.full_model_name = f"{provider}/{model}"
        else:
            self.full_model_name = model

    def generate(self, prompt):
        try:
            response = completion(
                model=self.full_model_name,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"Error: {str(e)}"

llm = LLM(provider="groq", model="llama-3.3-70b-versatile")

def enrich_with_llm(chunk, llm):
    text = chunk.get('text', '')[:1000]
    prompt = f"""
    Analyze the following legal text.
    Return ONLY a JSON object (no markdown) with the following keys:
    - "topics": A list of 2-3 key topics.
    - "category": One of ["Rights", "Governance", "Judiciary", "Economy", "General"].

    Text: {text}
    """
    response = llm.generate(prompt)
    
    if response.startswith("Error:"):
        print(f"LLM Generation Failed: {response}")
        return {}

    try:
        # Clean response
        clean = response.replace('```json', '').replace('```', '').strip()
        return json.loads(clean)
    except Exception as e:
        print(f"JSON Parse Error: {e}. Response was: {response[:100]}...")
        return {}

dataset_path = "dataset-eng.docx"
full_text = load_docx_text(dataset_path)

print("--- Generating Metadata Chunks ---")
chunks_meta = chunk_by_article_with_metadata(full_text)
print(f"Generated {len(chunks_meta)} chunks.")

for i, chunk in enumerate(chunks_meta[89:95]):
    extra_meta = enrich_with_llm(chunk, llm)
    if extra_meta:
        chunk['metadata'].update(extra_meta)
        print(f"Chunk {i+1} Enriched: {extra_meta}")
    else:
        print(f"Chunk {i+1} Failed to enrich.")


--- Generating Metadata Chunks ---
Generated 158 chunks.
Chunk 1 Enriched: {'topics': ['Constitutional Powers', 'Legislative Term', 'Executive Authority'], 'category': 'Governance'}
Chunk 2 Enriched: {'topics': ['Legislative Process', 'Constitutional Law', 'Ukrainian Governance'], 'category': 'Governance'}
Chunk 3 Enriched: {'topics': ['Human Rights', 'Citizenship', 'Social Protection'], 'category': 'Rights'}
Chunk 4 Enriched: {'topics': ['Legislative Initiative', 'Ukrainian Government', 'Lawmaking Process'], 'category': 'Governance'}
Chunk 5 Enriched: {'topics': ['Legislative Process', 'Executive Power', 'Constitutional Law'], 'category': 'Governance'}
Chunk 6 Enriched: {'topics': ['Budgetary System', 'State Expenditures', 'Public Finance'], 'category': 'Economy'}


In [4]:
def save_chunks_to_json(chunks, output_path):
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)


In [None]:
output_path = "chunks_with_metadata.json"
save_chunks_to_json(chunks_meta, output_path)

print(f"Saved {len(chunks_meta)} chunks to {output_path}")


Saved 158 chunks to C:\Users\User\rag-nlp\chunks_with_metadata6.json


## 3. Retrieve and Inspect
We update the experiment function to correctly handle and display metadata.

In [4]:
def get_chunk_text(chunk):
    if isinstance(chunk, dict):
        return chunk.get("text", "")
    return str(chunk)

def experiment(retriever_instance, query, method_name, top_k=3):
    print(f"\nQUERY: {query}")
    print(f"METHOD: {method_name}")
    print("="*60)
    
    print(f"--- BM25 ---")
    results = retriever_instance.search_bm25(query, top_k)
    for i, r in enumerate(results):
        chunk = r['chunk']
        text = get_chunk_text(chunk)
        meta = chunk.get('metadata', {}) if isinstance(chunk, dict) else {}
        
        print(f"[{i+1}] Score: {r['score']:.4f}")
        if meta:
            print(f"Metadata: {meta}")
        print(f"Content: {text[:150].replace(chr(10), ' ')}...")
        print("-" * 20)

    print(f"\n--- Semantic ---")
    results = retriever_instance.search_semantic(query, top_k)
    for i, r in enumerate(results):
        chunk = r['chunk']
        text = get_chunk_text(chunk)
        meta = chunk.get('metadata', {}) if isinstance(chunk, dict) else {}
        
        print(f"[{i+1}] Dist: {r['score']:.4f}")
        if meta:
            print(f"Metadata: {meta}")
        print(f"Content: {text[:150].replace(chr(10), ' ')}...")
        print("-" * 20)

In [None]:
import retriever
import importlib
importlib.reload(retriever)
from retriever import Retriever

print("Initializing Retriever with Metadata Chunks (English Model)...")
retriever_meta = Retriever(chunks=chunks_meta)

query = "What is recognised as the highest social value in Ukraine?"
experiment(retriever_meta, query, "Article Metadata Chunking (English)")

Initializing Retriever with Metadata Chunks (English Model)...
Loaded 158 chunks from memory.
Initializing BM25...
Initializing Semantic Search (loading model)...
Generating embeddings...


Batches: 100%|██████████| 5/5 [00:04<00:00,  1.07it/s]

Retriever initialization, complete.

QUERY: What is recognised as the highest social value in Ukraine?
METHOD: Article Metadata Chunking (English)
--- BM25 ---
[1] Score: 21.8988
Metadata: {'article_number': '3', 'type': 'article'}
Content: Article 3. The human being, his or her life and health, honour and dignity, inviolability and security shall be recognised in Ukraine as the highest s...
--------------------
[2] Score: 13.8446
Metadata: {'article_number': '19', 'type': 'article'}
Content: Article 19. The legal order in Ukraine shall be based on the principles according to which no one may be forced to do what is not stipulated by law. G...
--------------------
[3] Score: 13.1067
Metadata: {'article_number': '15', 'type': 'article'}
Content: Article 15. Social life in Ukraine shall be based on the principles of political, economic, and ideological diversity. No ideology shall be recognised...
--------------------

--- Semantic ---
[1] Dist: 0.6760
Metadata: {'article_number': '3', '




In [None]:
import retriever
import importlib
importlib.reload(retriever)
from retriever import Retriever

print("Initializing Retriever with Metadata Chunks (English Model)...")
retriever_meta = Retriever(chunks=chunks_meta)

query = "What does Ukraine consider the most important social value?"
experiment(retriever_meta, query, "Article Metadata Chunking (English)")

Initializing Retriever with Metadata Chunks (English Model)...
Loaded 158 chunks from memory.
Initializing BM25...
Initializing Semantic Search (loading model)...
Generating embeddings...


Batches: 100%|██████████| 5/5 [00:04<00:00,  1.20it/s]

Retriever initialization, complete.

QUERY: What does Ukraine consider the most important social value?
METHOD: Article Metadata Chunking (English)
--- BM25 ---
[1] Score: 14.2993
Metadata: {'article_number': '17', 'type': 'article'}
Content: Article 17. Protecting the sovereignty and territorial integrity of Ukraine, ensuring its economic and information security shall be the most importan...
--------------------
[2] Score: 11.2597
Metadata: {'article_number': '3', 'type': 'article'}
Content: Article 3. The human being, his or her life and health, honour and dignity, inviolability and security shall be recognised in Ukraine as the highest s...
--------------------
[3] Score: 9.7513
Metadata: {'article_number': '19', 'type': 'article'}
Content: Article 19. The legal order in Ukraine shall be based on the principles according to which no one may be forced to do what is not stipulated by law. G...
--------------------

--- Semantic ---
[1] Dist: 0.7006
Metadata: {'article_number': '3', '




## 4. Inspect Data Structure

In [7]:
import json
print(json.dumps(chunks_meta[:2], indent=2, ensure_ascii=False))

[
  {
    "text": "Article 1. Ukraine shall be a sovereign and independent, democratic, social, law-based state. {For official interpretation of Article 1, see Constitutional Court Judgment No. 3-rp/2012 of 25 January 2012}",
    "metadata": {
      "article_number": "1",
      "type": "article"
    }
  },
  {
    "text": "Article 2. The sovereignty of Ukraine shall extend throughout its entire territory. Ukraine shall be a unitary state. The territory of Ukraine within its present border shall be indivisible and inviolable.",
    "metadata": {
      "article_number": "2",
      "type": "article"
    }
  }
]


## 5. Save Chunks
We save the processed chunks with metadata to a JSON file for future use.

In [8]:
output_file = "chunks.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(chunks_meta, f, ensure_ascii=False, indent=2)
    
print(f"Saved {len(chunks_meta)} chunks to {output_file}")

Saved 158 chunks to chunks.json


## 6. Integration with LLM (Generation)
We use **LiteLLM** with a custom wrapper to interact with various providers (Groq, OpenAI, Ollama).

In [None]:
try:
    import litellm
except ImportError:
    !pip install litellm
    import litellm

from litellm import completion
import os

class LLM:
    def __init__(self, provider, model):
        self.provider = provider
        self.model_name = model
        if provider and not model.startswith(f"{provider}/"):
            self.full_model_name = f"{provider}/{model}"
        else:
            self.full_model_name = model
            
    def generate(self, prompt):
        try:
            response = completion(
                model=self.full_model_name,
                messages=[{"role": "user", "content": prompt}],
                temperature=0
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"Generation Error with {self.full_model_name}: {str(e)}"

Collecting litellm
  Using cached litellm-1.80.9-py3-none-any.whl.metadata (30 kB)
Collecting aiohttp>=3.10 (from litellm)
  Downloading aiohttp-3.13.2-cp312-cp312-win_amd64.whl.metadata (8.4 kB)
Collecting click (from litellm)
  Downloading click-8.3.1-py3-none-any.whl.metadata (2.6 kB)
Collecting fastuuid>=0.13.0 (from litellm)
  Using cached fastuuid-0.14.0-cp312-cp312-win_amd64.whl.metadata (1.1 kB)
Collecting grpcio<1.68.0,>=1.62.3 (from litellm)
  Downloading grpcio-1.67.1-cp312-cp312-win_amd64.whl.metadata (4.0 kB)
Collecting importlib-metadata>=6.8.0 (from litellm)
  Using cached importlib_metadata-8.7.0-py3-none-any.whl.metadata (4.8 kB)
Collecting jsonschema<5.0.0,>=4.22.0 (from litellm)
  Downloading jsonschema-4.25.1-py3-none-any.whl.metadata (7.6 kB)
Collecting openai>=2.8.0 (from litellm)
  Using cached openai-2.11.0-py3-none-any.whl.metadata (29 kB)
Collecting python-dotenv>=0.2.0 (from litellm)
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Collecti

In [None]:

llm = LLM(provider="groq", model="llama-3.3-70b-versatile")

print("Configured Groq with model: llama-3.3-70b-versatile")

Configured Groq with model: llama-3.3-70b-versatile


In [None]:
def rag_pipeline(retriever, query, llm_instance, top_k=3):
    print(f"Query: {query}")
    
    results = retriever.search_semantic(query, top_k=top_k)
    
    context_parts = []
    for r in results:
        chunk = r['chunk']
        txt = get_chunk_text(chunk)
        if isinstance(chunk, dict) and 'metadata' in chunk:
             meta = chunk['metadata']
             txt = f"[Article {meta.get('article_number', '?')}] {txt}"
        context_parts.append(txt)
    
    full_context = "\n\n".join(context_parts)
    print(f"Retrieved {len(results)} chunks.")
    
    print(f"Generating answer with {llm_instance.full_model_name}...")
    
    prompt = f"""You are a helpful assistant. Answer the question based ONLY on the following context.
    
    Context:
    {full_context}
    
    Question: {query}
    
    Answer:"""
    
    answer = llm_instance.generate(prompt)
    
    print("\n--- Answer ---")
    print(answer)
    return answer

query = "What is the highest social value in Ukraine?"
rag_pipeline(retriever_meta, query, llm)

Query: What is the highest social value in Ukraine?
Retrieved 3 chunks.
Generating answer with groq/llama-3.3-70b-versatile...

--- Answer ---
The human being, his or her life and health, honour and dignity, inviolability and security.


'The human being, his or her life and health, honour and dignity, inviolability and security.'

## 7. Reranking
We implement a **Cross-Encoder** to re-score the top retrieved results. This improves precision by considering the query and document interaction more deeply than vector similarity.

In [None]:
from sentence_transformers import CrossEncoder

try:
    reranker_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
    print("CrossEncoder loaded successfully.")
except Exception as e:
    print(f"Error loading CrossEncoder: {e}")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


CrossEncoder loaded successfully.


In [None]:
def rerank_results(query, initial_results, top_k=3):
    """Reranks a list of retrieved results using the CrossEncoder."""
    if not initial_results:
        return []
    
    pairs = []
    for res in initial_results:
        chunk = res['chunk']=
        text = get_chunk_text(chunk) 
        pairs.append([query, text])
    
    scores = reranker_model.predict(pairs)
    
    reranked_results = []
    for i, res in enumerate(initial_results):
        new_res = res.copy()
        new_res['rerank_score'] = float(scores[i])
        reranked_results.append(new_res)
        
    reranked_results.sort(key=lambda x: x['rerank_score'], reverse=True)
    
    return reranked_results[:top_k]

def rag_pipeline_with_rerank(retriever, query, llm_instance, retrieve_top_k=10, final_top_k=3):
    print(f"QUERY: {query}")
    
    print(f"1. Retrieving top {retrieve_top_k} candidates...")
    initial = retriever.search_semantic(query, top_k=retrieve_top_k)
    
    print(f"2. Reranking...")
    reranked = rerank_results(query, initial, top_k=final_top_k)
    
    print("\n--- Top Chunks after Reranking ---")
    context_parts = []
    for i, r in enumerate(reranked):
        chunk = r['chunk']
        txt = get_chunk_text(chunk)
        score = r.get('rerank_score', 0)
        print(f"[{i+1}] Score: {score:.4f} | {txt[:100]}...")
        
        if isinstance(chunk, dict) and 'metadata' in chunk:
             meta = chunk['metadata']
             txt = f"[Article {meta.get('article_number', '?')}] {txt}"
        context_parts.append(txt)
    
    full_context = "\n\n".join(context_parts)
    
    print(f"\n3. Generating answer with {llm_instance.full_model_name}...")
    prompt = f"""You are a helpful assistant. Answer the question based ONLY on the following context.
    
    Context:
    {full_context}
    
    Question: {query}
    
    Answer:"""
    
    answer = llm_instance.generate(prompt)
    
    print("\n--- Answer ---")
    print(answer)
    return answer

In [None]:
query = "What is the highest social value in Ukraine?"
rag_pipeline_with_rerank(retriever_meta, query, llm, retrieve_top_k=10, final_top_k=3)

QUERY: What is the highest social value in Ukraine?
1. Retrieving top 10 candidates...
2. Reranking...

--- Top Chunks after Reranking ---
[1] Score: 7.4847 | Article 3. The human being, his or her life and health, honour and dignity, inviolability and securi...
[2] Score: -0.2736 | Article 15. Social life in Ukraine shall be based on the principles of political, economic, and ideo...
[3] Score: -2.3903 | Article 95. The budgetary system of Ukraine shall be based on the principles of fair and impartial d...

3. Generating answer with groq/llama-3.3-70b-versatile...

--- Answer ---
The human being, his or her life and health, honour and dignity, inviolability and security.


'The human being, his or her life and health, honour and dignity, inviolability and security.'