In [9]:
# Import necessary libraries
import json
import os
import re
import sys
from typing import Dict, Any, List

from PyPDF2 import PdfReader
from opensearchpy import OpenSearch, helpers
from sentence_transformers import SentenceTransformer

In [10]:
# Set up Python path to access project modules
sys.path.insert(0, "..")

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Defining constants

# Opensearch connection settings
OPENSEARCH_HOST = "localhost"       # OpenSearch host
OPENSEARCH_PORT = 9200              # OpenSearch port
OPENSEARCH_INDEX = "documents"      # Index name for document storage

# Embeddings settings
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"     # Model for generating embedding"
EMBEDDING_DIMENSION = 384                                           # Embedding dimension for the model
ASSYMETRIC_EMBEDDING = False                                        # Whether to use asymmetric embeddings

# Chunking settings
TEXT_CHUNK_SIZE = 500           # Number of tokens per chunk
TEXT_CHUNK_OVERLAP = 100        # Overlap between chunks

print("Constants defined")

Constants defined


In [12]:
# Utility functions for text preprocessing

def clean_text(text: str) -> str:
    """
    Cleans OCR-extracted text by removing unnecessary newlines, hyphens, and correcting common OCR errors.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    # Remove hyphens at line breaks (e.g., 'exam-\nple' -> 'example')
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)

    # Replace newlines within sentences with spaces
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    # Replace multiple newlines with a single newline
    text = re.sub(r"\n+", "\n", text)

    # Remove excessive whitespace
    text = re.sub(r"[ \t]+", " ", text)

    return text.strip()


def chunk_text(text: str, chunk_size: int, overlap: int = 100) -> List[str]:
    """
    Splits text into chunks with a specified overlap.

    Args:
        text (str): The text to split.
        chunk_size (int): The number of tokens in each chunk.
        overlap (int): The number of tokens to overlap between chunks.

    Returns:
        List[str]: A list of text chunks.
    """

    # Clean the text before chunking
    text = clean_text(text)

    # Tokeinze the texts into words
    tokens = text.split(" ")

    chunks = []
    start = 0
    while start < len(tokens):
        end = start + chunk_size
        chunk_tokens = tokens[start:end]
        chunk_text = " ".join(chunk_tokens)
        chunks.append(chunk_text)
        start = end - overlap  # Move back by 'overlap' tokens

    return chunks

print("Utility functions defined")

Utility functions defined


In [13]:
# Embedding functions

def get_embedding_model():
    """
    Loads and returns the sentence transformer embedding model.
    
    Returns:
        SentenceTransformer: The loaded embedding model.
    """
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}")
    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    return model



def generate_embeddings(texts: List[str]):
    """
    Generates embeddings for a list of text chunks.
    
    Args:
        texts (List[str]): List of text chunks to embed.
        
    Returns:
        List[numpy.ndarray]: List of embedding vectors.
    """
    model = get_embedding_model()
    
    # If using asymmetric embeddings, prefix each text with "passage: "
    if ASSYMETRIC_EMBEDDING:
        texts = [f"passage: {text}" for text in texts]
        
    # Generate embeddings
    embeddings = model.encode(texts)
    return embeddings

print("Embedding functions defined")


Embedding functions defined


# 1. Connect to OpenSearch and Create Index

In [14]:
# Create an OpenSearch client
client = OpenSearch(
    hosts=[{"host": OPENSEARCH_HOST, "port": OPENSEARCH_PORT}],
    http_compress=True,
    timeout=30,
    max_retries=3,
    retry_on_timeout=True,
)

# Check connection
try:
    info = client.info()
    print(f"Successfully connected to OpenSearch {info['version']['number']}")
except Exception as e:
    print(f"Failed to connect to OpenSearch: {e}")
    print("Make sure OpenSearch is running on localhost:9200")
    raise

Successfully connected to OpenSearch 2.19.2


In [15]:
# Define the index configuration
def create_index_config() -> Dict[str, Any]:
    """
    Creates the index configuration with mappings for text, embeddings, and metadata.
    
    Returns:
        Dict[str, Any]: The index configuration.
    """
    config = {
        "settings": {
            "index": {
                "number_of_shards": 1,
                "number_of_replicas": 0,
                "knn": True
            }
        },
        "mappings": {
            "properties": {
                "text": {
                    "type": "text"  # For standard text search  (BM25)
                },
                "embedding": {
                    "type": "knn_vector",
                    "dimension": EMBEDDING_DIMENSION,  # Match your embedding model's dimension
                    "method": {
                        "engine": "faiss",  # high-performance library for similarity search
                        "space_type": "l2", # Euclidean distance
                        "name": "hnsw",     # Algorithm used for fast approximate nearest-neighbor search — HNSW (Hierarchical Navigable Small World) graph.
                        "parameters": {}
                    }
                },
                "document_name": {
                    "type": "keyword"  # For exact match on document names
                }
            }
        }
    }
    return config

# Get the index configuration
index_config = create_index_config()
print("\nIndex Configuration:")
print(json.dumps(index_config, indent=2))


Index Configuration:
{
  "settings": {
    "index": {
      "number_of_shards": 1,
      "number_of_replicas": 0,
      "knn": true
    }
  },
  "mappings": {
    "properties": {
      "text": {
        "type": "text"
      },
      "embedding": {
        "type": "knn_vector",
        "dimension": 384,
        "method": {
          "engine": "faiss",
          "space_type": "l2",
          "name": "hnsw",
          "parameters": {}
        }
      },
      "document_name": {
        "type": "keyword"
      }
    }
  }
}


In [16]:
# Create the index if it doesn't exist
if not client.indices.exists(index=OPENSEARCH_INDEX):
    response = client.indices.create(index=OPENSEARCH_INDEX, body=index_config)
    print(f"\nCreated index {OPENSEARCH_INDEX} with response: {response}")
else:
    print(f"\nIndex {OPENSEARCH_INDEX} already exists")


Index documents already exists


In [17]:
from opensearchpy.exceptions import NotFoundError
pipeline_name = "nlp-search-pipeline"

try:
    result = client.transport.perform_request(
        "GET",
        f"/_search/pipeline/{pipeline_name}"
    )
    print(f"\n✅ Search pipeline '{pipeline_name}' exists.")
    print(result)
except NotFoundError:
    print(f"\n⚠️ Search pipeline '{pipeline_name}' does NOT exist.")
except Exception as e:
    print(f"\n🚨 Error: {e}")


✅ Search pipeline 'nlp-search-pipeline' exists.
{'nlp-search-pipeline': {'description': 'Post processor for hybrid search', 'phase_results_processors': [{'normalization-processor': {'normalization': {'technique': 'min_max'}, 'combination': {'technique': 'arithmetic_mean', 'parameters': {'weights': [0.3, 0.7]}}}}]}}


# 2. Process PDF Document

In [18]:
import os

cwd = os.getcwd()
print(cwd)

d:\local_rag\notebooks


In [19]:
# Read and process the PDF
pdf_path = "climate.pdf"  # Path relative to notebook directory

# Read the PDF file

reader = PdfReader(pdf_path)
text = "".join([page.extract_text() for page in reader.pages])
print(f"Extracted {len(text)} characters from {pdf_path}")

# Show a sample of the extracted text
print("\nSample of extracted text:")
print(text[:500] + "...")

# Clean the text
cleaned_text = clean_text(text)
print(f"\nText cleaned. Length: {len(cleaned_text)} characters")

# Chunk the text
chunks = chunk_text(cleaned_text, chunk_size=TEXT_CHUNK_SIZE, overlap=TEXT_CHUNK_OVERLAP)
print(f"Split text into {len(chunks)} chunks")

# Display a sample chunk
print("\nSample chunk:")
print(chunks[0])

Extracted 94936 characters from climate.pdf

Sample of extracted text:
3
1This Summary for Policymakers should be cited as:
IPCC, 2013: Summary for Policymakers. In: Climate Change 2013: The Physical Science Basis.  Contribution of 
Working Group I to the Fifth Assessment Report of the Intergovernmental Panel on Climate Change  [Stocker, 
T.F ., D. Qin, G.-K. Plattner, M. Tignor, S.K. Allen, J. Boschung, A. Nauels, Y . Xia, V. Bex and P .M. Midgley (eds.)]. 
Cambridge University Press, Cambridge, United Kingdom and New York, NY , USA.Summary  
for Policymakers SPM
...

Text cleaned. Length: 94017 characters
Split text into 38 chunks

Sample chunk:
3 1This Summary for Policymakers should be cited as: IPCC, 2013: Summary for Policymakers. In: Climate Change 2013: The Physical Science Basis. Contribution of Working Group I to the Fifth Assessment Report of the Intergovernmental Panel on Climate Change [Stocker, T.F ., D. Qin, G.-K. Plattner, M. Tignor, S.K. Allen, J. Boschung, A. Nauels, 

In [None]:
# pip install huggingface_hub[hf_xet]

In [23]:
pdf_file_name = pdf_path.replace('.pdf', '')

# Generate embeddings for the chunks
print("Generating embeddings for chunks. This might take a moment...")
embeddings = generate_embeddings(chunks)
print(f"Generated {len(embeddings)} embeddings")
print(f"Embedding shape: {embeddings[0].shape}")

# Display a sample embedding (just a few values to avoid clutter)
print("\nSample embedding:")
print(embeddings[0])

# Prepare documents for indexing
documents_to_index = [
    {
        "doc_id": f"{pdf_file_name}_{i}",
        "text": chunk,
        "embedding": embedding,
        "document_name": pdf_file_name,
    }
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
]

print(f"\nPrepared {len(documents_to_index)} documents for indexing")

Generating embeddings for chunks. This might take a moment...
Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Generated 38 embeddings
Embedding shape: (384,)

Sample embedding:
[-4.45851609e-02  1.83010846e-02  1.03055835e-01  1.06724694e-01
  6.92621246e-02  3.94779146e-02 -7.02718496e-02  1.85091645e-02
 -2.64174980e-03  8.67922455e-02 -2.22835559e-02 -5.14434166e-02
  3.07141431e-02  3.79239768e-02  2.55168844e-02  6.16954975e-02
 -8.45513493e-02 -9.24729630e-02 -1.43485926e-02  5.17308451e-02
  1.11267797e-03  2.32094545e-02 -2.65128221e-02  6.72897473e-02
  5.76642295e-03  2.66166579e-04 -9.05090123e-02  5.80190718e-02
 -6.24796264e-02  1.27771601e-01 -5.01407199e-02  7.86757320e-02
 -4.40925360e-02 -4.25539464e-02  4.44622301e-02 -1.00495229e-02
  1.28307687e-02 -2.88113579e-02 -7.05305040e-02  2.85778679e-02
  2.66953241e-02  5.05796680e-03 -2.76468080e-02 -4.56768908e-02
  1.51982978e-02  5.19504920e-02  5.31637520e-02 -3.04289814e-02
 -1.58158094e-01  3.95431854e-02  2.10130159e-02  5.45899011e-02
  2.01785006e-02 -4.29869145e-02 -3.48471701e-02 -8.01571310e-02
  3.24416645e-02  2.952

# 3. Ingest Documents into OpenSearch

In [25]:
documents_to_index

[{'doc_id': 'climate_0',
  'text': '3 1This Summary for Policymakers should be cited as: IPCC, 2013: Summary for Policymakers. In: Climate Change 2013: The Physical Science Basis. Contribution of Working Group I to the Fifth Assessment Report of the Intergovernmental Panel on Climate Change [Stocker, T.F ., D. Qin, G.-K. Plattner, M. Tignor, S.K. Allen, J. Boschung, A. Nauels, Y . Xia, V. Bex and P .M. Midgley (eds.)]. Cambridge University Press, Cambridge, United Kingdom and New York, NY , USA.Summary for Policymakers SPM Drafting Authors: Lisa V. Alexander (Australia), Simon K. Allen (Switzerland/New Zealand), Nathaniel L. Bindoff (Australia), François-Marie Bréon (France), John A. Church (Australia), Ulrich Cubasch (Germany), Seita Emori (Japan), Piers Forster (UK), Pierre Friedlingstein (UK/Belgium), Nathan Gillett (Canada), Jonathan M. Gregory (UK), Dennis L. Hartmann (USA), Eystein Jansen (Norway), Ben Kirtman (USA), Reto Knutti (Switzerland), Krishna Kumar Kanikicharla (India), 

In [26]:
# Prepare bulk actions for OpenSearch
actions = []
for doc in documents_to_index:
    # Handle asymmetric embedding if enabled
    if ASSYMETRIC_EMBEDDING:
        prefixed_text = f"passage: {doc['text']}"
    else:
        prefixed_text = doc['text']
    
    # Create an action for this document
    action = {
        "_index": OPENSEARCH_INDEX,
        "_id": doc["doc_id"],
        "_source": {
            "text": prefixed_text,
            "embedding": doc["embedding"].tolist(),  # Convert numpy array to list
            "document_name": doc["document_name"],
        },
    }
    actions.append(action)

In [27]:
# Perform bulk indexing
print(f"Indexing {len(actions)} documents into OpenSearch...")
try:
    success, errors = helpers.bulk(client, actions, raise_on_error=True)
    if errors:
        print(f"Indexed {success} documents with {len(errors)} errors")
        print(f"First error: {errors[0]}")
    else:
        print(f"Successfully indexed {success} documents")
except Exception as e:
    print(f"Error during bulk indexing: {e}")

Indexing 38 documents into OpenSearch...
Successfully indexed 38 documents


In [28]:
# Verify the documents are indexed
response = client.count(index=OPENSEARCH_INDEX)
print(f"Total documents in index: {response['count']}")

# Get one document to verify content
if response['count'] > 0:
    sample = client.search(
        index=OPENSEARCH_INDEX, 
        body={
            "size": 1,
            "_source": {"excludes": ["embedding"]},  # Exclude embeddings as they're large
            "query": {"match_all": {}}
        }
    )
    print("\nSample document from index:")
    print(json.dumps(sample['hits']['hits'][0]['_source'], indent=2))

Total documents in index: 38

Sample document from index:
{
  "document_name": "climate",
  "text": "3 1This Summary for Policymakers should be cited as: IPCC, 2013: Summary for Policymakers. In: Climate Change 2013: The Physical Science Basis. Contribution of Working Group I to the Fifth Assessment Report of the Intergovernmental Panel on Climate Change [Stocker, T.F ., D. Qin, G.-K. Plattner, M. Tignor, S.K. Allen, J. Boschung, A. Nauels, Y . Xia, V. Bex and P .M. Midgley (eds.)]. Cambridge University Press, Cambridge, United Kingdom and New York, NY , USA.Summary for Policymakers SPM Drafting Authors: Lisa V. Alexander (Australia), Simon K. Allen (Switzerland/New Zealand), Nathaniel L. Bindoff (Australia), Fran\u00e7ois-Marie Br\u00e9on (France), John A. Church (Australia), Ulrich Cubasch (Germany), Seita Emori (Japan), Piers Forster (UK), Pierre Friedlingstein (UK/Belgium), Nathan Gillett (Canada), Jonathan M. Gregory (UK), Dennis L. Hartmann (USA), Eystein Jansen (Norway), Ben Kir

In [29]:
# Search with keyword matching

query = {
    "size": 2,
    "_source": {"excludes": ["embedding"]},
    "query": {
        "match": {
            "text": "climate"
        }
    }
}
results = client.search(index=OPENSEARCH_INDEX, body=query)
for hit in results['hits']['hits']:
    print(json.dumps(hit['_source'], indent=2))

{
  "document_name": "climate",
  "text": "high confidence ), extremely unlikely less than 1\u00b0C ( high confidence ), and very unlikely greater than 6\u00b0C ( medium confidence )16. The lower temperature limit of the assessed likely range is thus less than the 2\u00b0C in the AR4, but the upper limit is the same. This assessment reflects improved understanding, the extended temperature record in the atmosphere and ocean, and new estimates of radiative forcing. {TS TFE.6, Figure 1; Box 12.2} \u2022 The rate and magnitude of global climate change is determined by radiative forcing, climate feedbacks and the storage of energy by the climate system. Estimates of these quantities for recent decades are consistent with the assessed likely range of the equilibrium climate sensitivity to within assessed uncertainties, providing strong evidence for our understanding of anthropogenic climate change. {Box 12.2, Box 13.1} \u2022 The transient climate response quantifies the response of the cli

In [30]:
query_text = "How is ocean warnings?"

# Generate embedding
query_embedding = generate_embeddings([query_text])[0].tolist()

# Set top_k
top_k = 3

query_body = {
    "_source": {"exclude": ["embedding"]},
    "query": {
        "hybrid": {
            "queries": [
                {"match": {"text": {"query": query_text}}},
                {
                    "knn": {
                        "embedding": {
                            "vector": query_embedding,
                            "k": top_k
                        }
                    }
                }
            ]
        }
    },
    "size": top_k
}

#This allows OpenSearch to rank documents by both semantic similarity and keyword relevance, giving better RAG results.

response = client.search(
        index=OPENSEARCH_INDEX, body=query_body, search_pipeline="nlp-search-pipeline"
    )

# Print the results
print(f"\nTop {top_k} results for query: '{query_text}'\n")
for i, hit in enumerate(response['hits']['hits'], 1):
    print(f"Result {i}:")
    print(json.dumps(hit['_source'], indent=2))
    print("-" * 60)

Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


Result 1:
{
  "document_name": "climate",
  "text": "sea level. There is low confidence in region-specific projections of storminess and associated storm surges. m SREX assessed it to be very likely that mean sea level rise will contribute to future upward trends in extreme coastal high water levels.SPMSummary for Policymakers8B.2 Ocean Ocean warming dominates the increase in energy stored in the climate system, accounting for more than 90% of the energy accumulated between 1971 and 2010 ( high confidence ). It is virtually certain that the upper ocean (0\u2212700 m) warmed from 1971 to 2010 (see Figure SPM.3), and it likely warmed between the 1870s and 1971. {3.2, Box 3.1} \u2022 On a global scale, the ocean warming is largest near the surface, and the upper 75 m warmed by 0.11 [0.09 to 0.13] \u00b0C per decade over the period 1971 to 2010. Since AR4, instrumental biases in upper-ocean temperature records have been iden