### Step 0: Setup

In [None]:
pip install -r requirements.txt

In [1]:
import os
import uuid

from opensearchpy import OpenSearch
from dotenv import load_dotenv
from openai import OpenAI
import pdfplumber

# Load settings from a local .env file before anything reads the environment
# (OSDB_PASSWORD is read with os.getenv in the OpenSearch connection cell).
load_dotenv()
# OpenAI client reused by the embedding and chat-completion cells below.
# Constructed without arguments - presumably it picks up its API key from the
# environment loaded above; confirm against the .env contents.
client = OpenAI()  


### README
Steps 3–8 do not need to be run again: all of the data has already been vectorized and stored in the database.

### Step 1: Connect to OpenSearch

In [6]:
# Read the admin password from the environment.
OSDB_PASSWORD = os.getenv("OSDB_PASSWORD")
if not OSDB_PASSWORD:
    # Fail fast: with no password the client would be built with ("admin", None)
    # and the problem would only surface as a generic "Connection failed".
    raise RuntimeError(
        "OSDB_PASSWORD is not set. Add it to your .env file or environment "
        "before connecting to OpenSearch."
    )

# Client for the OpenSearch node on localhost:9200 (HTTPS, basic auth as "admin").
os_client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200, "scheme": "https"}],
    http_auth=("admin", OSDB_PASSWORD),
    verify_certs=False,  # ignore self-signed cert (local development only)
    ssl_show_warn=False,
)

# ping() returns a boolean, so report the outcome explicitly.
if os_client.ping():
    print("✅ Connected to OpenSearch!")
else:
    print("❌ Connection failed.")


✅ Connected to OpenSearch!


### Step 2: Connect to OpenAI

In [3]:
# Smoke test for the OpenAI connection: list the models and show the first three ids.
try:
    resp = client.models.list()
    preview_ids = [entry.id for entry in resp.data[:3]]
    print("OpenAI API works, models found:", preview_ids)
except Exception as err:
    # Any failure is reported instead of raised so the notebook keeps running.
    print("OpenAI connection error:", err)

OpenAI API works, models found: ['gpt-4-0613', 'gpt-4', 'gpt-3.5-turbo']


### Step 3: Create the OpenSearch index

In [None]:
INDEX_NAME = "rag_docs_v1"
EMBED_DIM = 1536  # dimension declared for the "embedding" knn_vector field

# Index-level settings: enable k-NN and set the ef_search parameter.
index_settings = {
    "index": {
        "knn": True,
        "knn.algo_param.ef_search": 512,
    }
}

# One document = chunk text + free-form metadata object + embedding vector.
field_properties = {
    "text": {"type": "text"},
    "metadata": {"type": "object"},
    "embedding": {"type": "knn_vector", "dimension": EMBED_DIM},
}

mapping = {
    "settings": index_settings,
    "mappings": {"properties": field_properties},
}

# Only create the index when it is not there yet, so the cell can be re-run.
if os_client.indices.exists(index=INDEX_NAME):
    print(f"Index '{INDEX_NAME}' already exists.")
else:
    os_client.indices.create(index=INDEX_NAME, body=mapping)
    print(f"Index '{INDEX_NAME}' created.")


### Step 4: Extract text from PDF

In [None]:
def extract_text_from_pdf(path, x_tolerance=3):
    """Extract the text of every page of a PDF and join it into one string.

    Args:
        path: Location of the PDF, passed straight to ``pdfplumber.open``.
        x_tolerance: Forwarded to ``page.extract_text``. The default of 3 is
            pdfplumber's documented default, so existing calls behave as
            before. The retrieved snippets shown later in this notebook have
            words run together ("doorforincreasedscrutiny"); lowering this
            value (e.g. to 1 or 2) is the usual remedy for that - verify on
            the actual document.

    Returns:
        The text of all pages that yielded any text, separated by blank
        lines. Pages for which nothing was extracted are skipped.
    """
    page_texts = []
    with pdfplumber.open(path) as pdf:
        for page_number, page in enumerate(pdf.pages, 1):
            page_text = page.extract_text(x_tolerance=x_tolerance)
            if not page_text:
                # Nothing extractable on this page (None or empty string).
                continue
            print(f"Page {page_number}: {len(page_text)} characters")
            page_texts.append(page_text)
    return "\n\n".join(page_texts)

# Extract the thesis once; the chunking cell below reads the module-level `text`.
text = extract_text_from_pdf("data/thesis.pdf")
print("Total extracted characters:", len(text))


### Step 5: Chunk the text

In [None]:
def chunk_text(text, chunk_size=3000, overlap=300):
    """Split ``text`` into overlapping, whitespace-stripped character chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (before stripping).
        overlap: Number of characters shared between neighbouring chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of non-empty chunk strings; an empty list for empty input.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        piece = text[start:end].strip()
        if piece:
            # Whitespace-only windows would become empty strings; skip them.
            chunks.append(piece)
        if end >= len(text):
            # This window already reached the end of the text. Another window
            # would only repeat the tail that is fully contained in this one.
            break
    return chunks

# Chunk the extracted text; the embedding and indexing cells read `chunks`.
chunks = chunk_text(text)
print(f"Created {len(chunks)} chunks")
if chunks:
    print("First chunk preview:\n", chunks[0][:500])
else:
    # Guard against an IndexError when the extraction produced no text.
    print("No chunks were produced - check that the PDF text was extracted.")


### Step 6: Generate Embeddings

In [None]:
# Embed only the first two chunks as a quick check of the embedding call.
sample_inputs = chunks[:2]
resp = client.embeddings.create(
    input=sample_inputs,
    model="text-embedding-3-small",
)

# Report the length of the first returned vector.
first_vector = resp.data[0].embedding
print("Embedding length:", len(first_vector))

### Step 7: Index chunks

In [None]:
SOURCE_NAME = "thesis.pdf"

for position, chunk in enumerate(chunks):
    # Generate the embedding for this chunk with text-embedding-3-small.
    resp = client.embeddings.create(
        model="text-embedding-3-small",
        input=chunk,
    )
    emb = resp.data[0].embedding

    # Deterministic id derived from source + chunk position: re-running this
    # cell overwrites the same documents instead of adding a second copy of
    # every chunk to the index (random uuid4 ids made each run a duplicate).
    chunk_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{SOURCE_NAME}#{position}"))

    # todo: find out more about metadata and how to use this in showing sources
    doc = {
        "text": chunk,
        "metadata": {"chunk_id": chunk_id, "source": SOURCE_NAME},
        "embedding": emb,
    }

    # Use the chunk id as the document _id so the write is idempotent.
    res = os_client.index(index=INDEX_NAME, id=chunk_id, body=doc)
    print("Indexed doc:", res["_id"])

### Step 8: Query with kNN

In [None]:
"""
Quick sanity check of kNN retrieval with a generic question.
This cell is temporary and can be removed later.
"""

query = "What is this PDF about?"

# The question has to be embedded before a vector search can run.
resp = client.embeddings.create(input=query, model="text-embedding-3-small")
qemb = resp.data[0].embedding

# Assemble the kNN request: three nearest neighbours on the "embedding" field.
knn_clause = {"embedding": {"vector": qemb, "k": 3}}
body = {"size": 3, "query": {"knn": knn_clause}}

res = os_client.search(index=INDEX_NAME, body=body)

# Show the score and the first 200 characters of every hit.
for h in res["hits"]["hits"]:
    print("Score:", h["_score"])
    snippet = h["_source"]["text"][:200]
    print("Snippet:", snippet, "\n")


### Step 9: Query Embedding

In [7]:
"""
Define the question to ask about the document and turn it into an embedding.
"""
query = "what are examples of stress-inducing and stress-reducing factors?"

# Request the embedding for the question text.
resp = client.embeddings.create(input=query, model="text-embedding-3-small")

# The vector sits on the first entry of the response data.
first_entry = resp.data[0]
q_embedding = first_entry.embedding

print("Query embedding length:", len(q_embedding))

Query embedding length: 1536


### Step 10: Retrieve the top-K relevant chunks from OpenSearch

In [8]:
"""
Retrieval: fetch the TOP_K chunks closest to the query embedding.
"""

TOP_K = 5  # how many chunks to pull back

# kNN request against the "embedding" field, limited to TOP_K results.
knn_clause = {"embedding": {"vector": q_embedding, "k": TOP_K}}
body = {"size": TOP_K, "query": {"knn": knn_clause}}

res = os_client.search(index="rag_docs_v1", body=body)

# Collect the full chunk texts and print a short preview of each hit.
retrieved_chunks = []
for hit in res["hits"]["hits"]:
    chunk_body = hit["_source"]["text"]
    retrieved_chunks.append(chunk_body)
    print("Score:", hit["_score"], "Snippet:", chunk_body[:150], "\n")

Score: 0.49664328 Snippet: doorforincreasedscrutiny,which
goriesstress-inducing,stress-reducing,mixedornotidentifiable.
canbeapositivething.However,italsomakesthemvulner-
ableto 

Score: 0.48576152 Snippet: imate-relatedtopics
4.2.3 Stressfactoranalysis. Eachtopic’scoherencewasassessed Thisstudyaimedtofindthetopicsofconversationwhendiscussing
and consider 

Score: 0.48376673 Snippet: ens,speciesmigration,desertification,and
guage.Theyarederivedfromtheperspectiveofbothclimatechange
arangeofassociatedhumanproblemsincludingdestruction 

Score: 0.47892576 Snippet: ngmoreen- portantforthecreationoftherapeuticcontexts.Therefore,this
vironmentallyconscious.However,adaptationisnotalwaysan researchwasfocusedonfinding 

Score: 0.478828 Snippet: standingofeachthemebut,being Mining9,1(122019),1–20. https://doi.org/10.1007/S13278-019-0568-8/TABLES/
time-consuming,itwasonlyusedon500comments. 4
[1 



### Step 11: Build context for the LLM

In [9]:
"""
Assemble the prompt that is sent to the model in the generation step.
"""

# Retrieved chunks are glued together with a visible separator line.
CHUNK_SEPARATOR = "\n\n---\n\n"
context = CHUNK_SEPARATOR.join(retrieved_chunks)

# Instructions, the retrieved context and the user's question, in that order.
prompt = f"""
Use the following context to answer the question. 
If the answer is not contained in the context, say 'I don't know'.

Context:
{context}

Question: {query}
Answer:
"""

### Step 12: Prompt the LLM

In [10]:
"""
Generation: send the assembled prompt to the chat model and print its reply.
"""

# Single user message carrying the whole prompt.
messages = [{"role": "user", "content": prompt}]

response = client.chat.completions.create(
    messages=messages,
    temperature=0.0,
    model="gpt-4o-mini",
)

# The reply text is on the first choice of the response.
first_choice = response.choices[0]
answer = first_choice.message.content
print("Answer:\n", answer)

Answer:
 Examples of stress-inducing factors include:

1. Comments that express or promote anxiety, worry, or fear.
2. Content that provokes conflict or hostility.
3. Content that focuses on problems or negative events such as climate-related hazards.
4. Statements criticizing capitalism and its impact on survival.
5. Comments about the negative consequences of burning oil and gas.

Examples of stress-reducing factors include:

1. Comments that offer support, empathy, or encouragement.
2. Content that encourages relaxation, calmness, or positive emotions.
3. Comments that provide advice or solutions to problems.
4. Suggestions for combining nuclear power with renewables for a reliable energy supply.
5. Discussions about the benefits of transitioning to renewable energy sources.
