In [1]:
import os
from dotenv import load_dotenv
from opensearchpy import OpenSearch, RequestsHttpConnection
from utils import create_opensearch_client

In [2]:
# Load environment variables.
# load_dotenv() reads a local .env file (if one exists) into the process environment;
# without this call the imported helper was never used and the credentials below were
# only found if they had been exported in the shell beforehand.
load_dotenv()
opensearch_user = os.getenv('OPENSEARCH_USER')
opensearch_password = os.getenv('OPENSEARCH_PASSWORD')

# Connect to OpenSearch
client = create_opensearch_client(username=opensearch_user, password=opensearch_password)

# Dimension of the stored embedding vectors; must match the vectors in the source index
embedding_dim = 1536

# Define the new index mapping with knn vector search settings
new_index_mapping = {
    "settings": {
        "index": {
            "knn": True
        }
    },
    "mappings": {
        "properties": {
            "url": {"type": "keyword"},
            "chunk_id": {"type": "integer"},
            "text": {"type": "text", "fielddata": False},
            "embedding": {"type": "knn_vector", "dimension": embedding_dim}
        }
    }
}

# Create the new index
new_index_name = "eur-lex-diversified-knowledge-base-3"
old_index_name = "eur-lex-diversified-knowledge-base-2"

# Only skip creation when the index is already there. The previous `ignore=400` also hid
# genuine 400 errors (e.g. a rejected mapping); the reindex below would then have created
# the destination index implicitly, without the knn_vector mapping defined above.
if not client.indices.exists(index=new_index_name):
    client.indices.create(index=new_index_name, body=new_index_mapping)

# Define the reindex request
reindex_body = {
    "source": {
        "index": old_index_name
    },
    "dest": {
        "index": new_index_name
    }
}

# Reindex the documents from the old index to the new one
response = client.reindex(body=reindex_body)
print("Reindex response:", response)

Reindex response: {'took': 16740, 'timed_out': False, 'total': 14914, 'updated': 0, 'created': 14914, 'deleted': 0, 'batches': 15, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 'throttled_until_millis': 0, 'failures': []}


In [3]:
# Sanity check after the reindex: ask OpenSearch how many documents the new index holds.
# The printed number can be compared with the 'total' field of the reindex response.
response = client.count(index=new_index_name)

# Extract and print the document count
doc_count = response['count']
print(f"Number of documents in the new index '{new_index_name}': {doc_count}")

Number of documents in the new index 'eur-lex-diversified-knowledge-base-3': 14914


Check that similarity search works.

In [2]:
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_openai import OpenAIEmbeddings, OpenAI
from utils import create_opensearch_client

In [4]:
# Credentials are read from the environment; each value is None when its variable is unset.
opensearch_user = os.getenv('OPENSEARCH_USER')
opensearch_password = os.getenv('OPENSEARCH_PASSWORD')
openai_api_key = os.getenv('OPENAI_API_KEY')

# Low-level OpenSearch client, used further down to read the question/answer index.
opensearch_client = create_opensearch_client(
    username=opensearch_user,
    password=opensearch_password,
)

In [5]:
def load_questions_and_answers_from_opensearch(qa_index_name, opensearch_client, size=1):
    """Fetch question/answer pairs from an OpenSearch index.

    Runs a ``match_all`` search limited to ``size`` hits and reads the
    ``question`` and ``answer`` fields from each hit's ``_source``.

    Args:
        qa_index_name: Name of the index to search.
        opensearch_client: Client object exposing ``search(index=..., body=...)``.
        size: Maximum number of hits requested. Defaults to 1.

    Returns:
        A list of ``(question, answer)`` tuples, in the order the hits were returned.

    Raises:
        KeyError: If a hit's ``_source`` lacks ``question`` or ``answer``.
    """
    search_body = {"query": {"match_all": {}}, "size": size}
    hits = opensearch_client.search(index=qa_index_name, body=search_body)['hits']['hits']
    return [(hit['_source']['question'], hit['_source']['answer']) for hit in hits]

In [None]:
# Embedding model used to embed the incoming question
embeddings = OpenAIEmbeddings(openai_api_key= openai_api_key, model="text-embedding-3-small")

opensearch_url = "https://opensearch-ds-2.ifi.uni-heidelberg.de:443"
qa_index_name = "eur-lex-diversified-qa-askep"
k = 1

# LangChain vector store on top of the freshly reindexed knowledge base;
# the vectors live in the "embedding" field of that index.
vector_store = OpenSearchVectorSearch(
    index_name=new_index_name,
    embedding_function=embeddings,
    vector_field="embedding",
    opensearch_url=opensearch_url,
    http_auth=(opensearch_user, opensearch_password),
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

# Take one question from the QA index and print the text of its k nearest chunks
questions_and_answers = load_questions_and_answers_from_opensearch(qa_index_name, opensearch_client, size=1)
for question, ground_truth_answer in questions_and_answers:
    docs = vector_store.similarity_search(query=question, k=k, vector_field="embedding")
    for doc in docs:
        print(doc.page_content)

In [None]:
import numpy as np
from langchain_core.documents import Document
from langchain_core.vectorstores import VectorStore
from langchain_community.vectorstores.utils import maximal_marginal_relevance
from typing import Any

In [30]:
def max_marginal_relevance_search(
    vector_store: VectorStore,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    **kwargs: Any,
) -> list[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    Args:
        vector_store: Store to search. It must provide
            ``embedding_function.embed_query`` and
            ``_raw_similarity_search_with_score_by_vector``, both of which are
            called here.
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            Defaults to 20.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        **kwargs: Forwarded unchanged to the raw similarity search. Two keys are
            also read here: ``vector_field`` (default ``"vector_field"``), the
            ``_source`` key holding each hit's embedding, and ``text_field``
            (default ``"text"``), the ``_source`` key holding each hit's text.

    Returns:
        List of Documents selected by maximal marginal relevance. Each Document
        carries the hit's text as ``page_content`` and the hit's ``_id`` as
        ``id``; no metadata is attached.
    """
    vector_field = kwargs.get("vector_field", "vector_field")
    text_field = kwargs.get("text_field", "text")

    # Get embedding of the user query
    query_embedding = vector_store.embedding_function.embed_query(query)

    # Do ANN/KNN search to get top fetch_k results where fetch_k >= k
    hits = vector_store._raw_similarity_search_with_score_by_vector(
        query_embedding, fetch_k, **kwargs
    )

    candidate_embeddings = [hit["_source"][vector_field] for hit in hits]

    # Rerank top k results using MMR (the result is a list of indices into `hits`)
    selected_indices = maximal_marginal_relevance(
        np.array(query_embedding), candidate_embeddings, k=k, lambda_mult=lambda_mult
    )

    return [
        Document(
            page_content=hits[i]["_source"][text_field],
            id=hits[i]["_id"],
        )
        for i in selected_indices
    ]

In [None]:
# Plain nearest-neighbour retriever over the "embedding" field
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": k, "vector_field": "embedding"},
)

# Retriever using LangChain's built-in MMR search type
mmr_retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={
        'k': 6,
        'lambda_mult': 0.5,
        "vector_field": "embedding",
        "metadata_field": None,
    },
)

# Print the text of every document the MMR retriever returns for each question
for question, ground_truth_answer in questions_and_answers:
    retrieved_docs = mmr_retriever.invoke(question)
    for doc in retrieved_docs:
        print(doc.page_content)

In [None]:
# Run the notebook's own MMR implementation and print the selected chunks, numbered from 1
k = 4
for question, ground_truth_answer in questions_and_answers:
    docs = max_marginal_relevance_search(
        vector_store,
        query=question,
        k=k,
        vector_field="embedding",
    )
    for doc_nr, doc in enumerate(docs, start=1):
        print(f"Doc nr: {doc_nr}\n\n")
        print(doc.page_content)

Generation part

In [6]:
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA

In [17]:
# Embedding model used to embed the questions at retrieval time
embeddings = OpenAIEmbeddings(openai_api_key= openai_api_key, model="text-embedding-3-small")

opensearch_url = "https://opensearch-ds-2.ifi.uni-heidelberg.de:443"
qa_index_name = "eur-lex-diversified-qa-askep"
kb_index_name = "eur-lex-diversified-knowledge-base-3"
k = 2

# Vector store over the knowledge-base index; vectors are stored in the "embedding" field
vector_store = OpenSearchVectorSearch(
    index_name=kb_index_name,
    embedding_function=embeddings,
    vector_field="embedding",
    opensearch_url=opensearch_url,
    http_auth=(opensearch_user, opensearch_password),
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

# Similarity retriever returning the k nearest chunks for a query
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": k, "vector_field": "embedding"},
)

In [22]:
# Prompt for the answer-generation step. It has two placeholders: {question} receives the
# user question and {context} receives the retrieved document text.
prompt_template = PromptTemplate(
    template="""
    Question: {question}
    The following documents provide relevant information: {context}
    Please answer the question only by using the provided information. Make sure to provide a diversified response that covers different perspectives and details from the provided documents. Your answer should include multiple viewpoints and insights from the context, not just a single perspective. If necessary, highlight different interpretations, opinions, or additional context that is relevant to the question.
    Answer the question comprehensively, using the information from the documents provided.
    """,
    input_variables=["context", "question"],
)

In [None]:
# Language model that writes the final answer
llm = OpenAI(openai_api_key = openai_api_key)

# Retrieval + generation chain built from the retriever and the custom prompt.
# return_source_documents=True makes each response also carry the retrieved documents
# under response["source_documents"], next to the generated text in response["result"].
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template},
    verbose=True
)

# For each sampled question, print the question, its ground-truth answer and the generated answer
questions_and_answers = load_questions_and_answers_from_opensearch(qa_index_name, opensearch_client, size=1)
for question, ground_truth_answer in questions_and_answers:
    print(f"Question: {question}")
    print("-"*50)
    print(f"Ground truth answer: {ground_truth_answer}")
    print("-"*50)
    # Use .invoke(), as the retriever cell of this notebook already does; calling the
    # chain object directly (qa_chain({...})) is deprecated in LangChain.
    response = qa_chain.invoke({"query": question})
    print(response["result"])
    print("-"*50)