# Alpha - NDCG

In [None]:
from dotenv import load_dotenv
import os
from rag_pipeline import create_simple_retriever, load_questions_and_answers_from_opensearch
from utils import create_opensearch_client
import numpy as np
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
def compute_ng(I, alpha, r):
    """
    Compute the novelty-biased gain at rank r, ng(r), as defined in
    https://link.springer.com/referenceworkentry/10.1007/978-1-4899-7993-3_80619-1:

        ng(r) = sum_i I_i(r) * (1 - alpha) ** C_i(r - 1)

    where C_i(r - 1) is the number of documents relevant to intent i that
    appear at ranks 1 .. r-1, i.e. strictly *before* rank r.

    Parameters:
    - I: A 2D list or array of per-intent relevance rows. Ranks are 1-indexed
      while the rows are 0-indexed, so I[i][r - 1] = I_i(r) is the relevance of
      the document at rank r for intent i.
    - alpha: The novelty penalty; every earlier relevant document for the same
      intent multiplies the gain by (1 - alpha).
    - r: The 1-indexed rank at which ng(r) is to be computed.

    Returns:
    - ng_r: The computed ng(r) value. 0 is returned when r <= 0 or when I has
      no intents.

    Raises:
    - IndexError: if r exceeds the number of ranked documents in a row of I.
    """
    if r <= 0:
        # There is no document at rank 0 (or below), hence no gain.
        return 0

    ng_r = 0
    for intent_relevance in I:  # Iterate over all intents
        # C_i(r-1): relevant documents for this intent at ranks 1 .. r-1.
        # The slice must stop at index r-1 so that the document at rank r
        # itself is NOT counted as redundant with itself.
        seen_before = np.sum(intent_relevance[:r - 1])
        ng_r += intent_relevance[r - 1] * (1 - alpha) ** seen_before

    return ng_r


In [8]:
# load_dotenv() must run before the lookups below so that any variables it
# loads are visible in the process environment.
load_dotenv()

# OpenSearch credentials; each is None when the variable is not set.
opensearch_user = os.environ.get('OPENSEARCH_USER')
opensearch_password = os.environ.get('OPENSEARCH_PASSWORD')

# Shared OpenAI client used by the embedding helper in this notebook.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [None]:
def embed_text_with_openai(texts, model="text-embedding-3-small", batch_size=32):
    """
    Embed a collection of texts with the OpenAI embeddings endpoint.

    The texts are sent in batches (one request per batch) instead of one
    request per text, which cuts the number of network round trips.

    Parameters:
    - texts: An iterable of strings to embed.
    - model: The name of the OpenAI embedding model to use.
    - batch_size: The maximum number of texts sent in a single request. The
      default is deliberately small so that a batch of long inputs stays
      within the API's per-request limits (NOTE(review): confirm against the
      current OpenAI limits if you raise it).

    Returns:
    - embeddings: A list with one embedding vector per input text, in the same
      order as `texts`. An empty input yields an empty list without calling
      the API.

    Raises:
    - ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(texts)  # allow any iterable and make slicing possible
    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = client.embeddings.create(input=batch, model=model)
        # Each returned item carries the index of the input it belongs to;
        # sort on it so the output order never depends on response ordering.
        ordered_items = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered_items)
    return embeddings

In [None]:
def compute_similarity_matrix(intents_embeddings, docs_embeddings, threshold):
    """
    Build the binary intent-by-document relevance matrix.

    Entry [i, j] is 1 when the cosine similarity between the embedding of
    intent i and the embedding of document j is at least `threshold`, and 0
    otherwise.

    Parameters:
    - intents_embeddings: A sequence of embedding vectors, one per intent.
    - docs_embeddings: A sequence of embedding vectors, one per document;
      the matrix columns follow the order of this sequence.
    - threshold: The minimum cosine similarity (inclusive) for a document to
      count as relevant to an intent.

    Returns:
    - I: A float array of shape (num_intents, num_docs) containing 0s and 1s.
    """
    num_intents = len(intents_embeddings)
    num_docs = len(docs_embeddings)

    # With no intents or no documents there is nothing to compare; return the
    # (possibly zero-sized) all-zero matrix directly instead of handing an
    # empty array to cosine_similarity.
    if num_intents == 0 or num_docs == 0:
        return np.zeros((num_intents, num_docs))

    # One pairwise call computes every intent/document similarity at once,
    # replacing num_intents * num_docs separate single-pair calls.
    similarities = cosine_similarity(intents_embeddings, docs_embeddings)

    # Threshold to a 0/1 matrix, kept as float like the np.zeros-based original.
    return (similarities >= threshold).astype(float)

In [None]:
def compute_alpha_ndcg(ng, k):
    """
    Accumulate the log-discounted novelty-biased gains over the top k ranks.

    Computes sum_{r=1..k} ng(r) / log2(r + 1), where ng[r - 1] holds the gain
    of (1-indexed) rank r. Note that the sum is returned as is: no division by
    the score of an ideal ranking is performed here.

    Parameters:
    - ng: An indexable sequence of gains with at least k entries.
    - k: The cutoff rank.

    Returns:
    - The discounted cumulative gain over ranks 1..k (0 when k < 1).
    """
    # Rank 1 is divided by log2(2) = 1, rank 2 by log2(3), and so on.
    return sum(ng[rank - 1] / np.log2(rank + 1) for rank in range(1, k + 1))

In [None]:
# --- Evaluation setup -------------------------------------------------------
opensearch_client = create_opensearch_client(username=opensearch_user, password=opensearch_password)
opensearch_url = "https://opensearch-ds-2.ifi.uni-heidelberg.de:443"

kb_index_name = "eur-lex-diversified-knowledge-base-3"
qa_index_name = "eur-lex-diversified-qa-askep"
k = 10              # passed to the retriever and used as the evaluation cutoff rank
nr_questions = 1000  # number of question/answer pairs requested from the QA index
threshold = 0.5     # minimum cosine similarity for a document to count as covering an intent
alpha = 0.5         # novelty penalty handed to compute_ng
alpha_ndcg_scores = []

# create retriever
simple_retriever = create_simple_retriever(kb_index_name, opensearch_url, k=k)
# load questions and answers
questions_and_answers = load_questions_and_answers_from_opensearch(qa_index_name, opensearch_client, size=nr_questions)

# --- Per-question evaluation ------------------------------------------------
for question, ground_truth_answer in questions_and_answers:
    # use the keys of the ground-truth answer (the section titles) as intents
    intents = list(ground_truth_answer.keys())
    # embed section titles
    intents_embeddings = embed_text_with_openai(intents)
    # retrieve the top k documents
    docs = simple_retriever.invoke(question)
    # embed the documents
    doc_embeddings = embed_text_with_openai([doc.page_content for doc in docs])
    # compute the similarity matrix (intents x retrieved documents)
    I = compute_similarity_matrix(intents_embeddings, doc_embeddings, threshold=threshold)
    # compute ng for each rank. compute_ng takes a 1-indexed rank, so rank r is
    # stored at ng[r - 1], which is the layout compute_alpha_ndcg reads.
    # Ranks beyond the number of retrieved documents keep a gain of 0, so a
    # retriever returning fewer than k documents does not raise an IndexError.
    ng = np.zeros(k)
    for rank in range(1, min(k, I.shape[1]) + 1):
        ng[rank - 1] = compute_ng(I, alpha, rank)
    # compute the discounted score for this question
    alpha_ndcg = compute_alpha_ndcg(ng, k)
    alpha_ndcg_scores.append(alpha_ndcg)

In [None]:
# Bare expression as the last statement of the cell: when run in a notebook,
# this shows the list of per-question scores collected by the loop above.
alpha_ndcg_scores