In [None]:
from cag.embeddings import SentenceTransformerEmbeddings
from cag.models import ChatOllama

embeddings_model = SentenceTransformerEmbeddings('sentence-transformers/all-mpnet-base-v2')
model = ChatOllama(model = 'llama3.2', temprature = 0.01)

  from tqdm.autonotebook import tqdm, trange





In [47]:
import numpy as np

def answer_relevancy(generated_answer, original_query):
    
    prompt = """i will give you a answer , please generate three question which we can derive from that answer.
    use this format for generation :  start the generation with "---" and end it with "---" too ; between the questions you should include "---" as well . like this format bellow : 
    
    ---
    Question number 1
    ---
    Question number 2 
    ---
    Question number 3
    --- 
    
    Here is the answer : 
    Answer : {answer}"""
    
    prompt = prompt.format(answer=generated_answer)
    
    
    generated_questions = model.invoke(prompt).content

    
    generated_questions = [item for item in generated_questions.split('---') if len(item) > 7]

    #embed the question
    generated_questions = [embeddings_model.embed_query(question) for question in generated_questions]
    
    #embed the query
    original_query = embeddings_model.embed_query(original_query)
    
    generated_questions, original_query = np.array(generated_questions), np.array(original_query)
    
    # Normalize vectors
    vec1_norm = original_query / np.linalg.norm(original_query)
    vec_list_norm = generated_questions / np.linalg.norm(generated_questions, axis=1, keepdims=True)

    # Compute cosine similarity
    cosine_sim = np.dot(vec_list_norm, vec1_norm)
    
    return np.mean(cosine_sim)
    

In [55]:
def context_relevancy(retrieved_context, original_query):
    
    prompt = """this is a context relevancy test. for the given context and question , extract each sentence of the context and determine if that sentence can potentially be helpful to answer the question. for every sentence , describe the relevancy of that sentence and answer in YES or NO terms which that sentence can be helpful to answer the question or not. 
    
    use this format : 
    
    Sentence  : a simple description of relevancy to the question : YES or NO
    
    Here is Question 
    Question : {query}
    
    Here is the Context :
    Context : {context}"""
    
    prompt = prompt.format(query = original_query, context = retrieved_context)
    
    output = model.invoke(prompt).content
    
    output = output.lower()
    
    score = output.count('yes') / (output.count('yes') + output.count('no'))
    
    return score

# Loading CRSB and SQUAD

In [65]:
import json 
import datasets

with open('F:\\OneDrive\\Desktop\\Research\\Dataset\\CRSB-Texts.json', 'r') as f:
    crsb = json.load(f)
    
squad = datasets.load_dataset('rajpurkar/squad')

In [66]:
crsb = crsb['amazon_rainforest']


In [67]:
squad = squad['validation'].shuffle()

In [70]:
squad = squad[:100]

#this makes squad a dict like object with keys and values , values are lists

In [74]:
print(crsb.keys())
print(squad.keys())

dict_keys(['contents', 'questions'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])


In [75]:
print(len(crsb['contents']))
print(len(squad['question']))

100
100


In [79]:
contexts = crsb['contents']
questions = squad['question']

# RAG Evaluation on CRSB + SQUAD

In [80]:
from langchain.vectorstores import FAISS

retriever = FAISS.from_texts(texts=contexts,
                             embedding= embeddings_model)

In [84]:
from time import time

crs = []
ars = []

for i, question in enumerate(questions):
    
    start = time()
    retrieved_context = retriever.similarity_search(query=question, k =1)
    ar = answer_relevancy(retrieved_context, question)
    cr = context_relevancy(retrieved_context, question)
    
    crs.append(cr)
    ars.append(ar)
    
    end = time()
    print(f'Question {i} processed in {end - start} seconds')
    print(f'CR score: {cr}, AR score: {ar}')


Question 0 processed in 49.16196537017822 seconds
CR score: 0.0, AR score: 0.1571698437275094
Question 1 processed in 48.54631495475769 seconds
CR score: 0.0, AR score: 0.29480998367364086
Question 2 processed in 38.27165222167969 seconds
CR score: 0.0, AR score: 0.33107373609736607
Question 3 processed in 31.744757175445557 seconds
CR score: 0.0, AR score: 0.28770980683478337
Question 4 processed in 47.549818992614746 seconds
CR score: 0.0, AR score: 0.11949658150158697
Question 5 processed in 43.27785325050354 seconds
CR score: 0.0, AR score: 0.11400199500216968
Question 6 processed in 46.08327388763428 seconds
CR score: 0.0, AR score: 0.09142471596089385
Question 7 processed in 38.67910647392273 seconds
CR score: 0.0, AR score: 0.3215760763718075
Question 8 processed in 52.183584451675415 seconds
CR score: 0.0, AR score: 0.17627196068181208
Question 9 processed in 48.641218423843384 seconds
CR score: 0.0, AR score: 0.1451801908734139
Question 10 processed in 40.68730044364929 second

In [None]:
from time import time

crs = []
ars = []

for i, question in enumerate(questions):
    
    start = time()
    retrieved_context = retriever.similarity_search(query=question, k =1)
    ar = answer_relevancy(retrieved_context, question)
    cr = context_relevancy(retrieved_context, question)
    
    crs.append(cr)
    ars.append(ar)
    
    end = time()
    print(f'Question {i} processed in {end - start} seconds')
    print(f'CR score: {cr}, AR score: {ar}')


In [85]:
ars, crs = np.array(ars), np.array(crs)

print(f'ARs mean : {np.mean(ars)}')
print(f'CRs mean : {np.mean(crs)}')

ARs mean : 0.20629166428308185
CRs mean : 0.016049019607843138


# CAG Evaluation on CRSB + SQUAD