# Mini Project

In [1]:
import logging

from storage_functions import create_astra_vstore, load_pdf, chunk_docs, load_docs_from_db
from eval_functions import make_test_set, add_llm_answers_to_dataset, evaluate_dataset
from explain_functions import format_answer
from llm_functions import perform_rag

# Embedding Models
from langchain_openai import OpenAIEmbeddings

# LLM Models
from langchain_openai import ChatOpenAI

# Explainability
from IPython.display import HTML


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jmachalek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
logging.basicConfig(level=logging.WARNING)
logging.info("logging info")


INFO:root:logging info


In [11]:
# Predeclare values
file_path = "aha_hcm_guidance_2020.pdf"
emb_model = "text-embedding-3-small"
openai_embedding = OpenAIEmbeddings(model = emb_model)
llm = ChatOpenAI(model = "gpt-3.5-turbo")
collection_name = "aha_guidelines"
score_threshold = 0.7


In [4]:
def vstore_pipeline():
    """
    Performs a pipeline of operations to create and populate an Astra VStore.

    Returns:
        astra_vstore (AstraVStore): The created Astra VStore object.
        docs (list): The list of loaded PDF documents.
    """
    astra_vstore = create_astra_vstore(embedding=openai_embedding, collection_name=collection_name)
    logging.info("created vstore")
    astra_vstore.clear()
    docs = load_pdf(file_path)
    logging.info("loaded pdf")
    chunks = chunk_docs(docs, chunks_per_page=3, embedding=openai_embedding)
    logging.info("docs chunked")
    astra_vstore.add_documents(chunks)
    return astra_vstore, docs

In [17]:
def eval_pipeline(vstore, docs):
    """
    Generate a test set using the docs and evaluate it. Output results to xlsx file.
    
    Args:
        vstore (VStore): The vstore object used as a retriever.
        docs (list): The list of documents to evaluate.
    
    Returns:
        dict: A dictionary containing the averaged evaluation results.
    """
    retriever = vstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={'score_threshold': score_threshold})
    test_set = make_test_set(docs, test_size=10, llm=llm, embedding=openai_embedding)
    test_set = add_llm_answers_to_dataset(test_set, retriever, llm)
    averaged_results = evaluate_dataset(test_set)
    logging.info("The averaged results are: {}".format(averaged_results))
    return averaged_results

In [6]:
vstore, docs = vstore_pipeline()

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:astrapy.db:ASTRA_DB_KEYSPACE is not set. Defaulting to 'default_keyspace'
INFO:astrapy.db:ASTRA_DB_KEYSPACE is not set. Defaulting to 'default_keyspace'
INFO:httpx:HTTP Request: POST https://1da26773-d99d-4e8f-a3ce-5d34f0405b09-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace "HTTP/1.1 200 OK"
INFO:root:created vstore
INFO:httpx:HTTP Request: POST https://1da26773-d99d-4e8f-a3ce-5d34f0405b09-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/aha_guidelines "HTTP/1.1 200 OK"
INFO:root:loaded pdf
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.op

In [14]:
retriever = vstore.as_retriever(search_type = "similarity_score_threshold", search_kwargs = {'score_threshold': score_threshold})
question = "If my patient has hypertrophic cardiomyopathy what else are they susceptible to?"
answer, context = perform_rag(question, llm, retriever)
display(HTML(format_answer(question, answer, vstore, score_threshold)))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://1da26773-d99d-4e8f-a3ce-5d34f0405b09-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/aha_guidelines "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://1da26773-d99d-4e8f-a3ce-5d34f0405b09-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/aha_guidelines "HTTP/1.1 200 OK"


In [18]:
avg_result = eval_pipeline(vstore, docs)

embedding nodes:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/

Generating:   0%|          | 0/10 [00:00<?, ?it/s]

ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-612' coro=<AsyncClient.aclose() done, defined at c:\Users\jmachalek\.virtualenvs\mini_project-kSexw87t\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "c:\Users\jmachalek\.virtualenvs\mini_project-kSexw87t\Lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "c:\Users\jmachalek\.virtualenvs\mini_project-kSexw87t\Lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "c:\Users\jmachalek\.virtualenvs\mini_project-kSexw87t\Lib\site-packages\httpcore\_async\connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "c:\Users\jmachalek\.virtualenvs\mini_project-kSexw87t\Lib\site-packages\httpcore\_async\connection_pool.py", line 305, in _close_connections
    await connection.aclose()
  F

Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

In [28]:
display(avg_result)

{'context_precision': 1.0000, 'context_recall': 1.0000, 'answer_correctness': 0.7652}

In [None]:
#vstore.delete_collection()