In [5]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

In [6]:
def load_vector_store(path, model_name="BAAI/bge-large-en-v1.5", device="cuda"):
    """Build a Chroma vector store pointed at ``path`` with HuggingFace embeddings.

    Args:
        path: Directory handed to ``Chroma`` as ``persist_directory``.
        model_name: HuggingFace model identifier used for the embedding
            function. NOTE(review): presumably this has to be the same model
            the persisted index was built with -- confirm before overriding.
        device: Device string forwarded to the embedding model through
            ``model_kwargs``. Defaults to ``"cuda"`` (the previous hard-coded
            value); pass e.g. ``"cpu"`` on machines without a GPU.

    Returns:
        The ``Chroma`` instance configured with the embedding function.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
    )

    # Return the store directly; the old local variable reused the function's
    # own name, which shadowed it inside the body.
    return Chroma(
        persist_directory=path,
        embedding_function=embeddings,
    )

In [7]:
def query_vector_store(vector_store, query, k=3):
    """Run a scored similarity search for ``query`` against ``vector_store``.

    Args:
        vector_store: Object exposing ``similarity_search_with_score``.
        query: Query passed positionally to the search call.
        k: Forwarded to the search call as the ``k`` keyword argument.

    Returns:
        Whatever ``similarity_search_with_score`` returns, unchanged. Callers
        in this notebook iterate it as ``(document, score)`` pairs.
    """
    # NOTE(review): for Chroma this score is documented as a distance
    # (lower = closer) -- confirm before comparing it against a threshold.
    return vector_store.similarity_search_with_score(query, k=k)

In [8]:
def filter_results_by_score(results, threshold=0.3, lower_is_better=False):
    """Keep only the ``(document, score)`` pairs that pass ``threshold``.

    Args:
        results: Iterable of ``(doc, score)`` pairs; each ``doc`` must expose a
            ``metadata`` attribute (it is printed when the pair is skipped).
        threshold: Cut-off the score is compared against. A score exactly
            equal to the threshold is always kept.
        lower_is_better: Direction of the score.
            ``False`` (default, original behaviour): pairs with
            ``score < threshold`` are skipped, i.e. the score is treated as a
            similarity. ``True``: pairs with ``score > threshold`` are skipped,
            i.e. the score is treated as a distance.
            NOTE(review): Chroma's ``similarity_search_with_score`` is
            documented as returning a distance, so callers feeding its output
            here probably want ``lower_is_better=True`` -- confirm.

    Returns:
        A new list with the surviving ``(doc, score)`` tuples in input order.
    """
    kept = []
    for doc, score in results:
        if lower_is_better:
            rejected = score > threshold
        else:
            rejected = score < threshold

        if rejected:
            print(f"SKIPPING: Document: {doc.metadata} Score: {score}")
        else:
            kept.append((doc, score))
    return kept

In [9]:
def do_rag(vector_store, query, k=3, score_threshold=0.3):
    """Search ``vector_store`` for ``query`` and filter the hits by score.

    Args:
        vector_store: Store handed to ``query_vector_store``.
        query: Query handed to ``query_vector_store``.
        k: Forwarded to ``query_vector_store`` as ``k``.
        score_threshold: Forwarded to ``filter_results_by_score`` as
            ``threshold``.

    Returns:
        The list produced by ``filter_results_by_score``.
    """
    return filter_results_by_score(
        query_vector_store(vector_store, query, k=k),
        threshold=score_threshold,
    )

In [16]:
# Build the store from the Chroma directory on disk.
# NOTE(review): this is an absolute, machine-specific path -- the cell will not
# run on another machine as-is; consider a configurable base directory.
vector_store = load_vector_store("/home/kuba/projects/usc-policy-knowledge-base/db/chroma")
# Example question used to exercise the retrieval pipeline.
query = "Can i be in a class room before the next class starts?"
# Last expression of the cell: the filtered (document, score) list is displayed
# as the cell output.
do_rag(vector_store, query, k=3, score_threshold=0.3)

[(Document(metadata={'source': '/home/kuba/projects/usc-policy-knowledge-base/data/usc-policy-docs/md/acaf315.md'}, page_content='ADMINISTRATIVE DIVISION POLICY NUMBER ACAF Academic Affairs ACAF 3.06 POLICY TITLE Classroom Scheduling SCOPE OF POLICY DATE OF REVISION/REVALIDATED USC System December 10, 2024 RESPONSIBLE OFFICER ADMINISTRATIVE OFFICE Executive Vice President for Academic Affairs and University Registrar Provost\n\nPURPOSE\n\nThe purpose of this policy is to optimize access to classes for students and utilization of classroom space.\n\nDEFINITIONS\n\nClassrooms: space not equipped exclusively for labs or seminars that is used for instruction and is defined as such for the purpose of federal and state reporting. To be so classified, at least 50% of its use must be for regularly scheduled course instruction. The University centrally schedules all general-purpose classroom space, although academic units may have priority scheduling rights in selected rooms.\n\nStandard Meetin