In [1]:
# Used to access the saved Chroma vector database. 
from langchain_community.vectorstores import Chroma

# Used to embed the user's query with text-embedding-3-small 
# and send it to gpt-4o-mini.
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

# Used to get the OpenAI API key from environment variables so that 
# it's not visible in this code.
import os

In [3]:
# Check that the OpenAI API key is available. It is read from the environment
# so that it never appears in this notebook.
# Writing os.environ.get(...) back into os.environ does nothing when the
# variable is set, and raises an unhelpful "str expected, not NoneType"
# TypeError when it is missing, so check for it explicitly and fail with a
# message that says what to do.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "The OPENAI_API_KEY environment variable is not set. "
        "Set it before starting the notebook kernel."
    )

# I chose not to use Pinecone, as it adds considerable complication to the code.

# Sets the directory of the Chroma DB that's being loaded from.
# At present, it's just "Chroma", but I may make separate DBs for different embedding models.
CHROMA_PATH = "Chroma"

In [4]:
# Skeleton of the prompt that is sent to the LLM. It instructs the model to
# answer from the supplied context and carries two placeholders that are
# filled in later via str.format():
#   {context}  - text of the relevant database chunk(s)
#   {question} - the user's query
PROMPT_TEMPLATE = (
    "\n"
    "Answer the question based only on the following context:\n"
    "\n"
    "{context}\n"
    "\n"
    "---\n"
    "\n"
    "Answer the question based on the above context: {question}\n"
)

In [10]:
# Name of the embedding model used to vectorise the user's query.
# LangChain makes it easy to swap embedding models by changing just this
# value, but it MUST match the model that was used to build the vector
# database in Chroma.py. A mismatch will have significant negative
# consequences and could render the chatbot inoperable.
EMBEDDING_MODEL = "text-embedding-3-small"

# Embedding client, authenticated with the API key from the environment.
embedder = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=os.environ["OPENAI_API_KEY"])

In [None]:
# Load the persisted vector database.
# Guard against a missing or mistyped directory first: without this check a
# wrong path would not fail here, and every later query would simply come back
# with no context instead of an obvious error.
if not os.path.isdir(CHROMA_PATH):
    raise FileNotFoundError(
        f"Chroma database directory '{CHROMA_PATH}' was not found. "
        "Build the vector database first (see Chroma.py) or fix CHROMA_PATH."
    )

db = Chroma(persist_directory = CHROMA_PATH, embedding_function = embedder)

In [32]:
# The user's question. Editing this string is currently the only way of
# querying the DB and LLM.
# TODO: look into "Streamlit", a package which provides a web interface for
#       user input.
# TODO: alternatively, expose this as a bot on a messaging service (Teams,
#       Discord) and let queries flow in from there. (No research done yet on
#       Teams or Discord LLM bots.)
query = (
    "What university is this? "
    "How do they protect my data? "
    "What are their disability support arrangements?"
)

In [33]:
# Number of database chunks to retrieve and place in the prompt's context.
TOP_K = 3

# Finds the TOP_K chunks that the user's query most likely applies to.
# Each result is a (document, relevance score) pair.
results = db.similarity_search_with_relevance_scores(query, k=TOP_K)

# With no chunks the prompt would carry an empty context, so make that visible
# instead of letting it pass silently.
if not results:
    print("WARNING: no chunks were retrieved for this query; the prompt context will be empty.")

# Prints the relevance score of each chunk. A higher score means a closer
# match, and the scores printed by this cell are in descending order, i.e. the
# best chunk comes first.
for _doc, score in results:
    print(score)

# Creates the {context} previously seen in the PROMPT_TEMPLATE.
# Separates the retrieved chunks with new lines and dashes.
context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)

# Formats the prompt with plain str.format(): the document context goes in
# place of {context}, and the user query in place of {question}.
prompt = PROMPT_TEMPLATE.format(context=context_text, question=query)
print(prompt)

0.40908483090004344
0.40572125950778304
0.39656313766384355

Answer the question based only on the following context:

13
Terms and conditions for taught and research students
y. The University’s Safeguarding Policy sets out the University’s serious 
commitment to safeguarding matters. Safeguarding is everyone’s 
responsibility, and any safeguarding concerns should be raised under the 
policy. The University will take any necessary action to fulfil any safeguarding 
responsibilities it has.
DISABILITY SUPPORT AND  
REASONABLE ADJUSTMENTS
21. The University is committed to providing an inclusive and accessible

---

studies in line with the University’s Data Protection Policy. The University’s 
Privacy Notices which explains what happens to any personal data you 
provide to us or that we collect from you (including your data subject rights) 
and the University’s Graduation Ceremonies Privacy Notice which concerns 
the live streaming of graduation ceremonies at the University.
t. The Uni

In [34]:
# Create the LLM object.
## Uses GPT-4o-mini for cost efficiency.
## Temperature is set to 0; it can be changed to vary the LLM's responses.
## Loads the API key from the system environment variables.
llm = ChatOpenAI(
    model = "gpt-4o-mini",
    temperature = 0,
    api_key = os.environ["OPENAI_API_KEY"]
)

# Gets the LLM's response.
# predict() is deprecated in LangChain; invoke() is its replacement. invoke()
# returns a message object, so take .content to get the reply text that
# predict() used to return directly.
response = llm.invoke(prompt).content

# Collects, for each retrieved chunk, the document it was found in
# (None when a chunk has no "source" metadata).
sources = [doc.metadata.get("source", None) for doc, _score in results]

# Saves and outputs the LLM's response, and the sources used to generate it.
formatted_response = f"Response: {response}\nSources: {sources}"
print(formatted_response)

Response: The context provided does not specify the name of the university. However, it indicates that the university is committed to safeguarding matters, providing an inclusive and accessible environment for students, and has a Data Protection Policy in place. 

To protect your data, the university informs you about how your personal data, including sensitive personal data, will be used during the application process and for related purposes, ensuring compliance with data protection laws. They also have Privacy Notices that explain the handling of personal data and your rights regarding it.

Regarding disability support arrangements, the university is committed to providing inclusive and accessible studies, which suggests that they have measures in place to support students with disabilities, although specific details are not provided in the context.
Sources: ['Data\\Policies\\StudentContract.pdf', 'Data\\Policies\\StudentContract.pdf', 'Data\\Policies\\StudentContract.pdf']
