In [8]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# Build a Chroma vector index over the May BAU report:
# load the .docx -> split it into chunks -> embed (with an on-disk cache) -> index.

# Local byte store in which computed embedding vectors are persisted.
cache_dir = LocalFileStore("./.cache")

# Split on newlines; chunk_size / chunk_overlap are counted in tiktoken tokens
# (not characters) because the splitter is built via from_tiktoken_encoder.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=300,
    chunk_overlap=30,
)

loader = UnstructuredFileLoader("./files/BAU_Report_May.docx")
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Namespace the cache by embedding-model name. Without a namespace the cache
# key depends only on the text, so switching models later would silently
# return vectors produced by the previous model.
# NOTE: entries cached before the namespace was introduced use un-prefixed
# keys; they are not reused and will be recomputed once.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

# No persist_directory is passed, so the index is rebuilt on every run of
# this cell; only the embedding vectors are reused via the cache above.
vectorstore = Chroma.from_documents(docs, cached_embeddings)


In [7]:
# Look up the indexed chunks most similar to the query text.
results = vectorstore.similarity_search(query="disability support")

# Bare expression so the notebook renders the matches as the cell output.
results

[Document(page_content='New Improvements\nCanvas access improvements by the Resource Team have been completed. The business case for material access improvements, led by Rosemary, is in progress. Holiday pay calculations for staff, led by Rosemary, have achieved payment, with the query ongoing.\nNew Processes\nThe triage process for new DSS students, led by Corinne and Rosemary, has been completed. Test exam readiness, led by Corinne and Rosemary, is also completed. The DSS resource request automation review, led by Rosemary and N Daley-Jones, is in progress with priority. The confidentiality and privacy form completion by the Resource Team is completed. A compulsory training session request for SAR, led by Jo Wilkins and Rosemary, is in progress. An online presentation to DSS Advisors, led by Julie and Rosemary, has been completed. The international student support review by the Resource Team is completed.\nInternational Support\nFaculty engagement with workshops and presentations is 