In [1]:
import os

import chromadb
import pandas as pd
import requests
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer


In [3]:
# Build the path with os.path.join instead of a hard-coded backslash:
# "\S" is not a valid string escape (Python warns about it) and a literal
# backslash separator only resolves on Windows. On Windows this yields the
# same path string as before.
loader = PyPDFLoader(os.path.join("papers", "Space, Time and Einstein ( PDFDrive ).pdf"))
# Read the PDF into a list of LangChain documents.
document = loader.load()

In [5]:
# Sanity check: number of entries returned by loader.load()
# (presumably one per PDF page — confirm against the PyPDFLoader docs).
len(document)

255

In [8]:
# Break the loaded documents into overlapping chunks for embedding.
# chunk_size / chunk_overlap are expressed in the splitter's length units.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(document)

In [23]:
def cleanText(texts):
    """Return the text of each chunk with newlines replaced by spaces.

    Args:
        texts: iterable of objects exposing a ``page_content`` attribute.

    Returns:
        A list of strings, one per input item, in the same order. Each
        ``page_content`` is converted with ``str()`` and every ``"\\n"`` in it
        is replaced by a single space.
    """
    return [str(chunk.page_content).replace("\n", " ") for chunk in texts]

In [24]:
# Flatten every chunk to a single-line string.
cleanedText = cleanText(texts)
# Preview only the first few chunks; echoing the whole list floods the
# notebook output and bloats the saved file.
cleanedText[:5]

['Space, Time and Einstein',
 'Space, Time and Einstein An Introduction J. B. Kennedy',
 '© J. B. Kennedy 2003 This book is copyright under the Berne Convention. No reproduction without permission.All rights reserved. First published in 2003 by AcumenAcumen Publishing Limited 15A Lewins YardEast StreetChesham  HP5 1HQwww.acumenpublishing.co.uk ISBN: 1-902683-66-8 (hardcover) ISBN: 1-902683-67-6 (paperback) British Library Cataloguing-in-Publication Data A catalogue record for this book is available from the British Library. Designed and typeset by Kate Williams, Abergavenny. Printed and bound by Biddles Ltd., Guildford and King’s Lynn.',
 'For Carole and John Crascall',
 'viiContents Preface and acknowledgements ix Part I: Einstein’s revolution 1 1 From Aristotle to Hiroshima 3 2 Einstein in a nutshell 73 The twin paradox 314 How to build an atomic bomb 405 The four-dimensional universe 506 Time travel is possible 667 Can the mind understand the world? 71 Part II: Philosophical progres

In [26]:
# SentenceTransformer is imported in the notebook's import cell so that all
# dependencies are visible in one place.
# Load the MiniLM sentence-embedding model and encode every cleaned chunk;
# `embeddings` holds one vector per entry of cleanedText.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(cleanedText)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [31]:
# The original `chromadb = chromadb.Client()` overwrote the module name with
# the client, so running this cell a second time failed (a client object has
# no `.Client`). Re-importing the module under an alias always resolves the
# real module, which makes the cell re-runnable. The name `chromadb` is still
# bound to the client because later cells call `chromadb.create_collection`.
import chromadb as chromadb_module

chromadb = chromadb_module.Client()

In [32]:
# NOTE(review): persistDirectory is not passed to chromadb anywhere in the
# visible cells, so the collection is presumably not persisted to this
# location — confirm whether a persistent client was intended.
persistDirectory = "/db/chroma"
# get_or_create_collection returns the existing collection when this cell is
# re-run against the same client, instead of raising because "Einstein"
# already exists as create_collection does.
vectordb = chromadb.get_or_create_collection("Einstein")

In [38]:
# Store each chunk with its precomputed embedding, its source metadata and a
# positional id. documents, metadatas and ids are aligned by index because
# cleanedText was derived one-to-one from texts.
vectordb.add(
    # Some chromadb releases validate `embeddings` as a plain Python list and
    # reject array objects, so convert when the object supports it.
    embeddings=embeddings.tolist() if hasattr(embeddings, "tolist") else embeddings,
    documents=cleanedText,
    metadatas=[chunk.metadata for chunk in texts],
    ids=[f"id+{i}" for i in range(len(texts))],
)

In [45]:
# Ask the collection for the two closest chunks to the question and display
# only the matched document texts from the result.
vectordb.query(query_texts=["Who is einstein"], n_results=2)["documents"]

[['Among physicists, Einstein is at times remembered as a grumpy, cutting and arrogant fellow with little patience for family orcolleagues. He so annoyed his teachers at university that he failed tosecure a job in academia, and had to scramble to find low-payingwork in the Swiss patent office (although some say that being Jewishhurt his chances too). During his twenties in Berne, Einstein was afashionable man about town. His wit and violin playing brought himmany dinner invitations, and he formed a reading group with friendsto study the work of Kant, Schopenhauer and other philosophers. In1905, his miracle year, he published several unrelated papers. Onewas good enough to win a Nobel prize, and another revolutionizedour views of space and time. The 25-year-old patent clerk had remadephysics in his own image. Einstein’s 1905 theory of space and time is now called the  special theory of relativity . The word “relativity” refers to relative speeds and',
  'Einstein’s writings Einstein’s o