In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

DATA_PATH = "data/"
DB_FAISS_PATH = "vectorstores/db_chroma"

In [62]:
loader = DirectoryLoader(DATA_PATH , glob = '*.pdf' , loader_cls = PyPDFLoader)
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 5000 , chunk_overlap = 15)
texts = text_splitter.split_documents(documents)

In [63]:
print(loader)

<langchain.document_loaders.directory.DirectoryLoader object at 0x000001BA450F5640>


In [64]:
print(documents)



In [65]:
print(len(texts))

140


In [66]:
count = 0
for text in texts:
    print(text)
    print("************************")

page_content='Download free eBooks of classic literature, books and \nnovels at Planet eBook. Subscribe to our free eBooks blog \nand email newsletter.Oliver T wist\nBy Charles Dickens' metadata={'source': 'data\\Oliver Twist.pdf', 'page': 0}
************************
page_content='Oliver Twist \x18CHAPTER I  \n \nTREATS OF THE PLACE \nWHERE OLIVER TWIST \nW AS BORN AND OF \nTHE CIRCUMSTANCES \nATTENDING HIS BIRTH\nAmong other public buildings in a certain town, which \nfor many reasons it will be prudent to refrain from men -\ntioning, and to which I will assign no fictitious name, there \nis one anciently common to most towns, great or small: to \nwit, a workhouse; and in this workhouse was born; on a day \nand date which I need not trouble myself to repeat, inas -\nmuch as it can be of no possible consequence to the reader, \nin this stage of the business at all events; the item of mortal -\nity whose name is prefixed to the head of this chapter.\nFor a long time after it was ushered

In [68]:
embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2' , model_kwargs = {'device' : 'cpu'})

In [69]:
print(embeddings)

client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
) model_name='sentence-transformers/all-MiniLM-L6-v2' cache_folder=None model_kwargs={'device': 'cpu'} encode_kwargs={} multi_process=False


In [70]:
# Embed and store the texts
# Supplying a persist directory will store the embeddings on disk

persist_directory = "vectorstores\db_chroma"

vectordb = Chroma.from_documents(documents = texts , 
                                 embedding = embeddings ,
                                persist_directory = persist_directory)


In [71]:
# Persist the db to disk
vectordb.persist()
vectordb = None

In [73]:
# Now we can load the persisted database from the disk and use it as normal

vectordb = Chroma(persist_directory = persist_directory , 
                 embedding_function = embeddings)

In [74]:
vectordb

<langchain.vectorstores.chroma.Chroma at 0x1ba3e45a2b0>