In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_community.vectorstores import Chroma
import os

In [6]:
loader = DirectoryLoader(
    path="Sherlock",
    glob="*.txt",
    loader_cls=lambda path: TextLoader(path, encoding="utf-8"),
    show_progress=True
)
docs = loader.load()

100%|██████████| 10/10 [00:00<00:00, 608.53it/s]


In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=1500,
    chunk_overlap=150,
    is_separator_regex=True,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
)
splits = text_splitter.split_documents(docs)

In [8]:
encode_kwargs = {"normalize_embeddings": True}

embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs=encode_kwargs,
#    model_kwargs=model_kwargs,
    show_progress=True
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=embeddings,
    persist_directory="./data/chroma_db"
)
vector_store.add_documents(splits)

  vector_store = Chroma(
Batches: 100%|██████████| 113/113 [1:07:14<00:00, 35.70s/it]


['68def9ab-5d77-46ee-a65e-e5bcc8df7287',
 '9d9af54e-cbbb-41d5-897b-3574011ad099',
 'b02a5012-4a0b-4a1e-acb6-9e8fb27bdb65',
 '4e666945-d29a-4d7a-8ecc-a33743c20f0c',
 'f750f49c-dbde-4c12-a248-634ae936a8ee',
 '92fd134e-96d9-48f9-a4cb-040f64754736',
 '996b25e7-6dd5-471d-bc15-3b123518df81',
 '65d8c1c0-9021-417b-9290-a688f0608837',
 'b1cc203d-3679-48a6-884e-837bce36c57a',
 'd0fcfb86-c698-4e43-a9a9-58215c9e5369',
 '13c38962-44eb-4f4a-a0fc-1c8375fbccab',
 '61e52afd-a931-4807-9aa7-5e66544158fc',
 'ed86bb14-3d5e-45b1-9a40-d54514864a6e',
 'e84e7da9-0b02-42eb-ade4-490c5d615378',
 '378322a5-bd6a-4f64-ac2d-48095534cdb0',
 '43db87e5-4701-4e16-a348-a42c055582c1',
 '20e6189a-a834-4017-b895-1bfb0efb7306',
 'bb095523-cc6a-4be1-9fcd-2d11e903d7a2',
 'dcb70bff-35e0-414b-be44-9b6bcecc2d03',
 '909574a6-90d8-44db-8cd1-cf6ed55d7805',
 'ccea143e-18c4-43ef-b261-4d82ba63848b',
 'a906758d-4984-46c2-afb4-dc96d5977475',
 '1d747ed7-d8e0-4ac0-bfdd-2496a5eb3ab7',
 '63036b0c-f92d-4b8d-ac41-78946893ff1b',
 '78b83dec-dd6f-

In [11]:
results = vector_store.similarity_search(
    query="What is Scarlet?",
    k=4
)

for res in results:
    print(f"{res.page_content} | {res.metadata}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.51it/s]

The Project Gutenberg eBook of A Study in Scarlet
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: A Study in Scarlet

Author: Arthur Conan Doyle

Release date: April 1, 1995 [eBook #244]
                Most recently updated: April 17, 2023

Language: English

Credits: Roger Squires and David Widger


*** START OF THE PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***




A STUDY IN SCARLET

By A. Conan Doyle




CONTENTS

 A STUDY IN SCARLET.

 PART I.
 CHAPTER I. MR. SHERLOCK HOLMES.
 CHAPTER II. THE SCIENCE OF DEDUCTION.
 CHAPTER III. THE LAURISTON GARDENS MYSTERY
 CHAPTER IV. WHAT JOHN




In [12]:
results = vector_store.similarity_search(
    query="is the hound of the baskervilles real?",
    k=2
)

for res in results:
    print(f"{res.page_content} | {res.metadata}")

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.98it/s]

“And this Cavalier opposite to me—the one with the black velvet
      and the lace?”

      “Ah, you have a right to know about him. That is the cause of all
      the mischief, the wicked Hugo, who started the Hound of the
      Baskervilles. We’re not likely to forget him.”

      I gazed with interest and some surprise upon the portrait.

      “Dear me!” said Holmes, “he seems a quiet, meek-mannered man
      enough, but I dare say that there was a lurking devil in his
      eyes. I had pictured him as a more robust and ruffianly person.”

      “There’s no doubt about the authenticity, for the name and the
      date, 1647, are on the back of the canvas.”

      Holmes said little more, but the picture of the old roysterer
      seemed to have a fascination for him, and his eyes were
      continually fixed upon it during supper. It was not until later,
      when Sir Henry had gone to his room, that I was able to follow
      the trend of his thoughts. He led me back into the
   


