In [9]:
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [10]:
# Load every UTF-8 .txt file in the data folder as LangChain documents.
data_folder = Path("/content/DATA")
documents = []
# sorted(): Path.glob yields entries in arbitrary filesystem order; sorting keeps
# the document order (and everything derived from it) reproducible across runs.
for file_path in sorted(data_folder.glob("*.txt")):
    loader = TextLoader(str(file_path), encoding='utf-8')
    docs = loader.load()
    for doc in docs:
        # Overwrite the source metadata with just the file name (no directory part).
        doc.metadata["source"] = file_path.name
    documents.extend(docs)

# Fail fast with a clear message instead of carrying an empty corpus into the
# cells that split and index it.
if not documents:
    raise FileNotFoundError(f"No .txt documents could be loaded from {data_folder}")

In [11]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=100) ready for embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
chunks = splitter.split_documents(documents)

In [12]:
# Embed the chunks with the BGE model, build a FAISS index from them,
# and persist that index to disk.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
)
db = FAISS.from_documents(documents=chunks, embedding=embeddings)
db.save_local(folder_path="vectorstore/legal_db")

In [14]:
# prompt: load the db from files and do a similarity search

# NOTE(review): allow_dangerous_deserialization=True permits unpickling of the
# stored index data; only point this at an index directory you created yourself.
loaded_db = FAISS.load_local(
    folder_path="vectorstore/legal_db",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

query = "What is the definition of 'personal data'?"
docs = loaded_db.similarity_search(query)

# Print each hit's source file and text, followed by a 50-dash separator line.
for doc in docs:
    print(
        f"Source: {doc.metadata['source']}",
        f"Content: {doc.page_content}",
        "-" * 50,
        sep="\n",
    )

Source: criminal_procedure_code.txt
Content: Of Cheating
415.	Cheating:
Whoever, by deceiving any person, fraudulently or dishonestly induces the person so deceived to deliver any property to any person, or to consent that any person shall retain any property, or intentionally induces the person so deceived to do or omit to do anything which he would not do or omit if he were not so deceived, and which act or omission causes or is likely to cause damage or harm to that person  165[or any other person] 165 in body, mind, reputation or property, is said to "cheat".
Explanation: A dishonest concealment of facts is a deception within the meaning of this section.
--------------------------------------------------
Source: civil_procedure_code.txt
Content: matters of fact from which the same may be implied, and not as a denial of the legality or
sufficiency in law of such contract.
9. Wherever the contents of any document are material, it shall be sufficient in any
pleading to state the effec