In [2]:
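# Uncomment to install the dependencies into the kernel's environment (%pip targets the running kernel, unlike !pip):
# %pip install langchain langchain-text-splitters lxml tiktoken faiss-cpu sentence-transformers langchain-community langchain-openai python-dotenv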

In [1]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [2]:
# Directory whose entries are read and chunked below; the relative path is resolved
# against the notebook's current working directory.
data_dir = "./philosophers"

In [3]:
# Build the splitter once: it does not depend on the file being processed.
# chunk_size / chunk_overlap are counted in tiktoken tokens. chunk_size is a target,
# not a hard cap -- the run recorded below logged several chunks longer than 1024.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1024, chunk_overlap=128,
)

# os.listdir returns entries in arbitrary order; sorting keeps the chunk order
# (and therefore the index built from it) identical from run to run.
files = sorted(os.listdir(data_dir))

# One Document per chunk, tagged with the source title and the chunk's position in it.
file_texts = []
for file in files:
    file_path = os.path.join(data_dir, file)
    # Skip hidden entries and sub-directories (e.g. .DS_Store, .ipynb_checkpoints):
    # they are not source texts, and open() cannot read a directory.
    if file.startswith(".") or not os.path.isfile(file_path):
        continue
    # Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
    with open(file_path, encoding="utf-8") as f:
        file_text = f.read()
    # Title = file name without its final extension, so names that themselves
    # contain dots are not cut at the first one.
    doc_title = os.path.splitext(file)[0]
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(
            Document(
                page_content=chunked_text,
                metadata={"doc_title": doc_title, "chunk_num": i},
            )
        )

Created a chunk of size 1225, which is longer than the specified 1024
Created a chunk of size 1052, which is longer than the specified 1024
Created a chunk of size 1787, which is longer than the specified 1024
Created a chunk of size 1149, which is longer than the specified 1024
Created a chunk of size 1193, which is longer than the specified 1024
Created a chunk of size 2094, which is longer than the specified 1024
Created a chunk of size 1324, which is longer than the specified 1024
Created a chunk of size 1717, which is longer than the specified 1024


In [4]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [5]:
# Load the sentence-transformers model that will embed both the indexed chunks and the
# incoming queries (nothing is embedded on this line). The model is named explicitly
# rather than left to the library's implicit default, so a change of that default in a
# later release cannot silently alter the vectors and the retrieval results.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [6]:
# Fail fast with a readable message: with zero documents the index construction
# otherwise fails inside the library with an unhelpful error.
if not file_texts:
    raise ValueError(
        f"No documents to index; check that {data_dir!r} contains readable text files."
    )

# Embed every chunk and store the vectors, together with their Documents, in a FAISS index.
vector_store = FAISS.from_documents(
    file_texts,
    embedding=embeddings,
)

In [7]:
# Expose the index as a retriever. The arguments spell out the library defaults:
# plain similarity search returning the 4 nearest chunks per query.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [9]:
from dotenv import load_dotenv

# Read key=value pairs from a local .env file into the process environment, which keeps
# secrets out of the notebook itself.
# NOTE(review): presumably this is where the OpenAI client created below gets its
# OPENAI_API_KEY -- confirm the .env file defines it.
load_dotenv()

True

In [10]:
from langchain_openai import OpenAI
# Completion-style (non-chat) OpenAI model used to generate the answers.
# The model name written out here is the library's default for this class.
llm = OpenAI(model="gpt-3.5-turbo-instruct")

In [11]:
from langchain.prompts import ChatPromptTemplate
# RAG prompt: persona and answering rules first, then the user's question and the
# retrieved context. Written as adjacent literals, one sentence or field per line;
# note the single space kept before the newline after each placeholder.
template = (
    "You are a enthusiast on philosophy teaching about well known philosophers. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:"
)
# NOTE(review): `llm` is a completion-style OpenAI model while this is a chat prompt
# template -- consider pairing PromptTemplate with OpenAI, or ChatPromptTemplate with
# a chat model.
prompt = ChatPromptTemplate.from_template(template)

In [12]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
def format_docs(docs):
    """Join the text of the retrieved chunks into one plain-text context string.

    Args:
        docs: iterable of retrieved Document objects (anything with ``page_content``).

    Returns:
        The chunks' ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: the incoming question is (a) sent to the retriever, whose hits are
# flattened to plain text, and (b) passed through unchanged; both fill the prompt,
# which goes to the LLM, and the result is parsed to a plain string.
# Without the format_docs step the list of Document objects would be interpolated into
# the prompt via its repr, i.e. wrapped in Document(...) noise with metadata included.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
# Run the whole chain for one question; the result is the parsed answer string.
question = "When did Plato live?"
response = chain.invoke(question)

In [17]:
# Bare expression on the cell's last line, so the notebook displays the answer string.
response

' Plato lived between 428 and 348 BC.'