In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma

In [2]:
# Collect every .txt file found (recursively) under the essays folder and
# load each one as a document; a progress bar is shown while loading.
loader = DirectoryLoader(
    './PaulGrahamEssaysLarge/',
    glob="**/*.txt",
    show_progress=True,
)
docs = loader.load()

100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [01:09<00:00,  1.42s/it]


In [3]:
# Break the loaded documents into chunks of at most 500 characters each,
# with no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
splits = text_splitter.split_documents(docs)

# Report how many chunks the documents produced.
print(f"Your {len(docs)} documents have been split into {len(splits)} chunks")

Your 49 documents have been split into 1421 chunks


In [4]:
# Embedding model used to vectorise every chunk (and, later, each query).
embedding = HuggingFaceEmbeddings(
    model_name = 'all-MiniLM-L6-v2'
)

# Single source of truth for where the Chroma collection lives on disk.
PERSIST_DIR = "./db/"

# Start from an empty collection so that re-running this cell never appends a
# second copy of every chunk. A live `db` left over from an earlier run of the
# cell is reused when present; on a fresh kernel (Restart & Run All) there is
# no such variable, so the collection that may still be persisted under
# PERSIST_DIR is opened instead and cleared as well.
if 'db' in globals():
    stale_db = db
else:
    stale_db = Chroma(persist_directory=PERSIST_DIR, embedding_function=embedding)
stale_db.delete_collection()

# Embed all chunks and store them in a collection persisted under PERSIST_DIR.
db = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=PERSIST_DIR
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Retrieve the three chunks most similar to the question, echo each one, and
# accumulate them (source path followed by the chunk text) into `context`.
question = "What is the authors view on the early stages of a startup?"
context = ""
rule = "------------------------"

for doc in db.similarity_search(question, k=3):
    source = doc.metadata['source']
    # Collapse the chunk onto a single line by replacing newlines with spaces.
    flattened = " ".join(doc.page_content.split("\n"))

    print(source, rule, flattened, rule, sep="\n")

    context += source + "\n" + flattened + "\n\n"

PaulGrahamEssaysLarge\startuplessons.txt
------------------------
April 2006(This essay is derived from a talk at the 2006  Startup School. )The startups we've funded so far are pretty quick, but they seem  quicker to learn some lessons than others. I think it's because  some things about startups are kind of counterintuitive.We've now  invested  in enough companies that I've learned a trick  for determining which points are the counterintuitive ones:  they're the ones I have to keep repeating.So I'm going to number these points, and maybe with future startups
------------------------
PaulGrahamEssaysLarge\startuplessons.txt
------------------------
business risk, accelerate time-to-value and sustain lower total  cost of ownership.  An established company may get away with such an opaque description,  but no startup can. A startup  should be able to explain in one or two sentences exactly what it  does.  [4]  And not just to users. You need this for everyone:  investors, acquirers, par

In [6]:
# Prompt layout: instructions, a blank line, the retrieved context, then the
# question and an "Answer:" cue. The backslash after {context} is a line
# continuation inside the string literal, so no newline is inserted between
# the context and "Question:" -- any separation must come from the context
# text itself.
prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}\
Question: {question}
Answer:"""

# Wrap the raw template, declaring the two placeholders it expects.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [7]:
# Fill the template with the retrieved context and the question, then show
# the resulting prompt text.
rendered_prompt = PROMPT.format_prompt(context=context, question=question)
print(rendered_prompt.to_string())

Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

PaulGrahamEssaysLarge\startuplessons.txt
April 2006(This essay is derived from a talk at the 2006  Startup School. )The startups we've funded so far are pretty quick, but they seem  quicker to learn some lessons than others. I think it's because  some things about startups are kind of counterintuitive.We've now  invested  in enough companies that I've learned a trick  for determining which points are the counterintuitive ones:  they're the ones I have to keep repeating.So I'm going to number these points, and maybe with future startups

PaulGrahamEssaysLarge\startuplessons.txt
business risk, accelerate time-to-value and sustain lower total  cost of ownership.  An established company may get away with such an opaque description,  but no startup can. A startup  should be able to explain in one or two sentences exactly what it 