## Import libraries

In [5]:
import nest_asyncio
import qdrant_client

from llama_index.core import Settings
from llama_index.core import PromptTemplate
from llama_index.llms.openai import OpenAI
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.vector_stores.qdrant import QdrantVectorStore

from IPython.display import Markdown, display
from dotenv import load_dotenv
import os

## Set up asyncio

In [6]:
# `nest_asyncio` is already imported in the imports cell at the top of the
# notebook, so it is not imported a second time here.
#
# Jupyter already runs an asyncio event loop in the kernel; patching it lets
# code that starts its own loop run inside the notebook instead of failing on
# a nested-loop error.
nest_asyncio.apply()

## Define LLM, Embedding model and re-ranker model

In [7]:
# Load variables from a local .env file (if one exists) into the process
# environment. `load_dotenv` was imported but never called, so a key kept in
# .env would otherwise not be visible to os.getenv below.
load_dotenv()

# Chat model used to synthesize the final answer from the retrieved context.
llm = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    model="gpt-4o-mini",
    # The llama-index OpenAI wrapper names this argument `timeout` (seconds);
    # `request_timeout` is the Ollama wrapper's spelling and would not change
    # the OpenAI client's timeout.
    timeout=120,
)

# Embedding model used both to index the documents and to embed queries.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to be executed locally. It is kept to preserve behavior, but it
# does not look necessary for this model -- confirm and drop it if so.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5",
                                   trust_remote_code=True)

# Cross-encoder re-ranker: keeps only the 2 best-scoring retrieved nodes.
rerank = SentenceTransformerRerank(model="BAAI/bge-reranker-base", top_n=2)

In [8]:
# Register both models on the global LlamaIndex Settings object; the cells
# below never pass an LLM or embedding model explicitly and rely on these.
Settings.llm = llm
Settings.embed_model = embed_model

## Read the documents

In [9]:
# Folder that holds the source documents for the knowledge base.
input_dir_path = './docs/paul_graham'

# Walk the folder recursively and pick up only plain-text (.txt) files.
loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    recursive=True,
    required_exts=[".txt"],
)

# Read every matching file into a list of document objects.
docs = loader.load_data()

## Set up the Qdrant vector database

In [11]:
# Name of the Qdrant collection that stores the embedded document chunks.
collection_name = "document_chat"

# Connect to the Qdrant server expected on localhost:6333.
client = qdrant_client.QdrantClient(host="localhost", port=6333)

# The Qdrant server persists data across kernel restarts, so calling
# `from_documents` on every run would insert the same chunks again and
# retrieval would start returning duplicates. Check for existing data first.
# This is done before the vector store is constructed, in case constructing
# it touches the collection.
already_indexed = (
    client.collection_exists(collection_name)
    and client.count(collection_name=collection_name).count > 0
)

vector_store = QdrantVectorStore(client=client,
                                 collection_name=collection_name)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

if already_indexed:
    # Reuse the vectors that are already stored instead of re-embedding.
    # Delete the collection in Qdrant to force a rebuild after the documents
    # or the embedding model change.
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
else:
    # First run: chunk, embed and upload the documents.
    index = VectorStoreIndex.from_documents(docs,
                                            storage_context=storage_context)

## Define the query engine and prompt template

In [12]:
# Retrieve the 4 most similar chunks, then let the re-ranker trim them down
# before the answer is synthesized.
query_engine = index.as_query_engine(similarity_top_k=4,
                                     node_postprocessors=[rerank])

# Question-answering prompt. It is assembled from plain string pieces rather
# than an indented triple-quoted block so that source-code indentation does
# not end up inside the text sent to the model. `{context_str}` and
# `{query_str}` are the placeholders filled in at query time.
template = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information above I want you to think "
    "step by step to answer the query in a crisp manner. In case "
    "you don't know the answer say 'I don't know!'.\n"
    "\n"
    "Query: {query_str}\n"
    "\n"
    "Answer:"
)

qa_prompt_tmpl = PromptTemplate(template)

# Swap the default text-QA prompt of the response synthesizer for ours.
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

## Query the document

In [13]:
# The question is written as adjacent string literals so that it reaches the
# embedding model and the LLM as one clean sentence, without the line breaks
# and indentation an indented triple-quoted string would carry along.
question = (
    "How did the structure of funding startups in batches contribute to "
    "the success and growth of the Y Combinator program and the startups "
    "involved?"
)

# Run retrieval, re-ranking and answer synthesis for the question.
response = query_engine.query(question)

In [14]:
# Convert the response to text and render it as Markdown so that lists and
# bold text in the answer are formatted in the notebook output.
answer_markdown = Markdown(str(response))
display(answer_markdown)

The structure of funding startups in batches contributed to the success and growth of the Y Combinator (YC) program and the startups involved in several ways:

1. **Intensive Support**: By funding multiple startups at once, YC was able to focus intensively on each startup for three months. This concentrated support helped founders refine their ideas, develop their products, and prepare for future funding rounds.

2. **Peer Learning**: Startups in the same batch could learn from each other, share experiences, and collaborate. This created a community where founders could provide feedback and support, fostering a collaborative environment that enhanced innovation.

3. **Efficient Resource Allocation**: The batch model allowed YC to efficiently allocate resources, including mentorship and expert talks, to a group of startups simultaneously, maximizing the impact of their support.

4. **Increased Deal Flow**: By creating a structured program that encouraged the formation of new startups, YC effectively increased the number of startups being founded, which contributed to a vibrant ecosystem and more opportunities for investment.

5. **Practice for Investors**: The batch approach provided YC partners with the opportunity to practice being investors, gaining experience and insights that would benefit both the firm and the startups.

6. **Visibility and Credibility**: The regular influx of startups in batches helped establish YC's reputation as a leading accelerator, attracting more applicants and increasing its visibility in the startup ecosystem.

Overall, the batch model not only streamlined the investment process but also created a supportive environment that significantly enhanced the chances of success for the startups involved.

## Generate the dataset for evaluation

### Load the knowledge base

In [20]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Build the evaluation knowledge base from the same corpus the index was
# built from: reuse `input_dir_path` (defined in the "Read the documents"
# cell) instead of repeating the path, and restrict the loader to .txt files
# so both pipelines see the same set of files.
loader = DirectoryLoader(input_dir_path, glob="**/*.txt")

# Split into chunks of at most 1024 characters with a 20-character overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=20)

# Load every matching file and split it into chunked documents.
documents = loader.load_and_split(text_splitter)