## Import libraries

In [5]:
import nest_asyncio
import qdrant_client

from llama_index.core import Settings
from llama_index.core import PromptTemplate
from llama_index.llms.openai import OpenAI
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.vector_stores.qdrant import QdrantVectorStore

from IPython.display import Markdown, display
from dotenv import load_dotenv
import os

## Set up asyncio

In [6]:
import nest_asyncio

# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel already runs
# an event loop while the libraries used below issue their own async calls —
# confirm it is still needed.
nest_asyncio.apply()

## Define LLM, Embedding model and re-ranker model

In [7]:
# Load variables from a local .env file (if present) into the process
# environment. `load_dotenv` is imported at the top of the notebook but was
# never called, so a key stored only in .env was not visible to os.getenv.
load_dotenv()

# LLM that the query engine uses to synthesize answers.
llm = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    model="gpt-4o-mini",
    request_timeout=120,
)

# Embedding model used for the documents and the queries.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally — confirm this model actually requires it.
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-large-en-v1.5",
    trust_remote_code=True,
)

# Re-ranker post-processor; keeps the top 2 nodes (top_n=2).
rerank = SentenceTransformerRerank(model="BAAI/bge-reranker-base", top_n=2)

In [8]:
# Register the models on LlamaIndex's shared Settings object so later cells
# do not have to pass them explicitly.
Settings.llm = llm
Settings.embed_model = embed_model

## Read the documents

In [9]:
# Directory that holds the source documents.
input_dir_path = './docs/paul_graham'

# Only .txt files are read; sub-directories are searched as well.
loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    recursive=True,
    required_exts=[".txt"],
)
docs = loader.load_data()

## Set up the Qdrant vector database

In [11]:
# Connect to the Qdrant server on localhost:6333 and use the
# "document_chat" collection as the vector store behind the index.
client = qdrant_client.QdrantClient(host="localhost", port=6333)
vector_store = QdrantVectorStore(
    collection_name="document_chat",
    client=client,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the loaded documents on top of that storage context.
# NOTE(review): this runs on every execution of the cell; re-running against
# an existing collection may ingest the same documents again — confirm.
index = VectorStoreIndex.from_documents(
    docs,
    storage_context=storage_context,
)

## Define the query engine and prompt template

In [12]:
# Retrieve 4 candidate nodes per query, then let the re-ranker post-process
# them before the answer is synthesized.
query_engine = index.as_query_engine(
    similarity_top_k=4,
    node_postprocessors=[rerank],
)

# Question-answering prompt. The text is kept flush-left on purpose: anything
# inside the triple-quoted string, including source-code indentation, becomes
# part of the prompt sent to the LLM.
template = """Context information is below.
---------------------
{context_str}
---------------------
Given the context information above I want you to think
step by step to answer the query in a crisp manner. In case
you don't know the answer say 'I don't know!'.

Query: {query_str}

Answer:"""

qa_prompt_tmpl = PromptTemplate(template)

# Replace the response synthesizer's text QA prompt with the custom template.
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

## Query the document

In [13]:
# Adjacent string literals are concatenated into one clean line, so no line
# breaks or source indentation end up inside the question that is embedded
# for retrieval and sent to the LLM.
response = query_engine.query(
    "How did the structure of funding startups in batches contribute to "
    "the success and growth of the Y Combinator program and the startups "
    "involved?"
)

In [14]:
# Render the string form of the response as Markdown.
answer_markdown = Markdown(str(response))
display(answer_markdown)

The structure of funding startups in batches contributed to the success and growth of the Y Combinator (YC) program and the startups involved in several ways:

1. **Intensive Support**: By funding multiple startups at once, YC was able to focus intensively on each startup for three months. This concentrated support helped founders refine their ideas, develop their products, and prepare for future funding rounds.

2. **Peer Learning**: Startups in the same batch could learn from each other, share experiences, and collaborate. This created a community where founders could provide feedback and support, fostering a collaborative environment that enhanced innovation.

3. **Efficient Resource Allocation**: The batch model allowed YC to efficiently allocate resources, including mentorship and expert talks, to a group of startups simultaneously, maximizing the impact of their support.

4. **Increased Deal Flow**: By creating a structured program that encouraged the formation of new startups, YC effectively increased the number of startups being founded, which contributed to a vibrant ecosystem and more opportunities for investment.

5. **Practice for Investors**: The batch approach provided YC partners with the opportunity to practice being investors, gaining experience and insights that would benefit both the firm and the startups.

6. **Visibility and Credibility**: The regular influx of startups in batches helped establish YC's reputation as a leading accelerator, attracting more applicants and increasing its visibility in the startup ecosystem.

Overall, the batch model not only streamlined the investment process but also created a supportive environment that significantly enhanced the chances of success for the startups involved.

## Generate the dataset for evaluation

### Load the knowledge base

In [20]:
# DirectoryLoader is imported from langchain_community (already used elsewhere
# in this notebook) instead of the deprecated `langchain.document_loaders` path.
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Restrict loading to .txt files so the evaluation set is generated from the
# same files that SimpleDirectoryReader indexed (required_exts=[".txt"]).
loader = DirectoryLoader("./docs/paul_graham/", glob="**/*.txt")

# Split with chunk_size=1024 and chunk_overlap=20.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=20)

documents = loader.load_and_split(text_splitter)

In [23]:
# Inspect the first chunk to check its metadata and text.
first_chunk = documents[0]
first_chunk.to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'metadata': {'source': 'docs/paul_graham/what_i_worked_on.txt'},
  'page_content': 'What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer,

## Set up the models

In [24]:
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings

In [25]:
# Embedding model served by Ollama, passed to the test-set generator below.
ollama_emb = OllamaEmbeddings(model="nomic-embed-text")

# Two Ollama-served LLMs: one handed to Ragas as the generator, one as the critic.
generator_llm = Ollama(model="phi3:3.8b")
critic_llm = Ollama(model="llama3.2:1b")

  generator_llm = Ollama(model="phi3:3.8b")
  ollama_emb = OllamaEmbeddings(


## Create Ragas' TestsetGenerator

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# Wire the LangChain models defined earlier into the Ragas generator.
generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=ollama_emb,
)

# Share of each evolution type in the generated test set.
distribution = {
    simple: 0.5,
    reasoning: 0.25,
    multi_context: 0.25,
}

# Request 10 test samples; raise_exceptions=False is passed through to Ragas.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=10,
    distributions=distribution,
    raise_exceptions=False,
)

# Convert to a DataFrame and drop rows that contain missing values.
testset_frame = testset.to_pandas()
test_df = testset_frame.dropna()

In [27]:
# Load the test set from the CSV file and preview the first rows.
import pandas as pd

test_df = pd.read_csv(filepath_or_buffer="./docs/test_data_paul_graham.csv")
test_df.head(5)

Unnamed: 0.1,Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,0,How did the shift to publishing on the web cha...,"[""Wow, I thought, there's an audience. If I wr...",The shift to publishing on the web changed the...,simple,[{'source': 'paul_graham/what_i_worked_on.txt'...,True
1,1,"How does criticizing a project as a ""toy"" rese...","[""[9] You can't usually get paid for doing exa...",Criticizing a project as a 'toy' is similar to...,simple,[{'source': 'paul_graham/how_to_do_great_thing...,True
2,2,How did the structure of funding startups in b...,['The deal for startups was based on a combina...,Funding startups in batches allowed for conven...,simple,[{'source': 'paul_graham/what_i_worked_on.txt'...,True
3,3,How can exploring different topics help in gen...,"[""Talking or writing about the things you're i...",Exploring different topics can help in generat...,simple,[{'source': 'paul_graham/how_to_do_great_thing...,True
4,4,How does focusing consistently on something yo...,"[""The way to beat it is to stop occasionally a...",Great work happens by focusing consistently on...,simple,[{'source': 'paul_graham/how_to_do_great_thing...,True


## Evaluate the RAG pipeline

In [28]:
def generate_response(query_engine, question):
    """Run one question through the query engine and package the result.

    Args:
        query_engine: Object exposing ``query(question)``. The value it
            returns must provide a ``response`` attribute and an iterable
            ``source_nodes`` whose items have ``node.get_content()``.
        question: The query forwarded to ``query_engine.query``.

    Returns:
        A dict with two keys: ``"answer"`` holds the ``response`` attribute
        of the query result, ``"contexts"`` holds the content of every source
        node, in the order the engine returned them.
    """
    result = query_engine.query(question)

    retrieved_texts = []
    for scored_node in result.source_nodes:
        retrieved_texts.append(scored_node.node.get_content())

    return {"answer": result.response, "contexts": retrieved_texts}

In [29]:
from datasets import Dataset
from tqdm.auto import tqdm

# Questions taken from the loaded test set.
test_questions = test_df["question"].values

# Ask the RAG pipeline every test question; tqdm shows progress.
responses = [
    generate_response(query_engine, current_question)
    for current_question in tqdm(test_questions)
]

# Column-oriented data: generated answers and retrieved contexts sit next to
# the questions and ground truths from the test set.
dataset_dict = dict(
    question=test_questions,
    answer=[item["answer"] for item in responses],
    contexts=[item["contexts"] for item in responses],
    ground_truth=test_df["ground_truth"].values.tolist(),
)

ragas_eval_dataset = Dataset.from_dict(dataset_dict)

100%|██████████| 47/47 [05:07<00:00,  6.55s/it]


In [30]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_correctness,
    context_recall,
    context_precision,
)


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness

For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._context_entities_recall import (


In [31]:
# Judge LLM and embedding model handed to Ragas for scoring.
critic_llm = Ollama(model="llama3.2:1b")
ollama_emb = OllamaEmbeddings(model="nomic-embed-text")

# Metrics to compute for every sample in the evaluation dataset.
metrics = [
    faithfulness,
    answer_correctness,
    context_recall,
    context_precision,
]

# NOTE(review): the recorded run of this cell logged repeated
# "Failed to parse output. Returning None." messages and was interrupted
# before finishing, so `evaluation_result` may be undefined afterwards.
evaluation_result = evaluate(
    dataset=ragas_eval_dataset,
    metrics=metrics,
    llm=critic_llm,
    embeddings=ollama_emb,
)

Evaluating:   3%|▎         | 5/188 [48:35<36:44:37, 722.83s/it]Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.
Evaluating:   3%|▎         | 6/188 [49:57<25:31:36, 504.93s/it]Failed to parse output. Returning None.
Evaluating:   5%|▍         | 9/188 [49:57<10:13:17, 205.57s/it]Failed to parse output. Returning None.
Evaluating:   5%|▌         | 10/188 [1:45:16<44:03:48, 891.17s/it]Failed to parse output. Returning None.
Evaluating:   6%|▌         | 11/188 [2:07:18<48:53:12, 994.31s/it]Failed to parse output. Returning None.
Evaluating:   6%|▋         | 12/188 [2:11:24<39:17:00, 803.53s/it]Failed to parse output. Returning None.
Evaluating:   7%|▋         | 13/188 [2:13:13<30:03:26, 618.33s/it]Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Ret

KeyboardInterrupt: 

In [None]:
# Put the scores of the evaluation result into a DataFrame and write them to
# CSV without the index column.
scores_path = "./docs/evaluation_scores_paul_graham.csv"
eval_scores_df = pd.DataFrame(evaluation_result.scores)
eval_scores_df.to_csv(scores_path, index=False)