 # Install Numpy and nltk for extracting data from the website langchain needs it

In [5]:
!pip install numpy
!pip install nltk



In [7]:
import sys
import os
import torch
import textwrap
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAI
from langchain_pinecone import PineconeVectorStore


  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [8]:
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/galien/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/galien/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [17]:
URLS = [
    "https://www.ycombinator.com/",
    "https://www.ycombinator.com/companies",
    "https://www.ycombinator.com/jobs",
    "https://www.ycombinator.com/cofounder-matching",
    "https://www.ycombinator.com/library",
    "https://www.ycombinator.com/about",
    "https://www.ycombinator.com/internships",
    "https://www.ycombinator.com/contact",
    "https://www.ycombinator.com/demoday",
    "https://www.ycombinator.com/blog/startup-school",
    "https://www.ycombinator.com/companies/founders",
    "https://www.ycombinator.com/documents"
]

In [63]:
def load_data():
    loader = UnstructuredURLLoader(URLS)
    data = loader.load()
    return data


In [64]:
def  slip_documents(data):
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
    text_chunks = text_splitter.split_documents(data)
    return text_chunks


In [67]:

loaded_data = load_data()
text_chunks = slip_documents(loaded_data)

print(text_chunks)
print(len(text_chunks))
print(loaded_data)
print(len(loaded_data))


[Document(metadata={'source': 'https://www.ycombinator.com/'}, page_content='Y Combinator\nMake something people want.\nApply to YC\n5,000\nfunded startups\n$800B\ncombined valuation\nGarry Tan and Sam Altman at a YC dinner\nTop YC companies\nStripe logo\nAirbnb logo\nInstacart logo\nDoorDash logo\nCruise logo\nTwitch logo\nCoinbase logo\nPagerDuty logo\nFaire logo\nBrex logo\nDeel logo\nRippling logo\nreddit logo\nGusto logo\nFlexport logo\nDropbox logo\nRazorpay logo\nScale AI logo\nGitLab logo\nBenchling logo\nFivetran logo\nRappi logo\nCheckr logo\nZapier logo\nWhatnot logo\nPodium logo\nWebflow logo\nZepto logo\nGroww logo\nSegment logo\nIronclad logo\nWe help foundersmake something people want and the results speak for themselves.\nWe help founders at their earliest stages regardless of their age.\nWe improve the success rate of our startups.\nWe give startups a huge fundraising advantage.\nOur companies have a track record of becoming billion dollar companies.\nOur formula for s

In [23]:
from dotenv import load_dotenv
load_dotenv()

True

In [28]:
import os
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY=os.environ.get('OPENAI_API_KEY')

embeddings = OpenAIEmbeddings()

question = "What is the purpose of the Y Combinator?"
embeddings.embed_query(question)

print(len(embeddings.embed_query(question)))

1536


In [33]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "yc-bot-db"


pc.create_index(
    name=index_name,
    dimension=1536, 
    metric="cosine", 
    spec=ServerlessSpec(
        cloud="aws", 
        region="us-east-1"
    ) 
) 

{
    "name": "yc-bot-db",
    "metric": "cosine",
    "host": "yc-bot-db-ov86ush.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}

In [34]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [36]:
# Load Existing index 
vector_store = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [38]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [39]:
retrieved_docs = retriever.invoke("when was the Y Combinator SAFE created?")

In [42]:
retrieved_docs

[Document(id='1cf0673f-bd80-4ffd-b305-6e7e5e9529a8', metadata={'source': 'https://www.ycombinator.com/documents'}, page_content='Needless to say, YC does not assume responsibility for the contents of, or the consequence of using, any version of the safe or any other document found on our website. Before using any of these forms, you should consult with a lawyer licensed in the country where your company was formed.\nFooter\nY Combinator\nY Combinator\nPrograms\nYC Program\nStartup School\nWork at a Startup\nCo-Founder Matching\nCompany\nYC Blog\nContact\nPress\nPeople\nCareers\nPrivacy Policy\nNotice at Collection\nSecurity\nTerms of Use\nResources\nStartup Directory\nStartup Library\nInvestors\nSAFE\nHacker News\nLaunch YC\nYC Deals\nMake something people want.\nApply\nTwitterFacebookInstagramLinkedInYoutube\n© 2025 Y Combinator'),
 Document(id='6dcc51ac-a8c1-4cf8-bc43-b6e32585f97c', metadata={'source': 'https://www.ycombinator.com/about'}, page_content='We invest $125,000 on a post-m

In [None]:
from langchain_openai import OpenAI
llm = OpenAI(temperature=0.4, max_tokens=500)

In [58]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are a helpful assistant that can answer questions about the Y Combinator."
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the"
    "the answer must be in markdown format"
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [59]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [60]:
response = rag_chain.invoke({"input": "who are the World-class founders are on co-founder matching?"})
result = response["answer"]


In [62]:
result

'\n\nSystem: The World-class founders on co-founder matching include Vrinda Haas, Bryant from Harvard, Saba, Curtis and Matthew from Whalesync, Philip and Mathias from AccessOwl, and many others who have successfully met their co-founders and received funding from Y Combinator through the platform.'