In [None]:
!pip install langchain langchain-openai openai pinecone-client tiktoken unstructured -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.9/457.9 kB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.7/274.7 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m86.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [None]:
import shutil

# Unpack the zipped essay collection into /content/.
shutil.unpack_archive("/content/PaulGraham.zip", "/content/", "zip")

In [None]:
from langchain_community.document_loaders import DirectoryLoader

# Read every file in the essays folder into Document objects
# (DirectoryLoader relies on the unstructured.io loader).
essay_loader = DirectoryLoader("/content/PaulGraham")
essays = essay_loader.load()

len(essays)

212

In [None]:
# Mean essay size in characters (one Document per essay file);
# floor division keeps the result an integer.
total_characters = sum(len(essay.page_content) for essay in essays)
total_characters // len(essays)

13631

In [None]:
# Size in characters of the longest essay (one Document per essay file).
max(len(essay.page_content) for essay in essays)

74890

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the essays into chunks that target an upper bound of 3500 characters.
# Separators are tried in order: paragraph breaks first, then single
# newlines, and finally spaces when no coarser split is available.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3500,
    separators=["\n\n", "\n", " "],
)

splitted_essays = text_splitter.split_documents(essays)

len(splitted_essays)

1060

In [None]:
from getpass import getpass

import os

# Use keys already exported in the environment when available so the notebook
# can be re-run without interaction; otherwise prompt without echoing the
# secret to the cell output.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("Enter OpenAI key: ")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Enter pinecone key: ")

Enter OpenAI key: ··········
Enter pinecone key: ··········


In [None]:
import pinecone
from langchain.vectorstores.pinecone import Pinecone
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding model, authenticated with the OpenAI key entered above.
embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Pinecone client created with 20 pool threads.
pc = pinecone.Pinecone(pool_threads=20, api_key=PINECONE_API_KEY)

# Handle to the index named "essays" (presumably created beforehand; nothing
# in this notebook creates it).
index = pc.Index("essays")

In [None]:
from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")


def batch_iterate(size: int, iterable: Iterable[T]) -> Iterator[List[T]]:
    """Yield successive lists of up to ``size`` items taken from ``iterable``.

    Parameters:
        size: Maximum number of items per batch; must be at least 1.
        iterable: Any iterable; it is consumed lazily, one batch at a time.

    Returns:
        An iterator of lists. Every list holds ``size`` items except possibly
        the last one, which holds the remainder. An empty iterable produces
        no batches.

    Raises:
        ValueError: If ``size`` is smaller than 1. A size of 0 would
            otherwise end the iteration immediately and silently drop every
            item.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    it = iter(iterable)
    # Two-argument iter(): keep calling the lambda until it returns the
    # sentinel [], i.e. until the underlying iterator is exhausted.
    return iter(lambda: list(islice(it, size)), [])

embedding_chunk_size = 1500  # number of texts passed to one embed_documents call
batch_size = 70  # number of vectors sent in one upsert request

texts = [chunk.page_content for chunk in splitted_essays]
# Build fresh metadata dicts that also carry the chunk text, so the text comes
# back with query results. Copying (instead of writing into chunk.metadata)
# leaves the Document objects in splitted_essays untouched.
metadata_list = [
    {**chunk.metadata, "text": chunk.page_content} for chunk in splitted_essays
]
ids = [str(i) for i in range(len(texts))]

for i in range(0, len(texts), embedding_chunk_size):
    chunk_texts = texts[i : i + embedding_chunk_size]
    chunk_ids = ids[i : i + embedding_chunk_size]
    chunk_metadatas = metadata_list[i : i + embedding_chunk_size]
    embeddings = embedding_model.embed_documents(chunk_texts)

    # Fire off all upsert requests for this slice without waiting on each one;
    # every vector is an (id, embedding, metadata) tuple.
    async_res = [
        index.upsert(
            vectors=batch,
            async_req=True,
        )
        for batch in batch_iterate(
            batch_size, zip(chunk_ids, embeddings, chunk_metadatas)
        )
    ]
    # Wait for every request to finish before embedding the next slice.
    for res in async_res:
        res.get()

In [None]:
def query_pinecone(query: str, top_k: int = 4) -> str:
    """
    Retrieve the stored passages most similar to ``query`` as a single string.

    The query is embedded with ``embedding_model`` and searched against the
    Pinecone ``index``; the ``"text"`` metadata of every match is then joined
    with blank lines.

    Parameters:
    query: The query string to be embedded and searched.
    top_k: The number of top results to return.

    Returns:
    The matched passages separated by blank lines, or an empty string when
    the Pinecone query fails (the error is printed, not raised).
    """
    embedded = embedding_model.embed_query(query)

    try:
        # Query the Pinecone index
        responses = index.query(
            vector=embedded,
            top_k=top_k,
            include_metadata=True,
        )
    except Exception as e:
        # Best-effort retrieval: report the failure and fall back to an empty
        # context. An empty *string* keeps the return type consistent with the
        # success path, so callers can always treat the result as text.
        print(f"Error during Pinecone query: {e}")
        return ""

    return "\n\n".join(match.metadata["text"] for match in responses["matches"])

'Written by Paul Graham\n\nFive Founders\n\nApril 2009\n\nInc recently asked me who I thought were the 5 most interesting startup founders of the last 30 years. How do you decide who\'s the most interesting? The best test seemed to be influence: who are the 5 who\'ve influenced me most? Who do I use as examples when I\'m talking to companies we fund? Who do I find myself quoting?1. Steve JobsI\'d guess Steve is the most influential founder not just for me but for most people you could ask. A lot of startup culture is Apple culture. He was the original young founder. And while the concept of "insanely great" already existed in the arts, it was a novel idea to introduce into a company in the 1980s. More remarkable still, he\'s stayed interesting for 30 years. People await new Apple products the way they\'d await new books by a popular novelist. Steve may not literally design them, but they wouldn\'t happen if he weren\'t CEO. Steve is clever and driven, but so are a lot of people in the 

In [None]:
from langchain.prompts import PromptTemplate

# Prompt template for retrieval-augmented answering: the retrieved essay
# passages fill {summaries} and the user's question fills {question}.
RETRIEVAL_TEMPLATE = """Given content from Paul grahams essays, answer the question accordingly, \
if you cannot answer using the information below just say "I don't know".

{summaries}

Question: {question}
Helpful Answer:"""

RETRIEVAL_PROMPT = PromptTemplate(
    input_variables=["summaries", "question"],
    template=RETRIEVAL_TEMPLATE,
)

In [None]:
# ChatOpenAI comes from langchain_openai (the same package the embeddings are
# imported from) instead of the deprecated langchain_community path, and
# LLMChain from its public location rather than via langchain.chains.summarize.
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain

# temperature=0 keeps the answers as repeatable as the model allows.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

# Chain that fills RETRIEVAL_PROMPT and sends it to the chat model.
chain = LLMChain(llm=llm, prompt=RETRIEVAL_PROMPT, verbose=False)

In [None]:
# Retrieve the five closest passages for the question, then let the chain
# answer from them.
question = "tell me the name of the five founders paul graham mentioned in his essays"
contents = query_pinecone(query=question, top_k=5)
answer = chain.run(summaries=contents, question=question)
print(answer)

The five founders mentioned by Paul Graham in his essays are:
1. Steve Jobs
2. TJ Rodgers
3. Larry & Sergey (co-founders of Google)
4. Paul Buchheit
5. Sam Altman
