In [15]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv

import os

### Setting Up the Environment

In [16]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Pinecone credential; stays None when PINECONE_API_KEY is not set.
PINECONE_KEY = os.environ.get("PINECONE_API_KEY")

In [17]:
# Build the Pinecone client (the gRPC variant, per the import alias) with the key read above.
pc = Pinecone(api_key=PINECONE_KEY)

In [18]:
# Show the indexes currently visible to this client.
pc.list_indexes()

{'indexes': []}

### Create Indexes

In [39]:
# Create the demo index only when it is missing, so re-running this cell is safe.
index_name = "langchain"
if index_name not in pc.list_indexes().names():
    print("Creating index...")
    pc.create_index(
        name=index_name,  # reuse the variable so the check and the creation cannot drift apart
        dimension=1536,  # the random vectors built later in this notebook are 1536 long as well
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print("Creating Done!!")
else:
    print(f'The index: {index_name} is already exists')

Creating index...
Creating Done!!


Searching Indexes

In [20]:
# Open a handle to the "langchain" index and display its statistics.
index_name = "langchain"
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

Delete Indexes

In [37]:
# Drop the "langchain" index if it exists; otherwise just report that it is absent.
# Note: the cells further down call pc.Index("langchain"), so they need the index to exist.
index_name = "langchain"
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    print(f"The index: {index_name} isn't already exists")
else:
    print("Deleting index...")
    pc.delete_index(index_name)
    print("Deleting Done!!")

Deleting index...
Deleting Done!!


### Work with Vectors

Upsert Vectors

In [22]:
import random

ids = list("abcde")
# One random 1536-dimensional vector per id, so ids and vectors always line up.
vector = [[random.random() for _ in range(1536)] for _ in ids]

index_name = "langchain"

index = pc.Index(index_name)
# Pass a concrete list of (id, values) tuples rather than a one-shot zip iterator.
index.upsert(vectors=list(zip(ids, vector)))

upserted_count: 5

Fetch vector

In [23]:
# Read back two of the ids upserted above (no namespace given, so the default one is used).
# NOTE(review): the recorded result is empty — presumably the upsert was not visible yet; confirm.
index.fetch(ids=["c", "d"])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}

Delete vector

In [24]:
# Remove the vectors with ids "c" and "d" from the index.
index.delete(ids=["c", "d"])



In [25]:
# Fetch the same ids again to check that they are gone.
index.fetch(ids=["c", "d"])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}

Query vector

In [26]:
# Random 1536-value query vector; the slice only previews its first three values.
query = [random.random() for _ in range(1536)]
query[:3]

[0.5472030443138011, 0.08824136968375529, 0.17353503121948954]

In [27]:
# Ask the index for up to 5 matches for the random query vector.
index.query(
    vector=query,
    top_k=5,
)

{'matches': [], 'namespace': '', 'usage': {'read_units': 1}}

### Namespace

In [28]:
# Baseline statistics before any named namespace is written to.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

Upsert vectors into a namespace

In [29]:
import random

ids = list("xyz")
# Build exactly one random 1536-dimensional vector per id; a fixed count of 5
# would leave two vectors unused because zip stops at the shorter input.
vector = [[random.random() for _ in range(1536)] for _ in ids]

index_name = "langchain"

index = pc.Index(index_name)
# Store the (id, values) pairs under the "first-time" namespace.
index.upsert(vectors=list(zip(ids, vector)), namespace="first-time")

upserted_count: 3

In [30]:
# Check the per-namespace counts after writing to "first-time".
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}, 'first-time': {'vector_count': 0}},
 'total_vector_count': 0}

In [31]:
import random

ids = list("opq")
# Build exactly one random 1536-dimensional vector per id; a fixed count of 5
# would leave two vectors unused because zip stops at the shorter input.
vector = [[random.random() for _ in range(1536)] for _ in ids]

index_name = "langchain"

index = pc.Index(index_name)
# Store the (id, values) pairs under the "second-time" namespace.
index.upsert(vectors=list(zip(ids, vector)), namespace="second-time")

upserted_count: 3

In [32]:
# Check the per-namespace counts after writing to "second-time".
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3},
                'first-time': {'vector_count': 0},
                'second-time': {'vector_count': 0}},
 'total_vector_count': 3}

Delete vectors from a namespace

In [33]:
# No namespace is passed here, so this delete does not address the "second-time"
# namespace where id "o" was upserted; the author's note is that it does not work.
index.delete(ids=["o"])
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3},
                'first-time': {'vector_count': 0},
                'second-time': {'vector_count': 0}},
 'total_vector_count': 3}

In [34]:
# Delete id "o" from the "second-time" namespace, where it was upserted, then show the stats.
index.delete(ids=["o"], namespace="second-time")
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3},
                'first-time': {'vector_count': 3},
                'second-time': {'vector_count': 0}},
 'total_vector_count': 6}

Delete all vectors in a namespace

In [35]:
# Wipe every vector in the "first-time" namespace, then show the stats again.
index.delete(delete_all=True, namespace="first-time")
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3}, 'second-time': {'vector_count': 0}},
 'total_vector_count': 3}

# Build a similarity search

Clear the old index

In [130]:
# Delete every existing index so the similarity-search demo starts from a clean slate.
index_names = list(pc.list_indexes().names())
if not index_names:
    # The check must sit outside the loop: with no indexes the loop body never runs.
    print("Don't have any indexes")
for name in index_names:
    print(f"Deleting index: {name}")
    pc.delete_index(name=name)
    print("Done!!")

Deleting index: medium
Done!!


Read the PDF

In [109]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters.nltk import NLTKTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [110]:
# Point the loader at the saved article PDF and keep what it loads in `pages`.
loader = PyPDFLoader(
    "source/Python libraries take the Billion Row Challenge _ by Thomas Reid _ Level Up Coding.pdf"
)
pages = loader.load()

In [152]:
def convert_data(pages: list) -> str:
    """Split every page's text with NLTK and merge the pieces into one string.

    Args:
        pages: Loaded page objects; only their ``page_content`` text is read.

    Returns:
        All pieces from all pages, in page order, joined with newlines.
        An empty ``pages`` list yields an empty string.
    """
    if not pages:
        # Nothing to split, so skip building the splitter altogether.
        return ""

    # Build the splitter once rather than once per page.
    text_splitter = NLTKTextSplitter()
    concat_pages = [
        piece
        for page in pages
        for piece in text_splitter.split_text(page.page_content)
    ]
    return "\n".join(concat_pages)

In [112]:
# Splitter configured for a chunk size of 200 with an overlap of 20
# (presumably measured in characters — confirm against the splitter's docs).
# NOTE: the recorded run warns that many chunks exceed 200, so the size is not a hard cap.
text_splitter = NLTKTextSplitter(
    chunk_size=200,
    chunk_overlap=20
)

In [113]:
# Flatten the PDF pages into one string, then cut that string into chunk documents.
documents = convert_data(pages)
chucks = text_splitter.create_documents([documents])

Created a chunk of size 333, which is longer than the specified 200
Created a chunk of size 267, which is longer than the specified 200
Created a chunk of size 269, which is longer than the specified 200
Created a chunk of size 300, which is longer than the specified 200
Created a chunk of size 222, which is longer than the specified 200
Created a chunk of size 669, which is longer than the specified 200
Created a chunk of size 291, which is longer than the specified 200
Created a chunk of size 257, which is longer than the specified 200
Created a chunk of size 478, which is longer than the specified 200
Created a chunk of size 1115, which is longer than the specified 200
Created a chunk of size 1391, which is longer than the specified 200
Created a chunk of size 280, which is longer than the specified 200
Created a chunk of size 1174, which is longer than the specified 200
Created a chunk of size 243, which is longer than the specified 200
Created a chunk of size 207, which is longer 

Embedding Document

In [115]:
from langchain_community.embeddings import OllamaEmbeddings

In [116]:
# Embedding client backed by Ollama, configured with the "llama3" model.
embeddings = OllamaEmbeddings(model="llama3")

In [121]:
# embed_documents takes plain strings, so pass each chunk's text instead of the
# Document objects themselves (otherwise their string representation gets embedded).
embedding_chucks = embeddings.embed_documents(
    [chunk.page_content for chunk in chucks]
)

Insert Document to Pinecone

In [124]:
import pinecone
# NOTE: this rebinds the name `Pinecone`, which the first cell bound to PineconeGRPC;
# from this cell on, `Pinecone` refers to the LangChain vector store class.
from langchain_community.vectorstores import Pinecone

# No api_key is passed here; presumably the client reads PINECONE_API_KEY from the environment — confirm.
pc = pinecone.Pinecone()

In [131]:
# Create the index in Pinecone
index_name = "medium"
if index_name not in pc.list_indexes().names():
    print("Creating index...")
    pc.create_index(
        name=index_name,
        dimension=4096,
        metric="cosine",
        spec=pinecone.PodSpec(
            environment="gcp-starter"
        )
    )
    print("Creating Done!!")
else:
    print(f'The index: {index_name} is already exists')

Creating index...
Creating Done!!


In [132]:
# Insert the document
vector_store = Pinecone.from_documents(
    documents=chucks,
    embedding=embeddings,
    index_name=index_name
)

In [159]:
# Attach to the already-populated index without inserting anything new.
vector_store_retrieved = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

Asking Questions (Similarity Search)

In [153]:
# Retrieve the 2 stored chunks most similar to the query sentence.
query = "I’ve also previously written about Polars"
response = vector_store.similarity_search(query, k=2)

In [158]:
# Print the text of each retrieved chunk, followed by a dashed divider line.
divider = "-" * 50
for doc in response:
    print(doc.page_content)
    print(divider)

In a bold move that accelerates the pace of
artificial intelligence development, Modular…
Apr 3094 6.6K 124
Hayk Simonyan Level Up Coding
3.3K 20
Thomas Reid AI Advances
326 1
Vishal Rajput AIGuys
3.4K 33
Elshad Karimov Stackademic
305 26/25/24, 2:48 PM Python libraries take the Billion Row Challenge | by Thomas Reid | Level Up Coding
https://medium.com/gitconnected/python-libraries-take-the-billion-row-challenge-7c108611a9ad 11/12
Coding & Development
11 stories·666 savesPredictive Modeling w/
Python
20 stories·1319 saves
Practical Guides to Machine
Learning
10 stories·1586 savesChatGPT
21 stories·688 saves
See more recomme ndations
How DuckDB can be up to 1000x
more efficient than BigQuery?
--------------------------------------------------
Open in app
Search Write
6/25/24, 2:48 PM Python libraries take the Billion Row Challenge | by Thomas Reid | Level Up Coding
https://medium.com/gitconnected/python-libraries-take-the-billion-row-challenge-7c108611a9ad 1/12
Update Jan 4: Wow, this 

In [160]:
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain_community.chat_models.ollama import ChatOllama

# Chat model used to phrase the answer.
# NOTE(review): temperature=1 is presumably on the high side for factual Q&A — confirm it is intended.
llm = ChatOllama(model="llama3", temperature=1)
# Retriever over the existing index that returns the 3 most similar chunks per question.
retriever = vector_store_retrieved.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Question-answering chain combining the retriever and the chat model ("stuff" chain type).
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever
)

In [193]:
# Alternative question kept for reference:
# query = "What is the tools that is fastest processing the data?"
# Run the question through the retrieval QA chain; the result is stored in `response`.
query = "What the results of processing the data?"
response = chain.invoke(query)

In [194]:
# The chain's response is indexed by key; "result" holds the generated answer text.
print(response["result"])

According to the text, the results were:

* DuckDB: 0 minutes 23 seconds
* Datatable: 2 minutes 07 seconds
* Polars: 2 minutes 30 seconds
* Modin: 2 minutes 47 seconds
* Pandas: 7 minutes 18 seconds
* Vaex: 7 minutes 59 seconds
