<a href="https://colab.research.google.com/github/JapiKredi/PineconeVectorDB/blob/main/Pinecone_DB_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# VectorDB Pinecone

# Pinecone: https://www.pinecone.io/

In [10]:
!pip install langchain
!pip install pinecone-client==2.2.4
!pip install pypdf



In [11]:
!pip install sentence-transformers==2.2.2



In [3]:
!mkdir pdfs

## Extract the text from the PDFs

In [12]:
from langchain.document_loaders import PyPDFDirectoryLoader

In [17]:
# Loader pointed at the local "pdfs" directory created earlier; upload the
# PDF files there before running this cell.
loader = PyPDFDirectoryLoader("pdfs")

In [18]:
# Read the PDFs. The result is a list of Document objects (one per page in the
# output shown below), each carrying `source` and `page` metadata.
data = loader.load()

In [19]:
# Display the loaded Document objects to check the extracted text.
data

[Document(page_content='CS391R: Robot Learning (Fall 2021)\nYou Only Look Once (YOLO): Unified, Real-Time Object Detection\n1Presenter: Shivang SinghSept 2nd, 2021', metadata={'source': 'pdfs/yolo.pdf', 'page': 0}),
 Document(page_content='CS391R: Robot Learning (Fall 2021)2Problem Addressed: Object Detection❖Object detection is the problem of both locating ANDclassifying objects ❖Goal of YOLO algorithm is to do object detection both fast ANDwith high accuracy\n“Deep Learning for Vision Systems” (Elgendy)Object Detection vs Classification', metadata={'source': 'pdfs/yolo.pdf', 'page': 1}),
 Document(page_content='CS391R: Robot Learning (Fall 2021)3Importance of Object Detection for Robotics❖Visual modality is very powerful❖Humans are able to detect objects and do perception using just this modality in real time (not needing radar) ❖If we want responsive robot systems that work in real time (without specialized sensors) almost real time vision based object detection can help greatly\nVi

## Chunking the text

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [21]:
# Break the loaded pages into chunks of at most 500 characters, with
# 20 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
)

# split_documents returns Document objects (see the sample chunk shown below).
text_chunks = text_splitter.split_documents(data)

In [22]:
# Number of chunks produced by the splitter.
len(text_chunks)

29

In [24]:
# Inspect a single chunk (index 1) to check its text and metadata.
text_chunks[1]

Document(page_content='CS391R: Robot Learning (Fall 2021)2Problem Addressed: Object Detection❖Object detection is the problem of both locating ANDclassifying objects ❖Goal of YOLO algorithm is to do object detection both fast ANDwith high accuracy\n“Deep Learning for Vision Systems” (Elgendy)Object Detection vs Classification', metadata={'source': 'pdfs/yolo.pdf', 'page': 1})

## Embeddings

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings

In [25]:
# Embedding model used for both the document chunks and the queries.
# The model files are downloaded from the Hugging Face Hub on first use.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [27]:
# Sanity check: embed a short sample string; the result is a list of floats.
query_result = embeddings.embed_query('Hello World')

In [28]:
# Print the raw embedding vector for the sample string.
print(query_result)

[-0.034477271139621735, 0.0310231801122427, 0.006734997965395451, 0.026108959689736366, -0.03936203569173813, -0.16030244529247284, 0.06692398339509964, -0.006441446021199226, -0.0474504791200161, 0.014758843928575516, 0.07087528705596924, 0.055527616292238235, 0.01919332519173622, -0.026251347735524178, -0.01010959129780531, -0.026940450072288513, 0.02230745181441307, -0.022226683795452118, -0.1496926099061966, -0.01749301515519619, 0.007676273118704557, 0.0543522834777832, 0.0032544205896556377, 0.0317259207367897, -0.08462149649858475, -0.029405983164906502, 0.051595594733953476, 0.04812406376004219, -0.0033148264046758413, -0.058279186487197876, 0.04196928068995476, 0.02221069484949112, 0.128188818693161, -0.02233893796801567, -0.011656254529953003, 0.06292837858200073, -0.03287634626030922, -0.09122609347105026, -0.03117532841861248, 0.05269954726099968, 0.04703483358025551, -0.0842030718922615, -0.030056199058890343, -0.02074482850730419, 0.009517843835055828, -0.0037217836361378

## Initialize Pinecone

In [57]:
import os
from google.colab import userdata

# Resolve the Pinecone credentials without ever embedding them in the notebook.
# An explicit environment variable wins; otherwise the key is read from the
# Colab secret named 'PINECONE_API_KEY'.
# NOTE(review): an earlier revision of this cell committed a literal API key as
# the fallback default. Treat that key as leaked and rotate it in the Pinecone
# console.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY') or userdata.get('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    # Fail here with a clear message instead of letting a later Pinecone call
    # fail with an opaque authentication error.
    raise RuntimeError(
        "PINECONE_API_KEY is not set: add it as a Colab secret or as an environment variable."
    )

# The environment name is not a secret, so a default is acceptable here.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

In [58]:
import pinecone

# Initialize the Pinecone client with the credentials resolved in the previous
# cell. The API key is listed at app.pinecone.io; the environment name is shown
# next to it in the console.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Put in the name of your Pinecone index here.
index_name = "test"

In [59]:
from langchain.vectorstores import Pinecone

In [60]:
import hashlib

# Embed every chunk and upsert it into the Pinecone index.
#
# Each vector gets a deterministic id derived from the chunk's position and
# text, so re-running this cell overwrites the same vectors. Without explicit
# ids, every run adds another copy of each chunk and similarity search starts
# returning duplicate hits. Copies already stored under other ids are not
# removed by this cell.
chunk_texts = [t.page_content for t in text_chunks]
chunk_ids = [
    # The position is part of the hash so two chunks with identical text still
    # get distinct ids within one upload.
    hashlib.sha256(f"{position}:{text}".encode("utf-8")).hexdigest()
    for position, text in enumerate(chunk_texts)
]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, ids=chunk_ids, index_name=index_name)

## If you already have an index, you can load it like this

In [33]:
# Alternative to the upload cell: connect to an index that already holds the
# vectors, using the same embedding model for queries.
docsearch = Pinecone.from_existing_index(index_name, embeddings)
# Display the vector store object to confirm it was created.
docsearch

<langchain_community.vectorstores.pinecone.Pinecone at 0x7ee8b3d64670>

## Similarity Search

In [34]:
# Question used for the similarity-search example.
query = "What is yolo?"

In [35]:
# Retrieve the 3 stored chunks most similar to the query.
docs = docsearch.similarity_search(query, k=3)

In [36]:
# Show the retrieved chunks.
docs

[Document(page_content='image each time leading to less false positives (has contextual information for detection) YOLO algorithm'),
 Document(page_content='image each time leading to less false positives (has contextual information for detection) YOLO algorithm'),
 Document(page_content='CS391R: Robot Learning (Fall 2021)20Discussion of Results❖Pro: YOLO is a lot faster than the other algorithms for image detection❖Pro: YOLO’s use of global information rather than only local information allows it to understand contextual information when doing object detection➢Does better in domains such as artwork due to this❖Con: YOLO lagged behind the SOTA models in object detection➢This is attributed to making many localization errors and unable to detect small object')]

In [38]:
import os

In [39]:
!pip install openai

Collecting openai
  Downloading openai-1.13.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.4/227.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.4 ht

In [41]:
from google.colab import userdata

# Read the OpenAI key from the Colab secret named 'openai_key' and export it
# through the process environment. The LLM is created later without an
# explicit key, so it presumably picks the key up from this variable.
OPENAI_API_KEY = userdata.get('openai_key')
os.environ.update(OPENAI_API_KEY=OPENAI_API_KEY)

In [42]:
from langchain.llms import OpenAI

In [43]:
# LLM wrapper created with default settings; no API key is passed explicitly.
llm = OpenAI()

  warn_deprecated(


In [44]:
from langchain.chains import RetrievalQA

In [45]:
# Question-answering chain: docsearch supplies the retrieved chunks and llm
# produces the answer.
# NOTE(review): chain_type="stuff" presumably places all retrieved chunks into
# a single prompt; confirm against the LangChain documentation.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

In [46]:
# Question to ask the QA chain (same text as the similarity-search query).
query = "What is yolo?"

In [47]:
# Ask the chain and print only the answer text.
# `invoke` replaces the deprecated `run` (the recorded output of this cell
# shows the deprecation warning). RetrievalQA takes the question under the
# "query" key and returns the answer under "result".
print(qa.invoke({"query": query})["result"])

  warn_deprecated(


 YOLO (You Only Look Once) is an algorithm that is used for object detection in images. It is known for its speed and ability to use global information for understanding contextual information in object detection, making it effective in domains such as artwork. However, it has been found to lag behind state-of-the-art models due to localization errors and difficulty detecting small objects.
