# **Rag using Pinecone**

## **Installing the Dependicies**

In [2]:
!pip install langchain langchain_community pinecone langchain_pinecone

Collecting langchain_community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.11-py3-none-any.whl.metadata (6.1 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Downloading langchain_core-0.3.75-py3-none-any.whl.metadata (5.7 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl.metadata (28 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting langchain-tests<1.0.0,>

In [3]:
!pip install langchain-openai



In [4]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.0.0-py3-none-any.whl (310 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/310.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/310.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.0.0


### **API Keys**

In [None]:
PINECONE_API_KEY= "  "
OPENAI_API_KEY="  "


## **Create PineCone VectorStore**

### **1-Document List**

In [9]:
#loaders
from langchain_community.document_loaders import PyPDFLoader

def load_file(file_path):
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    return pages


In [8]:
file_path = "/content/National AI Policy Consultation Draft V1.pdf"
pages = load_file(file_path)

In [10]:
#Cleaning

def clean_data(pages):
    cleaned_data = []
    for page in pages:
        text = page.page_content
        cleaned_text = " ".join(text.split())
        page.page_content = cleaned_text
        cleaned_data.append(page)
    return cleaned_data

In [11]:
cleaned_data = clean_data(pages)

In [12]:
#Splitting

from langchain.docstore.document import Document
#Recursive character text splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter



re_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 800,
    chunk_overlap = 200
)


def splitted_data(cleaned_data):
    doc_list = []
    for page in cleaned_data:
        pg_split = re_splitter.split_text(page.page_content)

        for pg_sub_split in pg_split:
            metadata = {"source": "AI policy", "page_no": page.metadata["page"] + 1}

            doc_string = Document(page_content = pg_sub_split, metadata=metadata)
            doc_list.append(doc_string)
    return doc_list

In [13]:
doc_list = splitted_data(cleaned_data)

In [14]:
doc_list[5].page_content

'1 4.1 1st Pillar: AI Market Enablement ....................................................................................................1 4.1.1 National Artificial Intelligence Fund (NAIF) ................................................................... 1 4.1.2 Center of Excellence in AI & Allied Technologies (CoE-AI) ............................................ 2 4.1.3 Catalyzing Social Development through AI by National Initiatives ............................... 3 4.1.4 Data and Computational Infrastructure ........................................................................ 7 4.2 2nd Pillar: Enabling AI through Awareness & Readiness .................................................................7 4.2.1 Public Awareness of AI'

### **2-Creating Index**

In [15]:
import os
from pinecone import Pinecone
from pinecone import ServerlessSpec

# store API key in variable
PINECONE_API_KEY = PINECONE_API_KEY

# initialize pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# name of the index
index_name = "streamlit"

# check if index exists, if not create
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        vector_type="sparse",
        dimension=1536,          # dimension must match your embeddings model (e.g.  → 1536)
        metric="cosine",         # similarity metric
        spec=ServerlessSpec(
            cloud="aws",         # cloud provider
            region="us-east-1"   # region
        )
    )
    print(f"Index '{index_name}' created successfully!")
else:
    print(f"Index '{index_name}' already exists.")


Index 'streamlit' already exists.


### **3-Embed Model**

In [16]:
from langchain_openai import OpenAIEmbeddings
# define the embedding model variable
embed_model = "text-embedding-3-small"

# Initialize OpenAI embeddings model
embeddings = OpenAIEmbeddings(model=embed_model, openai_api_key=OPENAI_API_KEY)

### **4-Namespace**

In [17]:
# Name your desired namespace
my_namespace = "ai"

### **Final-- Creating a vectorstore**

In [18]:
import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["PINECONE_ENVIRONMENT"] = "us-east-1"

In [19]:
from langchain_pinecone import PineconeVectorStore
vectorstore = PineconeVectorStore.from_documents(
    doc_list, embeddings, index_name="streamlit"
)

### **Retriever**

In [20]:
# Convert the vector store into a retriever
retriever = vectorstore.as_retriever()

In [50]:
query = "what is ai for students"
docs = retriever.invoke(query)

In [51]:
docs

[Document(id='dbd4c6a0-2825-4845-a5ef-18ed864930be', metadata={'page_no': 38.0, 'source': 'AI policy'}, page_content='at a sectoral scale, half of the respondents are well versed with AI and related technologies, while the other half possess working knowledge. To teach an AI-based curriculum, the survey findings point to including short courses and 6 -12 month -long boot camps/diplomas urgently. Furthermore, bachelor’s/master’s level programs and Ph.D. degrees in AI are also worth considering. V. The significant adaptation challenges towards AI proliferation include digitization of information, digital access, literacy, and digital inclusion by design. Heaps of data are available in the public and private sectors, either difficult to access or not available appropriately (such as in hard copies). In many cases, it has been observed that data digitalization and basic process automation in a confined'),
 Document(id='ec985cc4-db5c-4ec1-a2f6-1c9510b31816', metadata={'page_no': 38.0, 'sour

In [25]:
for doc in docs:
    print(doc.page_content[:300])

after many patients, AI can also assist in monitoring the progress and alert care teams to patients whose progress shows cause for concern to take timely action. CoE-AI shall actively develop and support such initiatives. IV. AI can make it easier for patients to self-manage their conditions. For ex
after many patients, AI can also assist in monitoring the progress and alert care teams to patients whose progress shows cause for concern to take timely action. CoE-AI shall actively develop and support such initiatives. IV. AI can make it easier for patients to self-manage their conditions. For ex
after many patients, AI can also assist in monitoring the progress and alert care teams to patients whose progress shows cause for concern to take timely action. CoE-AI shall actively develop and support such initiatives. IV. AI can make it easier for patients to self-manage their conditions. For ex
after many patients, AI can also assist in monitoring the progress and alert care teams to patien

In [26]:
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [27]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain



In [28]:
!pip install openai
!pip install openai==0.28.0


Collecting openai==0.28.0
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.101.0
    Uninstalling openai-1.101.0:
      Successfully uninstalled openai-1.101.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.3.32 requires openai<2.0.0,>=1.99.9, but you have openai 0.28.0 which is incompatible.[0m[31m
[0mSuccessfully installed openai-0.28.0


In [29]:
!pip install -U langchain-openai
from langchain_openai import OpenAI


Collecting openai<2.0.0,>=1.99.9 (from langchain-openai)
  Downloading openai-1.102.0-py3-none-any.whl.metadata (29 kB)
Downloading openai-1.102.0-py3-none-any.whl (812 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.0/812.0 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.28.0
    Uninstalling openai-0.28.0:
      Successfully uninstalled openai-0.28.0
Successfully installed openai-1.102.0


In [30]:
import os

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["PINECONE_ENVIRONMENT"] = "us-east-1"

llm = OpenAI(model_name="gpt-4o-mini", temperature=0.5)
chain = load_qa_chain(llm, chain_type="stuff")

stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(llm, chain_type="stuff")


In [31]:
#define vectorstore
from langchain_pinecone import PineconeVectorStore
vectorstore = PineconeVectorStore.from_documents(
    doc_list, embeddings, index_name="streamlit"
)

In [32]:
#vectorstore as a reteriver
retriever = vectorstore.as_retriever()

In [84]:
from langchain_pinecone import PineconeVectorStore
# or from langchain.vectorstores import Pinecone

# For retrieval chains
from langchain.chains import RetrievalQA
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 20})


In [85]:
t = retriever.invoke("what is ai for students")

In [86]:
len(t)

20

In [87]:
t[0].page_content

'at a sectoral scale, half of the respondents are well versed with AI and related technologies, while the other half possess working knowledge. To teach an AI-based curriculum, the survey findings point to including short courses and 6 -12 month -long boot camps/diplomas urgently. Furthermore, bachelor’s/master’s level programs and Ph.D. degrees in AI are also worth considering. V. The significant adaptation challenges towards AI proliferation include digitization of information, digital access, literacy, and digital inclusion by design. Heaps of data are available in the public and private sectors, either difficult to access or not available appropriately (such as in hard copies). In many cases, it has been observed that data digitalization and basic process automation in a confined'

In [88]:
print(type(t))

<class 'list'>


In [90]:
from langchain_openai import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
import os
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY
os.environ['PINECONE_ENVIRONMENT'] = 'us-east-1'
llm = OpenAI(model_name="gpt-4o-mini", temperature=1.5)
docs = t
chain = load_qa_chain(llm, chain_type="stuff")
query = "What is the role of AI for students?"

response = chain.run(input_documents=docs, question=query)

print(response)


 AI plays numerous roles for students, primarily focusing on personalized learning, skill assessment, and enhancing self-management of educational paths. AI applications can provide personalized tutoring, facilitate adaptive learning, track student progress, and help with tailored assessments to better cater to unique learning needs, abilities and acknowledgments. Additionally, AI aids students in managing their health by providing timely reminders and guidance for self-care and maintaining good physical health. Coupled with educational technologies, AI potentially bridges gaps in the complexity of person-to_learning_system interactions, explores different student groups, while bolstering efficiency and relevancy in achieving informed adaptive skills development helping students be prepared for their extradocs. tape out sessions ahead mixing part scholarly relevance unit contrary busy relation testeclassabc bull gifting optim_parentygroupling engineered-study intelligno working empat-e