In [None]:
# 1. Set up your environment

In [None]:
!pip install -qU \
  pinecone-client[grpc]==3.0.0 \
  pinecone-datasets==0.7.0 \
  langchain-pinecone==0.0.3 \
  langchain-openai==0.0.7 \
  langchain==0.1.9


In [None]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process
# environment before the keys are read.
load_dotenv()

# Each value is None when the corresponding variable is not set.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')


In [None]:
# 2. Build the knowledge base

In [None]:
# Optional: uncomment the two lines below to browse the catalog of available datasets.
# from pinecone_datasets import list_datasets

# list_datasets(as_df=True)

In [None]:
import pinecone_datasets

# Fetch the pre-embedded dataset, then display how many records it holds.
dataset = pinecone_datasets.load_dataset(
    'wikipedia-simple-text-embedding-ada-002-100K'
)
len(dataset)

# Response:
# 100000


In [None]:
# Reduce the dataset and format it for upserting into Pinecone.
#
# dataset.documents is modified in place, so every step is written to be safe
# to re-run: executing this cell a second time must not throw away data.

# Replace the original 'metadata' column with the 'blob' column, which is what
# gets upserted as each vector's metadata. Once renamed, 'blob' no longer
# exists; without this guard a re-run would drop the renamed column.
if 'blob' in dataset.documents.columns:
    dataset.documents.drop(['metadata'], axis=1, inplace=True)
    dataset.documents.rename(columns={'blob': 'metadata'}, inplace=True)

# keep only the first 30_000 rows of the dataset (a no-op on re-run)
dataset.documents.drop(dataset.documents.index[30_000:], inplace=True)


In [None]:
# 3. Index the data in Pinecone

In [None]:
# Decide whether to use a serverless or pod-based index.
use_serverless = True

import os
import time

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec, PodSpec

# Upper bound (seconds) on how long to wait for the new index to become ready.
INDEX_READY_TIMEOUT_S = 300

# configure client
pc = Pinecone(api_key=pinecone_api_key)
if use_serverless:
    spec = ServerlessSpec(cloud='aws', region='us-east-1')
else:
    # PodSpec requires an environment; a bare PodSpec() raises TypeError.
    # 'gcp-starter' is the starter environment. For any other environment set
    # PINECONE_ENVIRONMENT, and consider specifying a pod_type too.
    spec = PodSpec(
        environment=os.environ.get('PINECONE_ENVIRONMENT', 'gcp-starter')
    )

# check for and delete index if already exists (destructive: any vectors
# stored under this name are lost)
index_name = 'langchain-retrieval-augmentation-fast'
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# create a new index
pc.create_index(
    index_name,
    dimension=1536,  # dimensionality of text-embedding-ada-002
    metric='dotproduct',
    spec=spec
)

# wait for index to be initialized, but never spin forever if it fails to start
deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() > deadline:
        raise TimeoutError(
            f"Index '{index_name}' was not ready after "
            f"{INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)


In [None]:
# Open a handle to the index named by index_name and display its statistics.
index = pc.Index(index_name)  
index.describe_index_stats()  

# Response:
# {'dimension': 1536,  
# 'index_fullness': 0.0,  
# 'namespaces': {},  
# 'total_vector_count': 0}  



In [None]:
# Upload the documents 100 at a time, then display the index statistics again.
for doc_batch in dataset.iter_documents(batch_size=100):
    index.upsert(doc_batch)

index.describe_index_stats()

# Response (only the first 30,000 rows of the dataset were kept and upserted):
# {'dimension': 1536,
# 'index_fullness': 0.0,
# 'namespaces': {'': {'vector_count': 30000}},
# 'total_vector_count': 30000}


In [None]:
# 4. Initialize a LangChain vector store
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorize queries; the OpenAI API key it needs can be
# obtained from platform.openai.com.
model_name = 'text-embedding-ada-002'
embeddings = OpenAIEmbeddings(model=model_name, openai_api_key=openai_api_key)



In [None]:
from langchain_pinecone import PineconeVectorStore

# Wrap the Pinecone index as a LangChain vector store; text_field is handed to
# the store as its third argument.
text_field = "text"
vectorstore = PineconeVectorStore(index, embeddings, text_field)

In [None]:
# Query the vector store directly, before any LLM is involved.
query = "who was Benito Mussolini?"
vectorstore.similarity_search(query, k=3)  # k=3: return the 3 most relevant docs

# Response:
# [Document(page_content='Benito Amilcare Andrea Mussolini KSMOM GCTE (29 July 1883 – 28 April 1945) was an Italian politician and journalist...', metadata={'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Benito%20Mussolini', 'title': 'Benito Mussolini', 'wiki-id': '6754'}),  
# Document(page_content='Fascism as practiced by Mussolini\nMussolini\'s form of Fascism, "Italian Fascism"- unlike Nazism, the racist ideology...', metadata={'chunk': 1.0, 'source': 'https://simple.wikipedia.org/wiki/Benito%20Mussolini', 'title': 'Benito Mussolini', 'wiki-id': '6754'}),  
# Document(page_content='Veneto was made part of Italy in 1866 after a war with Austria. Italian soldiers won Latium in 1870. That was when...', metadata={'chunk': 5.0, 'source': 'https://simple.wikipedia.org/wiki/Italy', 'title': 'Italy', 'wiki-id': '363'})]


In [None]:
# 5. Use Pinecone and LangChain for RAG

In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# completion llm
llm = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)
# question-answering chain that retrieves its context from the vector store
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)
# Chain.run() is deprecated since LangChain 0.1.0. invoke() returns a dict;
# its "result" entry is the same answer string that run() used to return.
qa.invoke({"query": query})["result"]

# Response:
# Benito Mussolini was an Italian politician and journalist who served as the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party and played a significant role in the rise of fascism in Italy...


In [None]:
from langchain.chains import RetrievalQAWithSourcesChain

# Same retrieval-augmented setup, but the chain also reports which source URLs
# the answer was drawn from.
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)
# Calling a chain directly (chain(...)) is deprecated since LangChain 0.1.0;
# invoke() returns the same dict with 'question', 'answer' and 'sources'.
qa_with_sources.invoke({"question": query})

# Response:
# {'question': 'who was Benito Mussolini?',  
# 'answer': "Benito Mussolini was an Italian politician and journalist who served as the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party and played a significant role in the rise of fascism in Italy...",  
# 'sources': 'https://simple.wikipedia.org/wiki/Benito%20Mussolini'}  


In [None]:
# Clean up: delete the index created for this walkthrough. This removes every
# vector stored in it, so run it only when you are finished.
pc.delete_index(index_name)
