# Basic RAG

This notebook is a fully set-up basic RAG app, including:

- PDF Loader
- Chunking
- Vector Embedding
- Pinecone Vector Database
- Retrieval
- Question/Response

## Prerequisites

In [None]:
# Create virtual environment
! python -m venv venv
! source venv/bin/activate
! which python #make sure that the end of your path output contains "...venv/bin/python"

In [None]:
# Install all packages
! pip install -r requirements.txt --quiet

## Environment

`(1) Packages`

In [1]:
import os
from dotenv import load_dotenv

# Read the .env file so its entries become available as environment variables
load_dotenv()

# Every lookup below yields None when the variable is not set.

## Pinecone Vector Database
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pinecone_api_host = os.environ.get('PINECONE_API_HOST')
index_name = os.environ.get('PINECONE_INDEX_NAME')

## LLM
openai_api_key = os.environ.get('OPENAI_API_KEY')

## LangSmith
langchain_tracing_v2 = os.environ.get('LANGCHAIN_TRACING_V2')
langchain_endpoint = os.environ.get('LANGCHAIN_ENDPOINT')
langchain_api_key = os.environ.get('LANGCHAIN_API_KEY')


`(2) LangSmith`

https://docs.smith.langchain.com/

In [2]:
# Re-export the LangSmith settings that were read from the environment.
# os.environ only accepts str values, so assigning a setting that is unset
# (None) raises a TypeError. Unset settings are skipped instead, so a missing
# LangSmith variable does not abort the rest of the notebook.
_langsmith_settings = {
    'LANGCHAIN_TRACING_V2': langchain_tracing_v2,
    'LANGCHAIN_ENDPOINT': langchain_endpoint,
    'LANGCHAIN_API_KEY': langchain_api_key,
}
os.environ.update(
    {name: value for name, value in _langsmith_settings.items() if value is not None}
)

`(3) API Keys`

In [3]:
# Settings the rest of the notebook exports to the environment.
_required_settings = {
    'OPENAI_API_KEY': openai_api_key,
    #Pinecone keys
    'PINECONE_API_KEY': pinecone_api_key,
    'PINECONE_API_HOST': pinecone_api_host,
    'PINECONE_INDEX_NAME': index_name,
}

# Fail fast with a readable message: os.environ rejects None, so an unset
# variable would otherwise surface as an opaque "str expected" TypeError.
_missing_settings = [name for name, value in _required_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in your .env file or shell before running this notebook."
    )

os.environ.update(_required_settings)

# Chat model used for answer generation
openai_model = "gpt-3.5-turbo"

`(4) Pinecone Init`

In [4]:
from pinecone import Pinecone

# Create the Pinecone client from the exported API key, then open a handle to
# the index named by PINECONE_INDEX_NAME. Both lookups raise KeyError when the
# variable has not been exported.
_pinecone_key = os.environ['PINECONE_API_KEY']
pc = Pinecone(api_key=_pinecone_key)

_pinecone_index_name = os.environ['PINECONE_INDEX_NAME']
index = pc.Index(_pinecone_index_name)

## Full RAG App (Basic)

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate

#### INDEXING ####
import hashlib  # imported here because only the chunk-id computation below needs it

# Path of the single PDF that gets indexed
pdf_file_paths = "test/scikit-dataset-transformations-feature-extraction.pdf"
loader = PyPDFLoader(pdf_file_paths)

docs = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Deterministic ids: derived from the file path, the chunk position and the
# chunk text, so running this cell again overwrites the same vectors instead of
# adding a fresh copy of every chunk to the index on each run.
split_ids = [
    hashlib.sha256(
        f"{pdf_file_paths}:{position}:{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(splits)
]

# Embed
# NOTE: `Pinecone` here is the LangChain vector store imported at the top of
# this cell; that import rebinds the name used for the Pinecone client in the
# "Pinecone Init" cell.
# NOTE(review): the index dimension must match the embedding model's output
# size - confirm against the index configuration.
vectorstore = Pinecone.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
    index_name=index_name,
    ids=split_ids,
)

retriever = vectorstore.as_retriever()

#### RETRIEVAL and GENERATION ####

# LLM: chat model settings collected in one place, then passed on unchanged
llm_settings = {"model_name": openai_model, "temperature": 0.1}
llm = ChatOpenAI(**llm_settings)

# Prompt: restricts the answer to the supplied context; {context} and
# {question} are filled in when the chain runs
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Post-processing
def format_docs(docs):
    """Merge the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in iteration order, separated by one blank
        line; an empty string when ``docs`` yields nothing.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Chain
# Step 1: the incoming question goes to the retriever and the retrieved
# documents are flattened into a single context string.
_context_step = retriever | format_docs

# Step 2: build the prompt variables - the retrieved context plus the question
# itself, passed through unchanged.
_prompt_inputs = {"context": _context_step, "question": RunnablePassthrough()}

# Step 3: fill the prompt, call the model, and parse its reply into a string.
rag_chain = _prompt_inputs | prompt | llm | StrOutputParser()

In [None]:
# Question
# Run the whole chain once and show the model's answer
answer = rag_chain.invoke("What is this document about?")
print(answer)