In [15]:
import openai
import langchain
import pinecone 
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
import os

In [16]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; the embedding
# cell later reads OPENAI_API_KEY from os.environ.
# Presumably the Pinecone credentials are supplied the same way — TODO confirm.
load_dotenv()

True

## Reading documents

In [17]:
def read_documents_from_dir(directory: str) -> list:
    """Load the PDF files found in ``directory``.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the loader's ``load()`` call yields (a list of loaded
        documents).
    """
    return PyPDFDirectoryLoader(directory).load()

In [18]:
# Load the PDFs from the relative 'pdfs/' directory (resolved against the
# notebook's working directory).
docs = read_documents_from_dir('pdfs/')
# Number of loaded documents (presumably one per PDF page — verify against the loader).
len(docs)

1

## Chunking documents

In [19]:
def convert_docs_to_chunks(docs, chunk_size: int = 800, chunk_overlap: int = 50):
    """Split documents into smaller, overlapping chunks.

    Args:
        docs: Documents to split (as produced by the PDF loader).
        chunk_size: Size limit passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, passed to the
            splitter unchanged.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

# NOTE(review): this rebinds `docs` from the loaded documents to their chunks,
# so the cell is not idempotent — re-running it splits the chunks again rather
# than the original documents. Later cells read `docs` expecting the chunks.
docs = convert_docs_to_chunks(docs)

## Embedding model

In [20]:
from langchain_openai import OpenAIEmbeddings
# Indexing os.environ directly raises KeyError right here if OPENAI_API_KEY is
# not set, instead of failing later on the first API call.
embeddings = OpenAIEmbeddings(api_key = os.environ['OPENAI_API_KEY'])
# Echo the configured client; the key is held as a SecretStr and shows masked in the repr.
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x12c856690>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x12c849550>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None)

In [21]:
# Smoke test: embed a single query string and inspect the vector length.
# NOTE(review): despite the plural name this holds one embedding vector.
test_vectors = embeddings.embed_query("Who is hitler?")
# Presumably the Pinecone index dimension has to match this length — TODO confirm.
len(test_vectors)

1536

## Vector DB

In [22]:
import hashlib

from langchain_pinecone import PineconeVectorStore

index_name = "sample-index"


def _chunk_id(doc):
    """Return a stable vector ID derived from a chunk's origin and text.

    The ID is the SHA-256 hex digest of the chunk's ``source`` and ``page``
    metadata (empty string when a key is absent) joined with its text, so the
    same chunk always maps to the same ID.
    """
    key = "|".join(
        (
            str(doc.metadata.get("source", "")),
            str(doc.metadata.get("page", "")),
            doc.page_content,
        )
    )
    return hashlib.sha256(key.encode("utf-8")).hexdigest()


# Embed the chunks and upsert them into the existing Pinecone index.
# Explicit, deterministic IDs make this cell safe to re-run: without them every
# run stores the same chunks again under fresh random IDs, and the duplicates
# then crowd the top-k results returned by the retriever.
# NOTE(review): the name `pinecone` is kept because later cells use it, but it
# shadows the `pinecone` module imported at the top of the notebook.
pinecone = PineconeVectorStore.from_documents(
    docs,
    embeddings,
    index_name=index_name,
    ids=[_chunk_id(doc) for doc in docs],
)

## Query LLM

In [23]:
def retrieve_query(query, k: int = 2):
    """Return the ``k`` stored chunks most similar to ``query``.

    Args:
        query: Text to search for in the vector store.
        k: How many matches to request (defaults to 2).

    Returns:
        The result of the vector store's ``similarity_search`` call.
    """
    return pinecone.similarity_search(query, k=k)

In [24]:
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain import hub

llm = ChatOpenAI(model="gpt-3.5-turbo-0125")
retriever = pinecone.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(retrieved_docs):
    """Join the text of the retrieved chunks into one context string.

    Args:
        retrieved_docs: Iterable of documents exposing ``page_content``.

    Returns:
        The chunks' text separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# The retriever yields Document objects; piping them through format_docs gives
# the prompt plain text for its "context" slot instead of the string form of a
# Document list (which would drag metadata and repr noise into the prompt).
# The user's question is passed through unchanged as "question".
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

## Example

In [33]:
# Ask a question about the indexed PDF; the chain ends in StrOutputParser, so
# the answer comes back as a plain string.
rag_chain.invoke("total of my units?")

'The total of your units in the PMMF UITF is 206.1112, with a corresponding amount of 358,037.74 PHP as of the latest transaction date provided in the document.'