In [None]:
import sys
# Show which Python interpreter this kernel is running on (handy for checking the active environment).
print(sys.executable)

In [None]:
from langchain_astradb import AstraDBVectorStore
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os
import pandas as pd

Setting up environment variables for OpenAI and AstraDB

In [None]:
# Load key/value pairs from a local .env file into the process environment
# so the os.getenv lookups in the following cells can find them.
load_dotenv()

In [None]:
# Pull the OpenAI key out of the process environment (None when the variable is unset).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
# os.environ only accepts strings: assigning None (what os.getenv returns for a
# missing variable) fails with an opaque "str expected, not NoneType" TypeError.
# Fail early with an actionable message instead.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Astra DB connection settings read from the environment; each is None when its variable is unset.
ASTRA_DB_API_ENDPOINT, ASTRA_DB_APPLICATION_TOKEN = (
    os.environ.get(name)
    for name in ("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
)

In [None]:
# Validate before exporting: os.environ rejects None with an unhelpful TypeError,
# so report every missing Astra DB setting by name in a single error.
_missing = [
    name
    for name, value in (
        ("ASTRA_DB_API_ENDPOINT", ASTRA_DB_API_ENDPOINT),
        ("ASTRA_DB_APPLICATION_TOKEN", ASTRA_DB_APPLICATION_TOKEN),
    )
    if not value
]
if _missing:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing)
        + "; add them to your .env file or export them before running this notebook."
    )
os.environ["ASTRA_DB_API_ENDPOINT"] = ASTRA_DB_API_ENDPOINT
os.environ["ASTRA_DB_APPLICATION_TOKEN"] = ASTRA_DB_APPLICATION_TOKEN

Create instances of OpenAIEmbeddings and AstraDBVectorStore.

In [None]:
# Embedding client handed to the vector store below.
# No arguments are passed, so the model choice and credentials come from the library's
# defaults (presumably the OPENAI_API_KEY exported earlier — confirm).
embedding = OpenAIEmbeddings()

In [None]:
# Vector store handle for the "financebot" collection in the "default_keyspace" namespace,
# built from the embedding object and the Astra DB credentials defined above.
vstore = AstraDBVectorStore(
    collection_name="financebot",
    namespace="default_keyspace",
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    embedding=embedding,
)

Create a list of documents from PDF.

In [None]:
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# Show the kernel's current working directory; the relative data path used below depends on it.
%pwd

In [None]:
# Step up to the parent directory only when the data file is not already reachable.
# An unconditional `%cd ..` climbs one more level every time the cell is re-run,
# which silently breaks the relative "data/..." path used by the loader.
if not os.path.exists("data/finance_data.pdf"):
    os.chdir("..")
os.getcwd()

In [None]:
# Path is relative to the current working directory.
file_path = "data/finance_data.pdf"
# Loader for that PDF; the pages themselves are fetched by loader.load() in the next cell.
loader = PyPDFLoader(file_path)

In [None]:
# Load the PDF (presumably one Document per PDF page — confirm) and show how many items came back.
pages = loader.load()
len(pages)

In [None]:
# Keep only list positions 10-19 (a 10-item window) for this demo.
# Slice a fresh load rather than `pages` itself: `pages = pages[10:20]` is not
# idempotent — running the cell a second time would slice the already-shortened
# list and leave it empty. The price is one extra parse of the PDF.
pages = loader.load()[10:20]

In [None]:
# Dump the text of every selected page for a visual sanity check.
for page in pages:
    print(page.page_content)

In [None]:
# Concatenate the text of all selected pages into one string for the splitter.
# Built with str.join instead of repeated `+=`, which avoids quadratic copying
# and needs no loop index.
# NOTE(review): pages are joined with no separator, so the last word of one page can
# fuse with the first word of the next; switch to "\n".join(...) if that matters.
raw_text = "".join(page.page_content for page in pages)

print(raw_text)

Create chunks of text from pages created above

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Splitter settings pulled out as named constants so they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)



In [None]:
# Split the concatenated page text into chunks and show how many were produced.
texts = text_splitter.split_text(raw_text)
len(texts)

Number of chunks created: 90

In [None]:
# Check what kind of container split_text returned.
type(texts)

In [None]:
# Check the type of an individual chunk.
type(texts[0])

Convert each string chunk to a Document before pushing it to the vector store

In [None]:
from langchain.docstore.document import Document

In [None]:
# Wrap every text chunk in a Document; only page_content is set, no metadata is passed.
docs = [Document(page_content=chunk) for chunk in texts]

In [None]:
# Peek at the first Document to verify the conversion.
docs[0]

Store the vectors in the AstraDB vector store

In [None]:
import hashlib

# Derive each document's id from a hash of its text so this cell is idempotent.
# Without explicit ids the store assigns fresh ones on every call, so re-running the
# cell would insert every chunk again and the retriever would return duplicates.
# With content-derived ids a re-run overwrites the same records instead.
chunk_ids = [hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest() for doc in docs]
vstore.add_documents(docs, ids=chunk_ids)

Create a retriever using the AstraDB vector store

In [None]:
# Retriever over the vector store; "k" is the number of documents to return per query.
retriever = vstore.as_retriever(
    search_kwargs={"k": 3},
)

Example of how retriever works

In [None]:
# Smoke-test the retriever on its own. `invoke` is the Runnable entry point that
# supersedes the deprecated `get_relevant_documents`, and it matches the
# `chain.invoke(...)` call used at the end of the notebook.
retriever.invoke("what is Market For Registrant’s Common Equity?")

Create a prompt template for the LLM model

In [None]:
# Prompt sent to the chat model. {context} is filled with the retrieved documents and
# {question} with the user's query when the chain runs.
# The literal must open with exactly three quotes: a fourth one becomes part of the
# prompt text as a stray '"' character.
FINANCE_BOT_TEMPLATE = """
    Your finance bot is an expert in finance related advice.
    Ensure your answers are relevant to the query context and refrain from straying off-topic.
    Your responses should be concise and informative.

    CONTEXT:
    {context}

    QUESTION: {question}

    YOUR ANSWER:
"""

In [None]:
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# Turn the template string into a chat prompt; its {context} and {question} placeholders
# are filled in when the chain runs.
prompt = ChatPromptTemplate.from_template(FINANCE_BOT_TEMPLATE)

Instantiate an LLM model using OpenAI

In [None]:
from langchain_openai import ChatOpenAI

In [None]:
# Chat model created with library defaults (no model name or temperature is given here).
# NOTE(review): pin the model explicitly if reproducible answers matter.
llm = ChatOpenAI()

Create a chain using the prompt and LLM

In [None]:
# RunnablePassthrough passes the user's query through unchanged at run time so it can be given to the LLM
from langchain_core.runnables import RunnablePassthrough

In [None]:
# To get the final output from the model
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(documents):
    """Join the text of the retrieved documents into a single context string.

    Args:
        documents: Iterable of objects exposing ``page_content`` (what the retriever yields).

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# RAG pipeline: the incoming query is sent to the retriever (whose hits are flattened to
# plain text by format_docs) and, unchanged, to the "question" slot; the filled prompt
# goes to the LLM and the reply is parsed to a string.
# Without format_docs the prompt would receive the Python repr of a list of Document
# objects (metadata, escaped newlines and all) instead of readable context.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

Get a response from the chain via LLM

In [None]:
# Run the full pipeline end to end: retrieve context for the query, fill the prompt,
# call the LLM and display the parsed answer.
chain.invoke("what is Market For Registrant’s Common Equity?")