In [1]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

### **Data Loading**

In [2]:
def load_pdf_files(data):
    """Load the PDF files of a directory into document objects.

    Args:
        data: Directory path handed unchanged to ``DirectoryLoader``; files
            are selected with the ``*.pdf`` glob.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        parsed by ``PyPDFLoader``.
    """
    pdf_directory_loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    )
    return pdf_directory_loader.load()

In [3]:
import os

# Directory that holds the source PDFs. Set the FINANCE_BOT_DATA_DIR
# environment variable to point the notebook at another location; when it is
# unset the original local path is used, so existing runs behave as before.
DATA_DIR = os.environ.get(
    "FINANCE_BOT_DATA_DIR",
    "C:\\Users\\hites\\OneDrive\\Desktop\\Bot\\data",
)
extracted_data = load_pdf_files(DATA_DIR)

Ignoring wrong pointing object 0 0 (offset 0)


In [4]:
# Number of document objects returned by the loader
len(extracted_data)

859

### **Data Pre-processing**

In [21]:
from typing import List
from langchain.schema import Document

def filter_docs(docs: List[Document]) -> List[Document]:
    """
    Build slimmed-down copies of the given documents.

    Every returned Document keeps the original page content and carries a
    metadata dict whose only key is 'source', looked up with
    ``metadata.get("source")`` on the input document. The input list is not
    modified; a new list is returned in the same order.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]


In [22]:
# Reduce every loaded document to its page content plus the 'source' metadata
filtered_docs = filter_docs(extracted_data)

In [26]:
# Preview the first five filtered documents
filtered_docs[:5]

[Document(metadata={'source': 'C:\\Users\\hites\\OneDrive\\Desktop\\Bot\\data\\Encyclopedia_finance.pdf'}, page_content=''),
 Document(metadata={'source': 'C:\\Users\\hites\\OneDrive\\Desktop\\Bot\\data\\Encyclopedia_finance.pdf'}, page_content='Encyclopedia of Finance'),
 Document(metadata={'source': 'C:\\Users\\hites\\OneDrive\\Desktop\\Bot\\data\\Encyclopedia_finance.pdf'}, page_content='The Editors\nCheng-Few Lee,Rutgers University, USA\nAlice C. Lee,San Franscisco State University, USA\nADVISORY BOARD\nJames R. Barth,Auburn University and Milken Institute, USA\nIvan Brick,Rutgers University, USA\nWayne Ferson,Boston College, USA\nJoseph E. Finnerty,Universty of Illinois, USA\nMartin J. Gruber,New York University, USA\nGeorge Kaufman,Layola University, USA\nJohn Kose,New York University, USA\nRobert A. Schwartz,City University of New York, USA'),
 Document(metadata={'source': 'C:\\Users\\hites\\OneDrive\\Desktop\\Bot\\data\\Encyclopedia_finance.pdf'}, page_content='Encyclopedia of 

### **Text Splitting and Chunking**

In [27]:
def text_split(filtered_docs, chunk_size=500, chunk_overlap=50):
    """Split documents into overlapping text chunks.

    Args:
        filtered_docs: Documents to split; passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            50, the value that used to be hard-coded here.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(filtered_docs)

In [28]:
# Split the filtered documents into chunks for embedding
text_chunks = text_split(filtered_docs)

In [29]:
# Number of chunks produced by the splitter
len(text_chunks)

6595

### **Embedding & RAG Creation**

In [81]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from dotenv import load_dotenv 
import os 

# Pull variables from a local .env file (when one exists) into os.environ.
# load_dotenv is imported in this cell but was never called, so on a fresh
# kernel the keys were only visible if they had been exported some other way.
load_dotenv()

# Fail early with a readable message instead of the TypeError raised by
# `os.environ[name] = None` when a key is missing.
for required_key in ("PINECONE_API_KEY", "OPENAI_API_KEY"):
    if not os.getenv(required_key):
        raise EnvironmentError(
            f"{required_key} is not set; export it or add it to a .env file."
        )

In [82]:
# Embedding model shared by the indexing step and the retriever.
# NOTE(review): its vector length must match the index `dimension` (1536) set below.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [88]:
# Smoke test: embed a short string to confirm the embedding call works.
# NOTE(review): `emb` is not referenced by the later cells shown here.
emb = embedding_model.embed_query("Hello world")

In [86]:
from pinecone import Pinecone
# Read the Pinecone key from the environment and build the client with it
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

In [96]:
from pinecone import ServerlessSpec

index_name = "finance-bot"

# Create the serverless index only when it does not exist yet
if not pc.has_index(index_name):
    index_settings = {
        "name": index_name,
        "dimension": 1536,  # Dim of the embedding
        "metric": "cosine",  # Cosine similarity
        "spec": ServerlessSpec(cloud="aws", region="us-east-1"),
    }
    pc.create_index(**index_settings)

In [97]:
# Handle to the Pinecone index named above
index = pc.Index(index_name)

In [98]:
from langchain_pinecone import PineconeVectorStore

# Upload the chunks only when the index is still empty. from_documents embeds
# and upserts every chunk on each call, so re-running this cell unguarded
# pays for the embeddings again and stores duplicate vectors.
# NOTE(review): index stats can lag briefly after an upsert.
index_stats = index.describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        embedding=embedding_model,
        index_name=index_name,
    )
else:
    # Index already populated: attach to it without uploading anything.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding_model,
    )

In [99]:
# Attach to the index by name without passing any documents.
# NOTE(review): `docsearch_existing` is not used below; the retriever is
# built from `docsearch`.
docsearch_existing = PineconeVectorStore.from_existing_index(
    index_name= index_name, 
    embedding=embedding_model
)

In [105]:
# Similarity-search retriever configured with k=3 results per query
retriever = docsearch.as_retriever(search_type = "similarity", search_kwargs = {'k': 3})

In [106]:
# Sanity check: inspect what the retriever returns for a simple query
retriever.invoke("What is finance")

[Document(id='bab4fef4-cbd3-4c57-8142-eac04ed648e8', metadata={'source': 'C:\\Users\\hites\\OneDrive\\Desktop\\Bot\\data\\Encyclopedia_finance.pdf'}, page_content='Encyclopedia of Finance'),
 Document(id='c31ef2b5-a48d-47db-be7c-a23997fe72a3', metadata={'source': 'C:\\Users\\hites\\OneDrive\\Desktop\\Bot\\data\\Encyclopedia_finance.pdf'}, page_content='flows.\n33. Finance Charge\nAs defined by truth-in-lending Regulation Z the\nfinance charge refers to ‘‘all charges payable dir-\nectly or indirectly by the borrower and imposed\ndirectly or indirectly by the lender as an incident to\nor as an extension of credit.’’\n34. Finance Company\nA firm that borrows from the money and capital\nmarkets to make loans to individuals and commer-\ncial enterprises. The services provided by finance\ncompanies include consumer lending, business'),
 Document(id='74349d70-6267-4e96-9536-4eb6530d83aa', metadata={'source': 'C:\\Users\\hites\\OneDrive\\Desktop\\Bot\\data\\Encyclopedia_finance.pdf'}, page_con

In [107]:
# Chat model that generates the final answer in the RAG chain
chat_model = ChatOpenAI(model="gpt-4o-mini")

In [124]:
from langchain.prompts import ChatPromptTemplate

# System instructions for the assistant. The trailing "{context}" placeholder
# is a template variable; the adjacent string literals are concatenated into
# one prompt string.
system_prompt = (
    "You are a knowledgeable and reliable Financial Assistant designed to help users "
    "with finance-related questions. Use the provided context to generate accurate, "
    "clear, and concise answers. Your response should be based strictly on the information "
    "in the context. If the answer is not available in the context, say that you don't know — "
    "do not make up answers.\n\n"
    "Respond in a professional tone suitable for investors, analysts, and business users. "
    "Limit your response to three sentences.\n\n"
    "{context}"
)

# Two-message chat template: the system instructions above, then the user's
# question supplied through the "{input}" variable.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [125]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [126]:
# Chain that combines the chat model with the prompt template
question_answering_chain = create_stuff_documents_chain(chat_model, prompt)
# Full RAG chain: the retriever feeds the question-answering chain
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [131]:
# In-domain question; only the generated answer text is printed
response = rag_chain.invoke(
    {
        "input": "How does the concept of systematic and unsystematic risk influence portfolio diversification, and what role does the CAPM model play in this context?"
    }
)
print(response["answer"])

Systematic risk affects all investments in the market and cannot be eliminated through diversification, while unsystematic risk pertains to individual assets and can be mitigated by holding a diversified portfolio. Investors aiming to achieve complete diversification should focus on holding the market portfolio, which reflects systematic risks. The Capital Asset Pricing Model (CAPM) provides a framework to measure an asset's systematic risk by comparing its returns to those of the market portfolio, supporting the idea that optimal portfolios are established at the tangency point in mean-standard deviation space.


In [132]:
# In-domain question; only the generated answer text is printed
response = rag_chain.invoke(
    {
        "input": "How does the Modigliani-Miller theorem challenge traditional views on capital structure, and how do taxes and bankruptcy costs modify its conclusions in real markets?"
    }
)
print(response["answer"])

The Modigliani-Miller theorem posits that in a world without taxes and bankruptcy costs, a firm's capital structure does not affect its overall value. However, when taxes are introduced, the proposition is modified to incorporate the tax shield value of debt, thereby indicating that leverage can enhance firm value due to the tax deductibility of interest (as per equation VL = VU + (T)(D)). Additionally, in real markets, the existence of bankruptcy costs can further negate the benefits of high leverage, altering the optimal capital structure and emphasizing the trade-off between tax advantages and potential financial distress.


In [135]:
# In-domain question; only the generated answer text is printed
response = rag_chain.invoke(
    {
        "input": "What are the key differences between passive and active investment strategies, and how does the Efficient Market Hypothesis support or challenge each?"
    }
)
print(response["answer"])

Passive investment strategies involve buying and holding a well-diversified portfolio that represents a broad-based market index without attempting to identify mispriced securities. In contrast, active investment strategies seek to outperform the market by making specific security selections based on market analysis and predictions. The Efficient Market Hypothesis (EMH) supports passive strategies by suggesting that all available information is already reflected in stock prices, making it difficult for investors with private information to consistently outperform a passive investment approach.


In [136]:
# Off-topic question: the system prompt tells the model to say it does not
# know when the context has no answer
response = rag_chain.invoke(
    {"input": "How to loose weight fast"}
)
print(response["answer"])

I don't know.
