In [1]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os

load_dotenv()

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "finance"

index = pc.Index(index_name)

# created index already

# pc.create_index(
#     name=index_name,
#     dimension=1536,
#     metric="cosine",
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-east-1"
#     )
# )


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
from utils import *
from langchain.schema import Document
# according to docs, Document is a class for storing a piece of text and associated metadata

BATCH_SIZE = 100

load_dotenv()

embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

documents = list()

# each Document object has a chunk
for topic_folder in os.listdir("data"):
    vector_store = PineconeVectorStore(
        index=index, embedding=embeddings, namespace=topic_folder
    )
    for file in os.listdir("data/" + topic_folder):
        all_texts = load_text_from_pdf(f"data/{topic_folder}/{file}")
        all_text_chunks = chunk_texts(all_texts)
        documents.extend(
            [
                Document(
                    page_content=chunk,
                    metadata={"source": file, "content": topic_folder},
                )
                for chunk in all_text_chunks
            ]
        )

        for i in range(0, len(documents), BATCH_SIZE):
            batch = documents[i : i + BATCH_SIZE]
            vector_store.add_documents(batch)
            print(
                f"Added batch {i // BATCH_SIZE + 1} of {(len(documents) + BATCH_SIZE - 1) // BATCH_SIZE} for file {file}"
            )


  embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))


Added batch 1 of 4 for file Trade Finance Guide Updated 030421 FINAL.pdf
Added batch 2 of 4 for file Trade Finance Guide Updated 030421 FINAL.pdf
Added batch 3 of 4 for file Trade Finance Guide Updated 030421 FINAL.pdf
Added batch 4 of 4 for file Trade Finance Guide Updated 030421 FINAL.pdf
Added batch 1 of 4 for file why-trade-finance.pdf
Added batch 2 of 4 for file why-trade-finance.pdf
Added batch 3 of 4 for file why-trade-finance.pdf
Added batch 4 of 4 for file why-trade-finance.pdf
Added batch 1 of 5 for file 10 Rules of Investing.pdf
Added batch 2 of 5 for file 10 Rules of Investing.pdf
Added batch 3 of 5 for file 10 Rules of Investing.pdf
Added batch 4 of 5 for file 10 Rules of Investing.pdf
Added batch 5 of 5 for file 10 Rules of Investing.pdf
Added batch 1 of 7 for file Introduction_to_Personal_Investing.pdf
Added batch 2 of 7 for file Introduction_to_Personal_Investing.pdf
Added batch 3 of 7 for file Introduction_to_Personal_Investing.pdf
Added batch 4 of 7 for file Introduct

In [None]:
# # ---- Step 2: Prepare Chunk Texts for Embedding ----
# texts = [doc.page_content for doc in documents]

# # ---- Step 3: Generate Embeddings ----
# vectors = embeddings.embed_documents(texts)

In [None]:
# print(len(vectors))

141


In [None]:
# index = pc.Index(index_name)

# for i, vector in enumerate(vectors):
#     doc = documents[i]
#     vector_id = f"{doc.metadata['source']}_chunk_{i}"

#     metadata = {
#         "source": doc.metadata["source"],
#         "topic": doc.metadata["content"],
#         "text": doc.page_content  # optional: store actual chunk text
#     }

#     index.upsert(vectors=[(vector_id, vector, metadata)])

In [None]:
from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
import json


with open("news_sources.json") as f:
    metadata_lookup = json.load(f)


def answer_qn(namespace: str, question: str):

    # Reconnect to Pinecone index
    vectorstore = PineconeVectorStore(
        index=index, embedding=OpenAIEmbeddings(), namespace=namespace
    )


    llm = ChatOpenAI(
        model_name="gpt-4o", temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY")
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
    )

    response = qa_chain({"query": question})

    print("Answer:")
    print(response["result"])

    print("\n Sources used:")
    for doc in response["source_documents"]:
        # print(f"- {doc.metadata['source']}")
        print(f"- {metadata_lookup.get(doc.metadata['source'])}")

answer_qn("corporate finance", "teach me about corporate finance")


Answer:
Corporate finance is a field of finance that focuses on the financial activities and decisions of corporations. It involves managing the financial resources of a company to achieve its goals and maximize shareholder value. Here are some key concepts and areas within corporate finance:

1. **Investment Decisions**: This involves deciding which projects or investments a corporation should undertake. It includes evaluating potential projects, analyzing their expected returns, and assessing the risks involved. The goal is to invest in projects that will increase the company's value.

2. **Financing Decisions**: This involves determining how to raise the capital needed to fund the company's investments. Corporations can raise capital through equity (issuing stocks) or debt (borrowing money). The choice between equity and debt financing affects the company's capital structure and cost of capital.

3. **Dividend Policy**: This refers to the decision of how much profit to return to sha