In [1]:
# 📦 Step 1: Install required packages
!pip install openai psycopg2-binary sqlalchemy redis pandas faker PyPDF2 tqdm




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# 🛠️ Step 2: Load and preview the data
import pandas as pd
df = pd.read_csv("financial_reports_sample.csv")
df.head()

Unnamed: 0,company,report_type,report_date,section,content
0,Infosys,Annual Report,2025-03-28,Cash Flow,Cash Flow analysis for Infosys from the Annual...
1,Wipro,Annual Report,2024-11-26,Profit,Profit analysis for Wipro from the Annual Repo...
2,Reliance,Financial Statement,2023-10-01,Revenue,Revenue analysis for Reliance from the Financi...
3,L&T,Annual Report,2025-06-06,Liabilities,Liabilities analysis for L&T from the Annual R...
4,HCL,Annual Report,2022-07-26,Revenue,Revenue analysis for HCL from the Annual Repor...


In [3]:
from sqlalchemy import create_engine

# Update the credentials and DB name as needed
engine = create_engine("postgresql+psycopg2://postgres:12345678@localhost:5432/finance_db")


In [4]:
engine

Engine(postgresql+psycopg2://postgres:***@localhost:5432/finance_db)

In [5]:
!pip install dotenv




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from dotenv import load_dotenv
load_dotenv()

True

In [7]:
import os

In [8]:
# 🧠 Step 4: Set up Pinecone and OpenAI
import openai
openai.api_key = os.getenv('OPENAI_API_KEY')


In [9]:
!pip install --upgrade pinecone




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
pip uninstall pinecone-client -y


Found existing installation: pinecone-client 6.0.0
Uninstalling pinecone-client-6.0.0:
  Successfully uninstalled pinecone-client-6.0.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install pinecone


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
!pip install --upgrade pinecone




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
from pinecone import Pinecone

pc = Pinecone(api_key="YOUR_API_KEY")

index_name = "llama-text-embed-v2-index"

index = pc.Index(index_name)

# Because your index is integrated with a hosted embedding model, you provide inputs as text 
# and Pinecone converts them to dense vectors automatically.
data = [
    {"id": "vec1", "text": "Apple is a popular fruit known for its sweetness and crisp texture."},
    {"id": "vec2", "text": "The tech company Apple is known for its innovative products like the iPhone."},
    {"id": "vec3", "text": "Many people enjoy eating apples as a healthy snack."},
    {"id": "vec4", "text": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."},
    {"id": "vec5", "text": "An apple a day keeps the doctor away, as the saying goes."},
    {"id": "vec6", "text": "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership."}
]

index.upsert_records(
    namespace="example-namespace",
    records=data
)

ImportError: cannot import name 'Pinecone' from 'pinecone' (unknown location)

In [19]:

index_name = "financial-rag-index"
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=1536)
index = pinecone.Index(index_name)

AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index',
            dimension=1536,
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )



In [None]:
# ⚙️ Step 5: Embed & upload to Pinecone
from tqdm import tqdm

def get_embedding(text):
    res = openai.Embedding.create(input=[text], model="text-embedding-ada-002")
    return res["data"][0]["embedding"]

def upload_batch(df):
    for i, row in tqdm(df.iterrows(), total=len(df)):
        try:
            embedding = get_embedding(row['content'])
            metadata = {
                "company": row['company'],
                "report_type": row['report_type'],
                "report_date": str(row['report_date']),
                "content": row['content'][:300]
            }
            index.upsert([(str(i), embedding, metadata)])
        except Exception as e:
            print(f"Error at row {i}: {e}")

In [None]:
# 🔍 Step 6: Retrieve context from Pinecone
def query_pinecone(question, company):
    question_vec = get_embedding(question)
    results = index.query(vector=question_vec, top_k=3, include_metadata=True, filter={"company": {"$eq": company}})
    return "\n\n".join([match['metadata']['content'] for match in results['matches']])

In [None]:
# 🤖 Step 7: Generate answer with OpenAI
def query_openai(context, question):
    messages = [{"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{question}"}]
    response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
    return response.choices[0].message["content"]

In [None]:
# ⚡ Step 8: Redis Caching
import redis
r = redis.Redis(host="localhost", port=6379, decode_responses=True)

def get_cached_answer(company, question):
    key = f"{company}:{question}"
    return r.get(key) if r.exists(key) else None

def set_cached_answer(company, question, answer):
    key = f"{company}:{question}"
    r.setex(key, 3600, answer)

In [None]:
# 🔁 Step 9: End-to-End Financial Query
def financial_query(company, question):
    cached = get_cached_answer(company, question)
    if cached:
        return f"(Cached) {cached}"

    context = query_pinecone(question, company)
    answer = query_openai(context, question)
    set_cached_answer(company, question, answer)
    return answer

In [None]:
# ✅ Step 10: Run a test query
financial_query("TCS", "What was the revenue trend over the last 3 years?")

In [None]:
# 📊 Step 11: Monitor cache hit ratio
cache_hits, cache_misses = 0, 0

def monitored_query(company, question):
    global cache_hits, cache_misses
    cached = get_cached_answer(company, question)
    if cached:
        cache_hits += 1
        return f"(Cached) {cached}"
    else:
        cache_misses += 1
        return financial_query(company, question)

In [None]:
# 📈 Step 12: Show cache stats
print(f"Cache Hits: {cache_hits}, Misses: {cache_misses}, Hit Ratio: {cache_hits / (cache_hits + cache_misses + 1e-6):.2%}")