### **Download required packages**

In [None]:
# Core tools
!pip install --upgrade pip
!pip install langchain langchain-community

# PDF loader (PyMuPDF)
!pip install pymupdf

# Embeddings
!pip install sentence-transformers

# Vector storage
!pip install faiss-cpu

# Perplexity support (through langchain_community)
!pip install langchain-community

# Basic utilities
!pip install pandas requests openai

!pip install tf-keras

!pip install plotly

### **Set Up**

In [1]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.chat_models import ChatPerplexity
from langchain.chains import RetrievalQA
import os

In [2]:
from openai import OpenAI
import pandas as pd 
import os, json, random, re

### **Q&A Model**

In [3]:
# USER INPUT
N_QUESTIONS = 1              # number of questions generated per PDF
PROJECT_NAME = "session_1"   # folder that receives every artefact of this session
INPUT_DIR = "annual_report"  # folder scanned for source PDFs

# CONFIG
metadata_dir = os.path.join(PROJECT_NAME, "metadata.json")  # path of the metadata FILE
CHUNK_SIZE = 5000       # chunk_size handed to the text splitter
CHUNK_OVERLAP = 100     # chunk_overlap handed to the text splitter
N_PAGE_SUMMARY = 3      # leading pages sent to the summary prompt
# Never commit a real key: read it from the environment. The empty-string fallback
# keeps the previous behaviour when the variable is not set.
API_KEY = os.environ.get("PPLX_API_KEY", "")

###### DELETE THIS ON PRODUCTION #########
# Remove session folder and everything in it
if os.path.exists(PROJECT_NAME):
    import shutil
    shutil.rmtree(PROJECT_NAME)
    print(f"Folder '{PROJECT_NAME}' deleted.")
##########################################

# INITIALIZATION
client = OpenAI(api_key=API_KEY, base_url="https://api.perplexity.ai")

# Create the session folder AND its chunks subfolder. `exist_ok=True` also covers
# the case where the session folder exists but `chunks` is missing, which used to
# leave CHUNKS_DIR uncreated and made the later chunk writes fail.
CHUNKS_DIR = os.path.join(PROJECT_NAME, "chunks")
project_already_existed = os.path.exists(PROJECT_NAME)
os.makedirs(CHUNKS_DIR, exist_ok=True)
if project_already_existed:
    print(f"Folder '{PROJECT_NAME}' already exists.")
else:
    print(f"Folder '{PROJECT_NAME}' created.")

Folder 'session_1' created.


In [4]:
def summary_extraction(first_n_pages):
    """Ask the chat model for a one-sentence summary of a report's opening pages.

    Args:
        first_n_pages: Text of the first pages of the report; it is interpolated
            verbatim into the user message.

    Returns:
        The content of the first completion choice, stripped of surrounding
        whitespace.
    """
    # System message: fixes the assistant role and the expected summary format.
    system_instructions = """
                You are a financial report assistant. 
                I will provide the first few pages of a financial report, and your task is to give a concise, single-sentence summary answering: 
                (1) which company the report is about and 
                (2) what year it covers. 
                Limit the summary to 50 words, with no extra details or formatting.
                """
    # User message: carries the actual page text.
    user_request = f"""
            The first few pages {first_n_pages}
            Your response: 
            """
    conversation = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": user_request},
    ]

    completion = client.chat.completions.create(
        model="sonar",
        temperature=0,
        messages=conversation,
    )
    return completion.choices[0].message.content.strip()


def generate_questions(summary, chunk):
    """Ask the chat model for one question grounded in a single document chunk.

    Args:
        summary: Short description of the PDF, interpolated into the user message.
        chunk: Text of the chunk the question should be based on.

    Returns:
        The content of the first completion choice, stripped of surrounding
        whitespace. The system prompt asks the model to answer with an empty
        string when the chunk lacks useful content.
    """
    # System message: rules the generated question has to follow.
    system_instructions = """
            You are a question generator. 
            I will provide a chunk of information along with its PDF context. 
            Your task is to generate one question  with the following requirement
            (1) The question should based solely on the chunk’s content
            (2) The question should include enough context from the summary (company name and year) to make it clear what the question is about.
            (3) Do not add any extra information. 
            (4) If the chunk lacks useful content, respond with an empty string.
            """
    # User message: the PDF summary followed by the chunk itself.
    user_request = f"""
            PDF Summary {summary}. Chunk Text: {chunk}
            Your question:
            """
    conversation = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": user_request},
    ]

    completion = client.chat.completions.create(
        model="sonar",
        messages=conversation,
    )
    return completion.choices[0].message.content.strip()

In [5]:
pdf_list = []

# Load metadata written by a previous run (if any) so PDFs that were already
# processed are skipped instead of being summarised again.
if os.path.exists(metadata_dir):
    with open(metadata_dir, "r") as f:
        metadata = json.load(f)
else:
    metadata = []
all_filenames = [entry["file_name"] for entry in metadata]

# The splitter does not depend on the file being processed, so build it once.
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Loop through PDFs
for file_name in os.listdir(INPUT_DIR):
    if not file_name.endswith(".pdf") or file_name in all_filenames:
        continue

    file_path = os.path.join(INPUT_DIR, file_name)
    pdf_list.append(file_path)

    # 1) Load
    documents = PyMuPDFLoader(file_path).load()

    # 2) Split
    chunks = splitter.split_documents(documents)

    # 3) Summary, built from the text of the first N_PAGE_SUMMARY loaded documents
    first_n_pages = "\n".join(doc.page_content for doc in documents[:N_PAGE_SUMMARY])
    summary = summary_extraction(first_n_pages)

    # Log. NOTE: the "word" count is the number of space characters, i.e. only an
    # approximation of the real word count.
    word_count = sum(doc.page_content.count(' ') for doc in documents)
    chunk_count = len(chunks)
    print(f"Processed {file_path}, {word_count} words and {chunk_count} chunks.")

    # 4) Save metadata
    # Strip only the final extension. `split(".")[0]` cut the name at the FIRST dot,
    # so "report.v1.pdf" and "report.v2.pdf" both mapped to "report" and overwrote
    # each other's chunk file.
    format_name = os.path.splitext(file_name)[0]
    metadata.append({
        "file_name": file_name,
        "format_name": format_name,
        "file_path": file_path,
        "chunk_count": chunk_count,
        "total_word_count": word_count,
        "summary": summary,
    })

    # 5) Save all chunks of this PDF into one JSON file
    chunks_text_list = [chunk.page_content for chunk in chunks]
    file_chunks_dir = os.path.join(CHUNKS_DIR, f"{format_name}.json")
    with open(file_chunks_dir, "w") as f:
        json.dump(chunks_text_list, f, indent=2)

# Save metadata to file
with open(metadata_dir, "w") as f:
    json.dump(metadata, f, indent=2)

Processed annual_report/2024-amazon-annual-report-10K.pdf, 47161 words and 112 chunks.
Processed annual_report/2024-apple-annual-report-10K.pdf, 57814 words and 134 chunks.
Processed annual_report/2024-google-annual-report-10K.pdf, 51919 words and 123 chunks.
Processed annual_report/2024-netflix-annual-report-10K.pdf, 36885 words and 105 chunks.
Processed annual_report/2024-cisco-full-annual-report.pdf, 71074 words and 175 chunks.
Processed annual_report/2024-meta-full-annual-report.pdf, 3107 words and 11 chunks.


In [6]:
# Column order of the resulting question table.
RESULT_COLUMNS = ['file_name', 'question', 'format_name', 'file_path',
                  'summary', 'chunk', 'chunk_id']

# One record per generated question; turned into a DataFrame once at the end.
question_records = []

for data in metadata:
    file_name, format_name, file_path, summary = data["file_name"], data["format_name"], data["file_path"], data["summary"]
    file_chunks_dir = os.path.join(CHUNKS_DIR, f"{format_name}.json")
    # Context manager so the file handle is closed right after reading.
    with open(file_chunks_dir, "r") as f:
        chunks = json.load(f)

    # random.randint(0, -1) raises ValueError, so skip PDFs that produced no chunks.
    if not chunks:
        print(f"Skipping {file_name}: no chunks available.")
        continue

    for _ in range(N_QUESTIONS):
        # Pick a random chunk (with replacement across iterations)
        chunk_id = random.randint(0, len(chunks) - 1)
        chunk = chunks[chunk_id]
        question = generate_questions(summary, chunk)
        # Save to results
        question_records.append({
            'file_name': file_name,
            'question': question,
            'format_name': format_name,
            'file_path': file_path,
            'summary': summary,
            'chunk': chunk,
            'chunk_id': chunk_id,
        })

# Passing `columns` keeps the expected schema even when no record was produced.
res = pd.DataFrame(question_records, columns=RESULT_COLUMNS)

In [7]:
# Preview the first rows of the generated question table.
res.head()

Unnamed: 0,file_name,question,format_name,file_path,summary,chunk,chunk_id
0,2024-amazon-annual-report-10K.pdf,"What are the key details about Amazon.com, Inc...",2024-amazon-annual-report-10K,annual_report/2024-amazon-annual-report-10K.pdf,"The report is about Amazon.com, Inc. and cover...",Available Information\nOur investor relations ...,17
1,2024-apple-annual-report-10K.pdf,"Based on the 2024 Apple Inc. report, what are ...",2024-apple-annual-report-10K,annual_report/2024-apple-annual-report-10K.pdf,The report is about Apple Inc. and covers the ...,"Discharge, Defeasance and Covenant Defeasance\...",84
2,2024-google-annual-report-10K.pdf,Based on Alphabet Inc.'s 2024 Form 10-K annual...,2024-google-annual-report-10K,annual_report/2024-google-annual-report-10K.pdf,The report is the Form 10-K annual report for ...,and app integration. Gemini for Google Workspa...,6
3,2024-netflix-annual-report-10K.pdf,"Based on Netflix, Inc.'s fiscal year ended Dec...",2024-netflix-annual-report-10K,annual_report/2024-netflix-annual-report-10K.pdf,"The report is about Netflix, Inc. and covers t...","Table of Contents\nYear Ended December 31,\n20...",82
4,2024-cisco-full-annual-report.pdf,What does Cisco’s pro forma financial informat...,2024-cisco-full-annual-report,annual_report/2024-cisco-full-annual-report.pdf,The report is about Cisco and covers the fisca...,Pro forma Financial Information\nThe unaudited...,132


In [8]:
# Persist the question table inside the session folder (row index not written),
# then display the question column.
res.to_csv(f"{PROJECT_NAME}/questions.csv", index=False)
res['question']

0    What are the key details about Amazon.com, Inc...
1    Based on the 2024 Apple Inc. report, what are ...
2    Based on Alphabet Inc.'s 2024 Form 10-K annual...
3    Based on Netflix, Inc.'s fiscal year ended Dec...
4    What does Cisco’s pro forma financial informat...
5    What were the revenue, net income, and income ...
Name: question, dtype: object

#### **Create a dummy testset**

In [9]:
# Settings
# NOTE: CHUNK_SIZE / CHUNK_OVERLAP are reassigned here, so any cell that reads
# them after this one has run sees these values.
PDF_FOLDER = "annual_report"
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 500

# Load all PDFs
all_documents = []
for filename in os.listdir(PDF_FOLDER):
    if filename.endswith(".pdf"):
        file_path = os.path.join(PDF_FOLDER, filename)
        loader = PyMuPDFLoader(file_path)
        documents = loader.load()
        all_documents.extend(documents)

print(f"Loaded {len(all_documents)} total documents.")

# Split
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = splitter.split_documents(all_documents)

print(f"Split into {len(chunks)} chunks.")

# Embedding
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
# normalize_embeddings only controls whether the vectors are scaled to unit length.
# NOTE(review): the distance metric itself is chosen by the vector store, not by this
# flag; on unit-length vectors a Euclidean ranking matches a cosine ranking. Confirm
# which metric the FAISS store uses before relying on the scores.
encode_kwargs = {'normalize_embeddings': False}

hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Vector Store
vector_store = FAISS.from_documents(chunks, hf)
vector_store.save_local("faiss_index_open")
print("Vector store saved.")

# Retriever: return the 3 closest chunks for each query
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# LLM setup
# Do not hard-code the key: prefer the PPLX_API_KEY environment variable and fall
# back to the API_KEY defined in the configuration cell.
llm = ChatPerplexity(
    model="sonar",
    pplx_api_key=os.environ.get("PPLX_API_KEY", API_KEY),
    temperature=0.2
)

# QA chain: "stuff" chain type over the retriever; the retrieved documents are
# returned alongside the answer
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True
)

print("QA chain is ready.")

Loaded 537 total documents.
Split into 660 chunks.


  hf = HuggingFaceEmbeddings(
2025-08-02 18:08:59.592976: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-02 18:08:59.608121: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754176139.622955    1895 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754176139.627700    1895 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754176139.641162    1895 computation_placer.cc:177] computation placer already registered. Please ch

Vector store saved.
QA chain is ready.


  llm = ChatPerplexity(


In [10]:
# Run every generated question through the QA chain and keep, per question,
# the generated answer and the retrieved source documents (top 3).
qa_records = []
for question in res['question']:
    result = qa_chain.invoke(question)
    qa_records.append({
        'question': question,
        'answer': result['result'],
        'top_k_chunk': result['source_documents'],
    })

dummy_test = pd.DataFrame(qa_records, columns=['question', 'answer', 'top_k_chunk'])

In [11]:
# Display the question / answer / retrieved-chunks table.
dummy_test

Unnamed: 0,question,answer,top_k_chunk
0,"What are the key details about Amazon.com, Inc...","Key executive officers of Amazon.com, Inc. for...",[page_content='Available Information\nOur inve...
1,"Based on the 2024 Apple Inc. report, what are ...",Apple’s legal defeasance and covenant defeasan...,[page_content='DESCRIPTION OF DEBT SECURITIES\...
2,Based on Alphabet Inc.'s 2024 Form 10-K annual...,Gemini technology is integrated within Google ...,[page_content='concepts in a format that is us...
3,"Based on Netflix, Inc.'s fiscal year ended Dec...","During 2023 and 2024, Netflix, Inc. significan...","[page_content='Table of Contents\nNETFLIX, INC..."
4,What does Cisco’s pro forma financial informat...,Cisco’s pro forma financial information for fi...,[page_content='Pro forma Financial Information...
5,"What were the revenue, net income, and income ...","For the fourth quarter ended December 31, 2024...","[page_content='META PLATFORMS, INC.\nCONDENSED..."


In [12]:
# Combine ground truth and generated answer.
# Joining on the question text alone produces a cartesian product whenever two rows
# share the same question (e.g. the empty string the generator may return). When the
# two tables list the questions in the same order, combine them positionally instead.
if res['question'].tolist() == dummy_test['question'].tolist():
    combined_df = pd.concat(
        [
            res.reset_index(drop=True),
            dummy_test.drop(columns='question').reset_index(drop=True),
        ],
        axis=1,
    )
else:
    # Tables are not row-aligned: fall back to the key join, but refuse to
    # silently duplicate rows when a question is not unique.
    combined_df = pd.merge(
        res,
        dummy_test,
        on='question',
        suffixes=('_ground_truth', '_generated'),
        validate='one_to_one',
    )

In [13]:
# Display the merged table (question metadata plus generated answer and retrieved chunks).
combined_df

Unnamed: 0,file_name,question,format_name,file_path,summary,chunk,chunk_id,answer,top_k_chunk
0,2024-amazon-annual-report-10K.pdf,"What are the key details about Amazon.com, Inc...",2024-amazon-annual-report-10K,annual_report/2024-amazon-annual-report-10K.pdf,"The report is about Amazon.com, Inc. and cover...",Available Information\nOur investor relations ...,17,"Key executive officers of Amazon.com, Inc. for...",[page_content='Available Information\nOur inve...
1,2024-apple-annual-report-10K.pdf,"Based on the 2024 Apple Inc. report, what are ...",2024-apple-annual-report-10K,annual_report/2024-apple-annual-report-10K.pdf,The report is about Apple Inc. and covers the ...,"Discharge, Defeasance and Covenant Defeasance\...",84,Apple’s legal defeasance and covenant defeasan...,[page_content='DESCRIPTION OF DEBT SECURITIES\...
2,2024-google-annual-report-10K.pdf,Based on Alphabet Inc.'s 2024 Form 10-K annual...,2024-google-annual-report-10K,annual_report/2024-google-annual-report-10K.pdf,The report is the Form 10-K annual report for ...,and app integration. Gemini for Google Workspa...,6,Gemini technology is integrated within Google ...,[page_content='concepts in a format that is us...
3,2024-netflix-annual-report-10K.pdf,"Based on Netflix, Inc.'s fiscal year ended Dec...",2024-netflix-annual-report-10K,annual_report/2024-netflix-annual-report-10K.pdf,"The report is about Netflix, Inc. and covers t...","Table of Contents\nYear Ended December 31,\n20...",82,"During 2023 and 2024, Netflix, Inc. significan...","[page_content='Table of Contents\nNETFLIX, INC..."
4,2024-cisco-full-annual-report.pdf,What does Cisco’s pro forma financial informat...,2024-cisco-full-annual-report,annual_report/2024-cisco-full-annual-report.pdf,The report is about Cisco and covers the fisca...,Pro forma Financial Information\nThe unaudited...,132,Cisco’s pro forma financial information for fi...,[page_content='Pro forma Financial Information...
5,2024-meta-full-annual-report.pdf,"What were the revenue, net income, and income ...",2024-meta-full-annual-report,annual_report/2024-meta-full-annual-report.pdf,"The report is about Meta Platforms, Inc. and c...","META PLATFORMS, INC.\nCONDENSED CONSOLIDATED S...",5,"For the fourth quarter ended December 31, 2024...","[page_content='META PLATFORMS, INC.\nCONDENSED..."


In [14]:
# Persist the generated answers inside the session folder (row index not written).
dummy_test.to_csv(f"{PROJECT_NAME}/dummy_test.csv", index=False)

### **Evaluation Process**

In [15]:
# System prompt for the LLM judge: defines the 1-5 rubric for the three scored
# categories and the JSON object the judge has to return.
# (The Clarity heading used to read "3**Clarity**"; the stray digit is removed so
# all three rubric headings share the same markup.)
prompt = """
    You are a financial data Q&A evaluator.

    You are given:
    - A **question** generated from a document chunk.
    - The **document chunk** (ground truth source).
    - A **model-generated answer** to the question.

    Your job is to score the model’s answer by carefully comparing it to the document chunk.

    Use the following rubric for each category:

    ---
    **Factual Correctness**
    - 5 = All facts are fully correct and consistent with the chunk.
    - 4 = Minor factual inaccuracies but mostly correct.
    - 3 = Some factual inaccuracies, partly correct.
    - 2 = Major factual mistakes, mostly incorrect.
    - 1 = Completely factually wrong.

    ---
    **Completeness**
    - 5 = Fully answers the question with all key details.
    - 4 = Mostly complete, missing minor details.
    - 3 = Partially complete, missing important parts.
    - 2 = Mostly incomplete, only touches on part of the question.
    - 1 = Completely incomplete.

    ---
    **Clarity**
    - 5 = Clear, precise, and easy to understand.
    - 4 = Mostly clear, with minor awkwardness.
    - 3 = Understandable but somewhat confusing or vague.
    - 2 = Hard to understand or poorly phrased.
    - 1 = Completely unclear or nonsensical.

    ---
    **Response Format**
    Return ONLY this JSON (no extra explanation):
    {
        "factual_correctness_score": [1-5],
        "completeness_score": [1-5],
        "clarity_score": [1-5],
        "comments": "A brief explanation (1-2 sentences) why you assigned these scores."
    }
"""


def evaluate_answer(question, chunk, answer):
    """Score a generated answer against its source chunk with the LLM judge.

    Args:
        question: The question that was asked.
        chunk: The document chunk used as ground truth.
        answer: The model-generated answer to evaluate.

    Returns:
        The object decoded from the judge's JSON reply (the rubric prompt asks for
        the three ``*_score`` fields plus ``comments``).

    Raises:
        json.JSONDecodeError: If no JSON value can be decoded from the reply.
    """
    response = client.chat.completions.create(
        model="sonar",
        messages=[
            {"role": "system", "content": prompt},
            {
                "role": "user",
                "content": f"""
                Please evaluate the following answer based on the provided question and document chunk. 
                Return ONLY a valid JSON object.

                Question: {question}

                Document Chunk: {chunk}

                Model Answer: {answer}
                """
            }
        ],
    )

    response_content = response.choices[0].message.content.strip()
    print("LLM Raw Output:", response_content)

    # No duplicate-key clean-up is needed: json.loads already keeps the LAST value of
    # a repeated key. The regex that used to do this dropped the opening quote of the
    # key, so every reply it matched became invalid JSON.
    try:
        return json.loads(response_content)
    except json.JSONDecodeError:
        # The reply may wrap the object in a code fence or surrounding text;
        # retry on the span between the first "{" and the last "}".
        start = response_content.find("{")
        end = response_content.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(response_content[start:end + 1])

In [16]:
import time

import pandas as pd

# Upper bound on judge calls per question. The previous `while not success` loop
# retried forever, so a persistent failure (bad API key, reply that never parses)
# hung the notebook while still issuing API calls.
MAX_EVAL_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 2  # multiplied by the attempt number between retries

# Prepare a list to collect all processed rows
final_rows = []

for _, row in combined_df.iterrows():
    question = row['question']
    chunk = row['chunk']
    answer = row['answer']

    evaluation = None
    last_error = None
    for attempt in range(1, MAX_EVAL_ATTEMPTS + 1):
        try:
            candidate = evaluate_answer(question, chunk, answer)
            # The judge must return a JSON object; anything else counts as a failure
            # so that it is retried instead of crashing on `.items()` below.
            if not isinstance(candidate, dict):
                raise ValueError(f"expected a JSON object, got {type(candidate).__name__}")
            evaluation = candidate
            break
        except Exception as e:
            last_error = e
            if attempt < MAX_EVAL_ATTEMPTS:
                print(f"Retrying for question: {question} due to error: {e}")
                time.sleep(RETRY_DELAY_SECONDS * attempt)
            else:
                print(f"Giving up on question: {question} after {attempt} attempts: {e}")

    if evaluation is None:
        # Keep the row so the failure stays visible; empty scores are coerced to NaN
        # by the numeric conversion in the analysis cells.
        evaluation = {
            'factual_correctness_score': None,
            'completeness_score': None,
            'clarity_score': None,
            'comments': f"Evaluation failed after {MAX_EVAL_ATTEMPTS} attempts: {last_error}",
        }

    # Build a combined result dictionary
    result_row = {
        'question': question,
        'chunk': chunk,
        'answer': answer
    }
    # Add evaluation results, prefixing every judge field with "evaluation_"
    for key, value in evaluation.items():
        result_row[f'evaluation_{key}'] = value

    final_rows.append(result_row)

# Convert list of results to DataFrame
final_df = pd.DataFrame(final_rows)

# Save to CSV
final_df.to_csv('final.csv', index=False)
print("Saved final results to final.csv")


LLM Raw Output: {
  "factual_correctness_score": 5,
  "completeness_score": 5,
  "clarity_score": 5,
  "comments": "The model answer correctly and comprehensively lists all key executive officers and their roles for fiscal year 2024, accurately reflecting details such as ages, positions, tenure, and prior roles exactly as provided in the document chunk. It is clearly written and logically structured."
}
LLM Raw Output: {
  "factual_correctness_score": 5,
  "completeness_score": 5,
  "clarity_score": 5,
  "comments": "The answer accurately and fully reflects the conditions and implications of Apple's legal defeasance and covenant defeasance options as described in the document chunk, including the requirement of irrevocable deposits, the effect of defeasance on obligations, the issuance of required counsel opinions, and the specific definitions of 'U.S. government obligations' for various Notes series. It is well-structured and clear without factual errors."
}
LLM Raw Output: {
  "factu

In [17]:
# Preview the first rows of the evaluation results.
final_df.head()

Unnamed: 0,question,chunk,answer,evaluation_factual_correctness_score,evaluation_completeness_score,evaluation_clarity_score,evaluation_comments
0,"What are the key details about Amazon.com, Inc...",Available Information\nOur investor relations ...,"Key executive officers of Amazon.com, Inc. for...",5,5,5,The model answer correctly and comprehensively...
1,"Based on the 2024 Apple Inc. report, what are ...","Discharge, Defeasance and Covenant Defeasance\...",Apple’s legal defeasance and covenant defeasan...,5,5,5,The answer accurately and fully reflects the c...
2,Based on Alphabet Inc.'s 2024 Form 10-K annual...,and app integration. Gemini for Google Workspa...,Gemini technology is integrated within Google ...,5,4,5,The answer accurately describes Gemini's integ...
3,"Based on Netflix, Inc.'s fiscal year ended Dec...","Table of Contents\nYear Ended December 31,\n20...","During 2023 and 2024, Netflix, Inc. significan...",2,2,4,The answer contains significant factual inaccu...
4,What does Cisco’s pro forma financial informat...,Pro forma Financial Information\nThe unaudited...,Cisco’s pro forma financial information for fi...,4,4,5,The answer accurately summarizes the pro forma...


In [18]:
import plotly.express as px
import pandas as pd

# Score columns written by the evaluation step
numeric_cols = [
    'evaluation_factual_correctness_score',
    'evaluation_completeness_score',
    'evaluation_clarity_score',
]

# Copy of the results with the score columns forced to numeric
# (values that cannot be parsed become NaN)
df = final_df.assign(
    **{col: pd.to_numeric(final_df[col], errors='coerce') for col in numeric_cols}
)

# Overall score = row-wise mean of the three metric scores
df = df.assign(overall_score=df[numeric_cols].mean(axis=1))

# Long format: one row per (question, metric), overall score included
melted_df = pd.melt(
    df,
    id_vars=['question'],
    value_vars=[*numeric_cols, 'overall_score'],
    var_name='Metric',
    value_name='Score',
)


def _display_name(column_name):
    """Turn a score column name into a readable label for the plot axis."""
    label = column_name.replace('evaluation_', '').replace('_score', '')
    return label.replace('_', ' ').title()


# Clean up metric names for display
melted_df['Metric'] = melted_df['Metric'].map(_display_name)

# Keep only the rows that actually carry a score
melted_df = melted_df[melted_df['Score'].notna()]

# Box plot per metric, with every individual point drawn as well
fig = px.box(
    melted_df,
    x='Metric',
    y='Score',
    points='all',
    hover_data=['question'],
    title='Score Distributions per Metric (with Overall Score)',
    height=500,
)

fig.update_layout(
    yaxis={'range': [0, 6], 'dtick': 1},
    xaxis_title='Metric',
    yaxis_title='Score (1-5)',
)

fig.show()

In [19]:
import plotly.figure_factory as ff

# Columns that take part in the correlation analysis
metrics = [
    'evaluation_factual_correctness_score',
    'evaluation_completeness_score',
    'evaluation_clarity_score',
    'overall_score',
]

# Pairwise correlation between the metrics, rounded to two decimals
corr_matrix = df[metrics].corr().round(2)
corr_values = corr_matrix.values

# Annotated heatmap: each cell shows its own correlation value
fig = ff.create_annotated_heatmap(
    z=corr_values,
    x=list(corr_matrix.columns),
    y=list(corr_matrix.index),
    annotation_text=corr_values,
    colorscale='Blues',
    showscale=True,
)

fig.update_layout(
    title='Correlation Heatmap of Evaluation Metrics',
    width=600,
    height=600,
)

fig.show()


In [20]:
# Report the row with the lowest overall score, together with the judge's comments
overall_scores = df['overall_score']
worst_answer = df.loc[overall_scores.idxmin()]
lowest_score = overall_scores.min()

print(f"Worst Answer by Overall Score: {lowest_score}")
print(f"Question: {worst_answer['question']}")
print(f"Reasons: {worst_answer['evaluation_comments']}")

Worst Answer by Overall Score: 2.6666666666666665
Question: Based on Netflix, Inc.'s fiscal year ended December 31, 2024, how did the company's stock repurchase authorizations and activities evolve during 2023 and 2024, and what is the remaining authorization amount as of December 31, 2024?
Reasons: The answer contains significant factual inaccuracies and unsupported details compared to the document chunk. The official document states that in September 2023, Netflix authorized $10 billion for repurchases and in December 2024 increased authorization by an additional $15 billion, with $17.1 billion remaining as of December 31, 2024, not $15 billion total or approximately $1.8 billion remaining. The answer incorrectly references a $5 billion authorization in 2021 and uses the $15 billion figure inaccurately. The repurchase amount in 2024 stated in the chunk is $6.211 billion, not $13.2 billion. Thus, the model answer’s figures and timeline are mostly incorrect. It partially responds to th