In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install sentence-transformers
!pip install faiss-gpu
!pip install --upgrade transformers
!pip install pymupdf
!pip install pypdf
!pip install PyPDF2
!pip install arxiv boto3 requests
!pip install langchain beautifulsoup4 langchain-community langchain_openai langchain_nvidia_ai_endpoints

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading to

In [5]:
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
import torch
from langchain.vectorstores import FAISS
import faiss
import arxiv
import boto3
import requests
import os

from botocore.exceptions import ClientError
from google.colab import userdata
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print("Device:", device)
if device == 'cuda':
    print(torch.cuda.get_device_name(0))

Device: cuda
Tesla T4


## Setting Up environment.

In [6]:
AWS_ACCESS_KEY =  userdata.get('AWS_ACCESS_KEY')
AWS_SECRET_KEY = userdata.get('AWS_SECRET_KEY')

# print(AWS_ACCESS_KEY)

In [7]:
aws_region = 'us-west-2'

s3 = boto3.client('s3',region_name='us-east-2', aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY )
dynamodb = boto3.resource('dynamodb',region_name='us-east-2', aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY)
table = dynamodb.Table('paper_store')

## Fetching Required Paper from Dynamodb

In [8]:
# input an index in fetch_and_download_pdfs to download a pdf of the research paper.
download_directory = '/content/drive/MyDrive'
os.makedirs(download_directory, exist_ok=True)


def generate_presigned_url(bucket_name, key, expiration=3600):

    try:
        url = s3.generate_presigned_url(
            'get_object',
            Params={'Bucket': bucket_name, 'Key': key},
            ExpiresIn=expiration
        )
        return url
    except ClientError as e:
        print(f"Error generating presigned URL: {e}")
        return None

def fetch_and_download_pdfs(table_name,idx):

   try:
        response = table.scan()
        items = response.get('Items', [])

        print(f"Fetched required documents from table '{table_name}'. Starting download...\n")

        # for item in items:
        item=items[idx]
        document_id = item.get('DocumentID')
        s3_url = item.get('S3URL')
        metadata = item.get('Metadata', {})

        # Parse bucket name and key from the S3 URL
        if not s3_url:
            print(f"Skipping item {document_id}: No S3URL found.")


        try:

            bucket_name, key = s3_url.split('.s3.')[0].split('//')[1], '/'.join(s3_url.split('/')[3:])


            presigned_url = generate_presigned_url(bucket_name, key)
            if not presigned_url:
                print(f"Failed to generate presigned URL for {document_id}. Skipping.")


            # print(f"Downloading {presigned_url} for DocumentID: {document_id}...")
            response = requests.get(presigned_url)
            response.raise_for_status()

            file_name = metadata.get('FileName', f"{document_id}.pdf")
            local_path = os.path.join(download_directory, file_name)
            with open(local_path, 'wb') as file:
                file.write(response.content)

            print(f"Downloaded and saved: {local_path}")
            return local_path
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {s3_url}: {e}")
        # break                    # remove this break to run in loops.
        print("\nDownloaded.")

   except ClientError as e:
        print(f"Error fetching items from table: {e.response['Error']['Message']}")

index=input("Enter the index of the paper you want to access")
index=int(index)
pdf_path=fetch_and_download_pdfs("paper_store",index)

Enter the index of the paper you want to access59
Fetched required documents from table 'paper_store'. Starting download...

Downloaded and saved: /content/drive/MyDrive/2402.11194v2.Evaluating_LLMs__Mathematical_Reasoning_in_Financial_Document_Question_Answering.pdf


In [9]:
pdf_path

'/content/drive/MyDrive/2402.11194v2.Evaluating_LLMs__Mathematical_Reasoning_in_Financial_Document_Question_Answering.pdf'

## Image Caption + Text Baseline

In [10]:
from PyPDF2 import PdfReader
import fitz
from PIL import Image
from dotenv import load_dotenv
import io
import base64

def extract_images_and_text(pdf_path):
    doc = fitz.open(pdf_path)
    images_and_text = []

    for page in doc:
        text = page.get_text()
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            image = Image.open(io.BytesIO(image_bytes))
            image=image.resize((200,200))
            buffered = io.BytesIO()
            image.save(buffered, format="jpeg")

            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

            words = text.split()
            text_before_image = " ".join(words[:100]) if len(words) > 100 else text

            images_and_text.append({
                "base64_image": img_base64,
                "text": text_before_image
            })

    return images_and_text

# Usage
# pdf_path = "/content/drive/MyDrive/eedp.pdf"
images = extract_images_and_text(pdf_path)
print(images)


[{'base64_image': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCADIAMgDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD26o5LiGEqJZo4y3QO4GfpmpKyNb02G+utKklsorgw3WSzxBti+W/r0Gdv44oA1wQQCCCD0IorgtPj8R2NxpNor3htF8rcZEJ5Kxb1OBgKB5mM4A55JAFOvtQ1RtbvLeK51PYrSGRbaMMEALCLy8A9T

In [2]:
if len(images)>0:

  print(images[0]['text'])

NameError: name 'images' is not defined

In [32]:
import requests, base64,os
os.environ['NVIDIA_API_KEY']=userdata.get('NVIDIA_API_KEY')

def caption(image):
    invoke_url = "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-90b-vision-instruct/chat/completions"
    stream = True

    # with open("image.png", "rb") as f:
      # image_b64 = base64.b64encode(f.read()).decode()

    assert len(image) < 180_000, \
      "To upload larger images, use the assets API (see docs)"


    headers = {
      "Authorization": f"Bearer {os.environ['NVIDIA_API_KEY']}",
      "Accept": "text/event-stream" if stream else "application/json"
    }

    payload = {
      "model": 'meta/llama-3.2-90b-vision-instruct',
      "messages": [
        {
          "role": "user",
          "content": f'What is in this image? <img src="data:image/png;base64,{image}" />'
        }
      ],
      "max_tokens": 1024,
      "temperature": 1.00,
      "top_p": 1.00,
      "stream": stream
    }

    response = requests.post(invoke_url, headers=headers, json=payload)

    return response


In [33]:
import json
def output_caption(response):
  output = ""
  for line in response.iter_lines():
      if line:
          decoded_line = line.decode("utf-8")
          if decoded_line.startswith("data: "):
              try:
                  json_data = json.loads(decoded_line[len("data: "):])
                  if 'choices' in json_data:
                      data = json_data['choices']
                      if 'delta' in data[0] and 'content' in data[0]['delta']:
                          content = data[0]['delta']['content']
                          output += content
              except json.JSONDecodeError as e:
                  pass

  return output

In [34]:
for i in images:
  response=caption(i["base64_image"])
  while(response.status_code==500):
    response=caption(i["base64_image"])
  response=output_caption(response)
  i['text']+=response

In [35]:
if len(images)>0:

  print(images[2]['text'])

(a) Hierarchical Complexity (b) Empty Cells Figure 7: Sample distribution of Multihiertt Dataset partitioned by (a) hierarchical complexity of the gold evidence. (b) the percentage of empty cells in the supporting table. Specifically, for models with a context length exceeding the input length, we standardized k to 4. For instance, we allocated 2 shots for models like LLaMA and MammoTH due to their constrained context length. However, for other models capable of accommodating larger contexts, we increased the number of shots to 4. Additionally, we used a tempera- ture of 0 and topp = 1 for our experiments. 2. ModelsThe image displays a pie chart with six sections, each representing a different percentage. The chart is divided into six equal parts, with each section labeled with a number from 1 to 6 in the key on the right side of the chart. The percentages are represented by different colors, ranging from dark orange (29.795%) to light blue (7.736%). The background of the chart is whit

In [36]:
# pdf_path = "/content/drive/MyDrive/eedp.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load_and_split()

In [37]:
documents

[Document(metadata={'source': '/content/drive/MyDrive/eedp.pdf', 'page': 0}, page_content='Evaluating LLMs’ Mathematical Reasoning in Financial Document\nQuestion Answering\nPragya Srivastava#∗, Manuj Malik ‡, Vivek Gupta §†, Tanuja Ganu #, Dan Roth §\n#Microsoft Research, ‡Singapore Management University, §University of Pennsylvania\n{t-pragyasri, taganu}@microsoft.com, manujm@smu.edu.sg, {gvivek,danroth}@seas.upenn.edu\nAbstract\nLarge Language Models (LLMs), excel in\nnatural language understanding, but their ca-\npability for complex mathematical reasoning\nwith a hybrid of structured tables and unstruc-\ntured text remain uncertain. This study ex-\nplores LLMs’ mathematical reasoning on four\nfinancial tabular question-answering datasets:\nTATQA, FinQA, ConvFinQA, and Multihiertt.\nThrough extensive experiments with various\nmodels and prompting techniques, we assess\nhow LLMs adapt to complex tables and math-\nematical tasks. We focus on sensitivity to\ntable complexity and perfo

In [38]:
import re
from langchain.schema import Document

all_page_content = "".join([doc.page_content for doc in documents])
all_page_content += "\n".join([doc['text'] for doc in images])

math_expressions = re.findall(r"\$.*?\$", all_page_content)
all_page_content+="\n equations: ".join(math_expressions)
documents = [Document(page_content=all_page_content)]

In [39]:
documents

[Document(metadata={}, page_content='Evaluating LLMs’ Mathematical Reasoning in Financial Document\nQuestion Answering\nPragya Srivastava#∗, Manuj Malik ‡, Vivek Gupta §†, Tanuja Ganu #, Dan Roth §\n#Microsoft Research, ‡Singapore Management University, §University of Pennsylvania\n{t-pragyasri, taganu}@microsoft.com, manujm@smu.edu.sg, {gvivek,danroth}@seas.upenn.edu\nAbstract\nLarge Language Models (LLMs), excel in\nnatural language understanding, but their ca-\npability for complex mathematical reasoning\nwith a hybrid of structured tables and unstruc-\ntured text remain uncertain. This study ex-\nplores LLMs’ mathematical reasoning on four\nfinancial tabular question-answering datasets:\nTATQA, FinQA, ConvFinQA, and Multihiertt.\nThrough extensive experiments with various\nmodels and prompting techniques, we assess\nhow LLMs adapt to complex tables and math-\nematical tasks. We focus on sensitivity to\ntable complexity and performance variations\nwith an increasing number of arithm

## Chunking

In [40]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=20000, chunk_overlap=5000)
chunked_docs = text_splitter.split_documents(documents)

In [41]:
for i, doc in enumerate(chunked_docs):
    print(f"Chunked doc {i+1} length: {len(doc.page_content)}")


Chunked doc 1 length: 19995
Chunked doc 2 length: 19996
Chunked doc 3 length: 19949
Chunked doc 4 length: 19879
Chunked doc 5 length: 17722
Chunked doc 6 length: 6293


In [42]:
# print(chunked_docs[0].page_content)

## Embeddings

In [43]:
os.environ['HF_KEY']=userdata.get('HF_TOKEN')
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
faiss_db = FAISS.from_documents(chunked_docs, embeddings)



## Retriever


In [44]:
retriever = faiss_db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 5}
)


## LLM

In [45]:
import getpass
import os
from langchain_openai import ChatOpenAI
from langchain_nvidia_ai_endpoints import ChatNVIDIA

# os.environ["OPENAI_API_KEY"] = getpass.getpass()
# model=ChatOpenAI(model_name="gpt-3.5-turbo")

llm = ChatNVIDIA(
  model="meta/llama-3.1-405b-instruct",

  temperature=0.2,
  top_p=0.7,
  max_tokens=1024,
)

## Retriever + LLM Chain

In [46]:
text_chain = ConversationalRetrievalChain.from_llm(llm, retriever,return_source_documents=True)



## Helper Functions

In [47]:
def image_upload():
  answer = input("Do you want to use an image? (yes/no): ")
  if answer.lower() == "no":
    return None
  uploaded = files.upload()
  for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))
  return fn

In [48]:
import base64

from PIL import Image
import io
def image2base64(image_path):
  img = Image.open(image_path)
  image = img.resize((200, 200))
  buffered = io.BytesIO()
  image.save(buffered, format="PNG")
  img_bytes = buffered.getvalue()
  img_base64 = base64.b64encode(img_bytes).decode('utf-8')
  return img_base64

In [49]:
def ipimage2caption(img_base64):
  response=caption(img_base64)
  while(response.status_code==500):
    response=caption(img_base64)
  response=output_caption(response)
  return response

## RAG


In [57]:
import sys
from google.colab import files
chat_history = []
def get_user_input():
    return input('Prompt: ').lower()

def main(chat_history):
    fn=image_upload()
    # print("Input the prompt")
    query =  get_user_input()
    if(fn!=None):
      image_path=fn
      img_base64=image2base64(image_path)
      response=ipimage2caption(img_base64)
      print("response")
      response="Caption:"+response
      query+='\n'+response
      print(query)

    result = text_chain.invoke({'question': query, 'chat_history': chat_history})
    print(f'Answer: {result["answer"]}\n')
    chat_history.append((query, result['answer']))

if __name__ == "__main__":
    main(chat_history)

Do you want to use an image? (yes/no): yes


Saving image.png to image.png
User uploaded file "image.png" with length 62491 bytes
Prompt: Describe the image
response
describe the image
Caption:**Image Description:**

This image presents a statistical representation of two distinct sets of data, each comprising a collection of numbers. The data is visually organized into two pie charts, which are accompanied by a table titled **"Figure 4: Sample distribution of Multilabel & FnQA datasets portioned by number of rows in the supporting table."**

**Pie Chart 1 (Left):**

The first pie chart, positioned on the left side of the image, is divided into four sections, each representing a different dataset:

*   **57.96%**: This section is colored orange and accounts for the largest proportion of the data.
*   **25.12%**: This section is colored green and represents a significant portion of the data.
*   **16.87%**: This section is colored pink and constitutes a smaller but still notable part of the data.
*   **0.04%**: This section is col