In [4]:
# Install necessary libraries
!pip install camelot-py[cv] pandas sentence-transformers faiss-cpu pymupdf




In [5]:
from google.colab import files
import os

# Upload PDF files
uploaded = files.upload()

# Create an input directory and save the uploaded files there
input_dir = 'input_pdfs'
os.makedirs(input_dir, exist_ok=True)

for filename in uploaded.keys():
    with open(os.path.join(input_dir, filename), 'wb') as f:
        f.write(uploaded[filename])


Saving Sample Financial Statement.pdf to Sample Financial Statement (1).pdf


In [6]:
import camelot
import pandas as pd
import fitz  # PyMuPDF
import re

def preprocess_pdf(pdf_path):
    """Preprocess the PDF by removing images."""
    doc = fitz.open(pdf_path)
    for page in doc:
        for image in page.get_images():
            page.delete_image(image)
    doc.save("preprocessed.pdf")

def extract_pnl_data(pdf_path):
    """Extracts tables from a PDF file and structures it into a DataFrame."""
    try:
        # Pre-process the PDF to remove images
        preprocess_pdf(pdf_path)
        preprocessed_path = "preprocessed.pdf"
        tables = camelot.read_pdf(preprocessed_path, flavor='stream', pages='all')

        all_data = []

        for table in tables:
            for row in table.df.values:
                text = ' '.join(str(cell) for cell in row)  # Concatenate cells

                # Use regex to match specific patterns:
                match = re.search(r'(.*?)\s+(\d[\d,.-]*)\s+(\d[\d,.-]*)\s+(\d[\d,.-]*)\s+(\d[\d,.-]*)', text)

                if match:
                    label = match.group(1).strip()
                    values = [match.group(i).strip().replace(',', '') for i in range(2, 6)]
                    all_data.append([label] + values)

        df = pd.DataFrame(all_data, columns=['Item'] + [f'Column_{i}' for i in range(4)])
        return df  # Return the extracted DataFrame

    except Exception as e:
        print(f"Error during extraction: {e}")
        return None  # Return None in case of error


In [7]:

pdf_path = "Sample Financial Statement.pdf"
df = extract_pnl_data(pdf_path)

if df is not None:
    print(df.head())
else:
    print("Extraction failed.")


                        Item Column_0 Column_1 Column_2 Column_3
0                                2024     2023     2024     2023
1    Revenue from operations     2.16    37923    37441   153670
2          Other income, net     2.17     2729      671     4711
3               Total income    40652    38112   158381   149468
4  Employee benefit expenses     2.18    20393    20311    82620


In [8]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")

def create_faiss_index(data):
    """Creates a FAISS index for the given data."""
    embeddings = model.encode(data, show_progress_bar=True)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))
    return index, embeddings

def query_faiss(index, query, data):
    """Queries the FAISS index and returns the closest match."""
    query_embedding = model.encode([query])
    _, indices = index.search(query_embedding, 1)
    return data[indices[0][0]]

# Example usage with extracted data
data_list = df.apply(lambda x: " ".join(x.dropna()), axis=1).tolist()
index, embeddings = create_faiss_index(data_list)

question = "What is the gross profit for Q3 2024?"
print(query_faiss(index, question, data_list))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Gross profit 11175 11430 46257 44414
