In [13]:
import fitz  # PyMuPDF
from transformers import pipeline
import json

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join([page.get_text("text") for page in doc])
        return text.strip() if text else None
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return None

# Load the QA model
def load_model():
    try:
        return pipeline("question-answering", model="deepset/roberta-base-squad2")
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Function to extract financial data using the model
def extract_financial_data(model, text):
    if not model:
        print("Model is not loaded. Cannot extract financial data.")
        return {}

    questions = [
        "What is the company name?",
        "What is the report date?",
        "What is the profit before tax?"
    ]

    results = {}
    for question in questions:
        try:
            answer = model(question=question, context=text)
            results[question] = answer.get('answer', 'Not found')
        except Exception as e:
            print(f"Error extracting answer for '{question}': {e}")
            results[question] = "Error extracting data"

    return results

# Function to save results to a JSON file
def save_to_json(data, output_path):
    try:
        with open(output_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4)
        print(f"Results saved to {output_path}")
    except Exception as e:
        print(f"Error saving JSON file: {e}")

# Main function
def main(pdf_path, output_path):
    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)
    if not text:
        print("No text extracted from PDF. Exiting.")
        return

    # Load the model
    model = load_model()
    if not model:
        print("Failed to load model. Exiting.")
        return

    # Extract financial data
    financial_data = extract_financial_data(model, text)

    # Save results to JSON file
    save_to_json(financial_data, output_path)

    # Print results for reference
    print(json.dumps(financial_data, indent=4))

# Example usage
if __name__ == "__main__":
    pdf_path = "/content/1_FinancialResults_05022025142214.pdf"
    output_path = "/content/financial_data.json"
    main(pdf_path, output_path)


Device set to use cpu


Results saved to /content/financial_data.json
{
    "What is the company name?": "Eveready Industries India Ltd",
    "What is the report date?": "February 5, 2025",
    "What is the profit before tax?": "\n15.93"
}


In [1]:
!pip install PyMuPDF transformers

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
from google.colab import userdata
api_key = userdata.get('YOUR_HUGGING_FACE_API_TOKEN')