In [1]:
import os
from openai import OpenAI
from pdf2image import convert_from_path
from dotenv import load_dotenv
import base64
import json
import ssl
import certifi
from pymongo import MongoClient

In [2]:
# Load settings from a local .env file (python-dotenv) before reading the key,
# so the API key never has to be written into the notebook itself.
load_dotenv()

# Shared OpenAI client used by every cell below
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [5]:
def pdf_to_images(pdf_path, dpi=200, output_folder="temp_images"):
    """Render each page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF handed to ``convert_from_path``.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
        output_folder: Directory that receives the page images; it is created
            (including missing parents) when it does not exist yet.

    Returns:
        A list of the written image paths in page order, named
        ``page_1.png``, ``page_2.png``, ... inside ``output_folder``.

    Note:
        File names depend only on the page number, so a later call that uses
        the same ``output_folder`` overwrites the pages of an earlier call.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(output_folder, exist_ok=True)

    # One image object per PDF page, in page order
    images = convert_from_path(pdf_path, dpi=dpi)

    image_files = []
    for page_number, image in enumerate(images, start=1):
        image_path = os.path.join(output_folder, f"page_{page_number}.png")
        image.save(image_path, "PNG")
        image_files.append(image_path)
    return image_files
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def _request_page_extraction(base64_image):
    """Send one base64-encoded page image to the chat model and return the reply text."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": """You are a tax expert. You will be provided with a document image, and your task is to extract all the text from it. 
                        Please don't add any additional information. Also only extract information from documents which are in the form of tax documents/bank statments etc instead of just plain text.
                        Also I want you to process the output in the form of a json schema with as many fields as possible with values. 
                        There is no defined schema you need to extract as much info as you can in a json schema."""
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract all the info from this and give me back a json and not a string"},
                    {
                        "type": "image_url",
                        "image_url": {
                            # pdf_to_images saves pages as PNG, so the data URL must say PNG
                            "url": f"data:image/png;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        max_tokens=1000
    )
    return response.choices[0].message.content


def extract_text_from_pdf(pdf_path):
    """Extract the content of every page of a PDF through the chat model.

    The PDF is rendered to temporary page images with ``pdf_to_images``; each
    image is base64-encoded with ``encode_image`` and sent to the model. The
    temporary images are deleted afterwards, even when encoding fails.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        The model replies of all successfully processed pages, in page order,
        joined by a blank line. Pages whose request failed or returned no text
        are reported with ``print`` and left out, so the result may be ``""``.

    Raises:
        Whatever ``pdf_to_images`` or ``encode_image`` raise; errors from the
        model request itself are caught and printed per page.
    """
    images_array = pdf_to_images(pdf_path)
    responses = []

    try:
        for image_path in images_array:
            base64_image = encode_image(image_path)

            try:
                content = _request_page_extraction(base64_image)
            except Exception as e:
                # Best effort: one failing page must not abort the other pages
                print(f"Error processing image {image_path}: {str(e)}")
                continue

            if content is None:
                # A reply without text would make the final join raise TypeError
                print(f"Error processing image {image_path}: model returned no text content")
                continue

            responses.append(content)
    finally:
        # Clean up temporary image files, also when an error propagates
        for image_path in images_array:
            try:
                os.remove(image_path)
            except Exception as e:
                print(f"Error removing temporary file {image_path}: {str(e)}")

    # Join text from all pages
    return "\n\n".join(responses)


In [6]:
# Accumulator dict; the extraction cell merges parsed results into it with update()
combined_json = dict()

In [9]:
def _merge_fenced_json(reply_text):
    """Parse every JSON object in a model reply and merge them into one dict.

    The reply may hold one Markdown-fenced block per page (optionally tagged
    ``json``), several of them separated by blank lines, or bare JSON with no
    fence at all. Later objects overwrite equal keys of earlier ones.

    Raises:
        json.JSONDecodeError: when a block is not valid JSON, for example
            because the model reply was cut off.
    """
    pieces = reply_text.split("```")
    # Text enclosed by a fence pair lands at the odd indices; a reply without
    # any fence is treated as one bare JSON document.
    candidates = pieces[1::2] if len(pieces) > 1 else [reply_text]

    merged = {}
    for candidate in candidates:
        body = candidate.strip()
        if body[:4].lower() == "json":
            body = body[4:]  # drop the language tag that follows the opening fence
        if body.strip():
            merged.update(json.loads(body))
    return merged


# NOTE(review): absolute, machine-specific path; point this at your own copy of the PDF.
pdf_path = "/Users/abhyudaygoyal/Desktop/HACKLYTICS/taxerino/backend/extraction/bank_statement.pdf"
extracted_text = extract_text_from_pdf(pdf_path)

# A fixed [7:-3] slice only works for exactly one json-tagged fenced block; parse
# by fence instead so multi-page PDFs and unfenced replies are handled too.
data = _merge_fenced_json(extracted_text)
if not data:
    print(f"No JSON could be extracted from {pdf_path}")
combined_json.update(data)

In [10]:
# Display everything merged into combined_json so far (bare last expression -> cell output)
combined_json

{'employeeSocialSecurityNumber': 'XXX-XX-8758',
 'employerIdentificationNumber': '52-6002033',
 'employer': {'name': 'State of Maryland Central Payroll Bureau',
  'address': 'P.O. Box 2396',
  'city': 'Annapolis',
  'state': 'MD',
  'zipCode': '21404-2396',
  'S.S.#': '69-0520001L'},
 'controlNumber': 'MD126640',
 'employee': {'name': 'Ritesh R Thipparthi',
  'address': '4243 Valley Drive',
  'apartmentNumber': '2208',
  'city': 'College Park',
  'state': 'MD',
  'zipCode': '20742'},
 'wagesTipsOtherCompensation': 6232.85,
 'federalIncomeTaxWithheld': 28.89,
 'state': {'state': 'MD',
  'employerStateIDNumber': '52-6002033',
  'stateWagesTipsEtc': 6232.85,
  'stateIncomeTax': 275.94},
 'formType': 'W-2 Wage and Tax Statement',
 'year': 2024,
 'note': 'REISSUED STATEMENT',
 'bank': 'Commerce Bank',
 'address': '1000 Walnut, Kansas City MO 64106-3686',
 'member_fdic': True,
 'customer_name': 'Jane Customer',
 'customer_address': '1234 Anywhere Dr., Small Town, MO 12345-6789',
 'primary_ac

In [12]:
def extract_insights(json_data):
    """Ask the chat model for visualization-ready lists derived from finance/tax data.

    Args:
        json_data: The data to analyse. A ``str`` is sent unchanged; any other
            value is serialized with ``json.dumps`` (values JSON cannot
            represent fall back to their ``str`` form).

    Returns:
        The text content of the model's first choice.
    """
    # Use the caller's argument rather than the notebook-global combined_json,
    # and send real JSON, which is what the system prompt announces.
    if isinstance(json_data, str):
        payload = json_data
    else:
        payload = json.dumps(json_data, default=str)

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": """You are a helpful finance/tax assistant who recieves json formatted data.
         The data will be related to finance/tax and your job is to provide me with the data that I can use to perfrom some nice visualizations
         give me the result in the form of 2 lists on which I can peform some data analysis/viz"""},
            {
                "role": "user",
                "content": payload
            }
        ]
    )
    return completion.choices[0].message.content
    
# Request insights for everything collected so far and print the raw reply text
insights_text = extract_insights(combined_json)
print(insights_text)

```json
{
    "year": 2024,
    "employee": {
        "name": "Ritesh R Thipparthi",
        "address": "4243 Valley Drive",
        "apartmentNumber": "2208",
        "city": "College Park",
        "state": "MD",
        "zipCode": "20742"
    },
    "employer": {
        "name": "State of Maryland Central Payroll Bureau",
        "city": "Annapolis",
        "state": "MD",
        "zipCode": "21404-2396"
    },
    "income_and_tax_details": {
        "wagesTipsOtherCompensation": 6232.85,
        "federalIncomeTaxWithheld": 28.89,
        "stateIncome": {
            "stateWagesTipsEtc": 6232.85,
            "stateIncomeTax": 275.94
        }
    },
    "bank_statement": {
        "statement_date": "June 5, 2003",
        "beginning_balance": 7126.11,
        "ending_balance": 10521.19,
        "transactions": [
            {
                "type": "deposit",
                "date": "05-15",
                "amount": 3615.08
            },
            {
                "type": "ATM

In [None]:
# NOTE(review): absolute, machine-specific path; adjust to your own uploads directory.
upload_folder = '/Users/abhyudaygoyal/Desktop/HACKLYTICS/taxerino/backend/uploads'
all_data = {}

# Match the extension case-insensitively so files like "scan.PDF" are not skipped,
# and sort because os.listdir returns entries in arbitrary order.
pdf_files = sorted(f for f in os.listdir(upload_folder) if f.lower().endswith('.pdf'))
print(pdf_files)