In [25]:
import os
from openai import OpenAI
from pdf2image import convert_from_path
from dotenv import load_dotenv
import base64
import json
import ssl
import certifi
from pymongo import MongoClient

In [26]:
# Populate os.environ via python-dotenv before the API key is looked up.
load_dotenv()

# Shared OpenAI client used by every cell below; the key comes from OPENAI_API_KEY.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [27]:
def pdf_to_images(pdf_path, dpi=200, output_folder="temp_images"):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF to rasterize.
        dpi: Resolution forwarded to ``convert_from_path``.
        output_folder: Directory that receives the ``page_<n>.png`` files.
            It is created (including missing parents) when absent.

    Returns:
        list[str]: Paths of the written PNG files in page order, starting
        with ``page_1.png``.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the folder in between.
    os.makedirs(output_folder, exist_ok=True)
    # Convert PDF pages to a list of PIL Image objects
    images = convert_from_path(pdf_path, dpi=dpi)
    image_files = []
    for page_number, image in enumerate(images, start=1):
        image_path = os.path.join(output_folder, f"page_{page_number}.png")
        image.save(image_path, "PNG")
        image_files.append(image_path)
    return image_files
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode and the base64 bytes are decoded as UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def _transcribe_page_images(image_paths):
    """Send each page image to the vision model and collect its text replies.

    A page whose API request fails, or whose reply has no content, is
    reported with ``print`` and skipped, so one bad page does not abort the
    whole document. Errors raised while reading/encoding a page propagate.
    """
    responses = []

    # Process each image
    for image_path in image_paths:
        # Encode the image
        base64_image = encode_image(image_path)

        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "system",
                        "content": """You are a tax expert. You will be provided with a document image, and your task is to extract all the text from it. 
                        Please don't add any additional information. Also only extract information from documents which are in the form of tax documents/bank statments etc instead of just plain text.
                        Also I want you to process the output in the form of a json schema with as many fields as possible with values. 
                        There is no defined schema you need to extract as much info as you can in a json schema."""
                    },
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": "Extract all the info from this and give me back a json and not a string and be precise with your numbers"},
                            {
                                "type": "image_url",
                                "image_url": {
                                    # pdf_to_images writes PNG files, so the data URL
                                    # must declare image/png rather than image/jpeg.
                                    "url": f"data:image/png;base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=1000
            )

            # A reply without content cannot be joined into the final string.
            page_text = response.choices[0].message.content
            if page_text is None:
                print(f"No text returned for image {image_path}")
            else:
                responses.append(page_text)
        except Exception as e:
            print(f"Error processing image {image_path}: {str(e)}")
    return responses


def _remove_temp_files(image_paths):
    """Delete the temporary page images, reporting (not raising) failures."""
    for image_path in image_paths:
        try:
            os.remove(image_path)
        except OSError as e:
            print(f"Error removing temporary file {image_path}: {str(e)}")


def extract_text_from_pdf(pdf_path):
    """Extract the content of every page of a PDF through the vision model.

    The PDF is rasterized with ``pdf_to_images``, each page image is sent to
    the chat-completions API, and the temporary page images are deleted
    afterwards.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        str: The model replies for the successfully processed pages, joined
        with a blank line (``"\\n\\n"``). Empty when no page succeeded.
    """
    # Convert PDF to images
    images_array = pdf_to_images(pdf_path)
    try:
        responses = _transcribe_page_images(images_array)
    finally:
        # Clean up temporary image files even when processing raised or was
        # interrupted, so page images do not pile up on disk.
        _remove_temp_files(images_array)

    # Join text from all pages
    return "\n\n".join(responses)


In [28]:
# Accumulator that later cells fill with the fields extracted from each document.
combined_json = dict()

In [30]:
def _parse_model_json(raw_text):
    """Merge every JSON object found in the model's reply into one dict.

    The reply may wrap each page's JSON in a Markdown code fence (optionally
    tagged ``json``), contain several fenced blocks (one per page, separated
    by blank lines), or be bare JSON. Text between fences that is not a JSON
    object is ignored.

    Raises:
        ValueError: If no JSON object could be parsed from ``raw_text``.
    """
    merged = {}
    parsed_any = False
    for segment in raw_text.split("```"):
        candidate = segment.strip()
        # Drop the optional language tag that follows an opening fence.
        if candidate[:4].lower() == "json":
            candidate = candidate[4:].strip()
        if not candidate:
            continue
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            # Report blocks that look like JSON but are malformed or truncated.
            if candidate.startswith("{"):
                print("Skipping a JSON block that could not be parsed")
            continue
        if isinstance(parsed, dict):
            merged.update(parsed)
            parsed_any = True
    if not parsed_any:
        raise ValueError("No JSON object found in the extracted text")
    return merged


# NOTE(review): absolute, machine-specific path; consider a configurable base directory.
pdf_path = "/Users/abhyudaygoyal/Desktop/HACKLYTICS/taxerino/backend/extraction/Bank Statement Example Final.pdf"
extracted_text = extract_text_from_pdf(pdf_path)
# Parse by fence instead of slicing fixed offsets, which only worked for a
# single fenced block and failed on multi-page documents.
data = _parse_model_json(extracted_text)
combined_json.update(data)

In [31]:
# Display the accumulated extraction results as the cell output.
combined_json

{'employee_social_security_number': 'XXX-XX-8758',
 'employer_identification_number': '52-6002033',
 'employer_name_address': {'name': 'STATE OF MARYLAND CENTRAL PAYROLL BUREAU',
  'address': 'P.O. BOX 2396',
  'city_state_zip': 'ANNAPOLIS, MD 21404-2396',
  'state_number': 'S.S.# 69-0520001L'},
 'control_number': 'MD126640',
 'employee_name_address': {'name': 'RITESH R THIPPARTHI',
  'address': '4243 VALLEY DRIVE 2208',
  'city_state_zip': 'COLLEGE PARK MD 20742'},
 'wages_tips_other_compensation': 6232.85,
 'federal_income_tax_withheld': 28.89,
 'social_security_wages': 1500,
 'social_security_tax_withheld': 100,
 'medicare_wages_and_tips': 2000,
 'medicare_tax_withheld': 150,
 'state': {'state_code': 'MD',
  'employer_state_id_number': '52-6002033',
  'state_wages': 6232.85,
  'state_income_tax': 275.94},
 'locality': {'local_wages': 1200, 'local_income_tax': 47.88},
 'bank_name': 'YourBank',
 'bank_address': '16 High Street, Anytown, Anyshire YZ99 1XY',
 'customer_name': 'Mr John S

In [46]:
# Chart template sent to the model by extract_insights (as str(schema)).
# Each entry names a chart, describes what it should show, and lists the
# placeholder parameters the model is asked to replace with real values.
schema = {
    "bar_chart": {
        "description": "A breakdown of taxes like income tax, federal tax, and state tax as floating point numbers.",
        "params": {
            "income_tax": "x",
            "federal_tax": "y",
            "state_tax": "z",
            "medicare_tax": "a",
        },
    },
    "line_graph": {
        "description": "A breakdown of net income vs gross income. List of how net income changed over time versus gross income as a floating point number.",
        "params": {
            "net_income": [],
            "gross_income": [],
        },
    },
}

In [47]:
def extract_insights(json_data):
    """Ask the model to fill the chart ``schema`` with values from ``json_data``.

    Args:
        json_data: The extracted document data. A dict (or other
            JSON-serialisable value) is serialised with ``json.dumps``; a
            string is inserted as-is.

    Returns:
        The content of the model's first reply (expected to be the schema in
        JSON form with the placeholder parameters replaced by values).
    """
    # The data must actually reach the model: substitute it for the
    # {combined_data} placeholder, which a plain string leaves as literal text.
    if isinstance(json_data, str):
        serialized = json_data
    else:
        serialized = json.dumps(json_data, default=str)
    # str.replace rather than format/f-string so braces inside the JSON are safe.
    system_prompt = """You are a helpful finance/tax assistant who recieves json formatted data.
        Based on this data {combined_data} that you recieve and the type of graph, fill in the schema that you recieve with values in place of the parameters and give it back in json format. 
        Make sure that all values that u put in the parameters are accurate and from the json""".replace("{combined_data}", serialized)

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                # The module-level chart template the model has to fill in.
                "content": str(schema)
            }
        ]
    )
    return completion.choices[0].message.content
    
# Run the insight extraction on the accumulated data and print the model's raw reply.
print(extract_insights(combined_json))

```json
{
  "schema": {
    "employee_social_security_number": "XXX-XX-8758",
    "employer_identification_number": "52-6002033",
    "employer_name": "STATE OF MARYLAND CENTRAL PAYROLL BUREAU",
    "employer_address": "P.O. BOX 2396, ANNAPOLIS, MD 21404-2396",
    "employer_state_number": "S.S.# 69-0520001L",
    "control_number": "MD126640",
    "employee_name": "RITESH R THIPPARTHI",
    "employee_address": "4243 VALLEY DRIVE 2208, COLLEGE PARK MD 20742",
    "wages_tips_other_compensation": 6232.85,
    "federal_income_tax_withheld": 28.89,
    "social_security_wages": 1500,
    "social_security_tax_withheld": 100,
    "medicare_wages_and_tips": 2000,
    "medicare_tax_withheld": 150,
    "state_code": "MD",
    "employer_state_id_number": "52-6002033",
    "state_wages": 6232.85,
    "state_income_tax": 275.94,
    "local_wages": 1200,
    "local_income_tax": 47.88,
    "bank_name": "YourBank",
    "bank_address": "16 High Street, Anytown, Anyshire YZ99 1XY",
    "customer_name": 

In [None]:
# NOTE(review): absolute, machine-specific path; consider making this configurable.
upload_folder = '/Users/abhyudaygoyal/Desktop/HACKLYTICS/taxerino/backend/uploads'
all_data = {}
# Match the extension case-insensitively so ".PDF" uploads are not missed, and
# sort because os.listdir returns entries in arbitrary order.
pdf_files = sorted(f for f in os.listdir(upload_folder) if f.lower().endswith('.pdf'))
print(pdf_files)