In [1]:
import os
from openai import OpenAI
from pdf2image import convert_from_path
from dotenv import load_dotenv
import base64
import json
import ssl
import certifi
from pymongo import MongoClient


In [2]:
# Load settings from a local .env file into the process environment, then
# read the MongoDB connection string (None when MONGO_URI is not set).
load_dotenv()
uri = os.environ.get("MONGO_URI")

In [4]:
# MongoDB connection: TLS enabled, using the CA bundle path reported by certifi.
mongo_client = MongoClient(uri, tls=True, tlsCAFile=certifi.where())

# OpenAI API client; the key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [5]:
def pdf_to_images(pdf_path, dpi=200, output_folder="temp_images"):
    """Render each page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF to convert.
        dpi: Resolution passed through to ``convert_from_path``.
        output_folder: Directory the PNG files are written to. It is created
            (including missing parents) when it does not exist yet.

    Returns:
        List of paths ``<output_folder>/page_<n>.png`` in page order, with
        ``n`` starting at 1.

    Note:
        File names depend only on the page number, so converting a second PDF
        into the same folder overwrites the pages of the first.
    """
    # exist_ok=True avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    image_files = []
    pages = convert_from_path(pdf_path, dpi=dpi)
    for page_number, image in enumerate(pages, start=1):
        image_path = os.path.join(output_folder, f"page_{page_number}.png")
        image.save(image_path, "PNG")
        image_files.append(image_path)
    return image_files

def encode_image(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

def _extract_page_texts(image_paths):
    """Send each page image to the chat-completions API and collect the replies.

    A page whose API call raises is reported with ``print`` and skipped, so one
    failing page does not abort the rest. Replies whose content is ``None`` are
    skipped as well, because they cannot be joined into the final text.
    """
    responses = []
    for image_path in image_paths:
        # Encode the image
        base64_image = encode_image(image_path)

        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "system",
                        "content": """You are a tax expert. You will be provided with a document image, and your task is to extract all the text from it. 
                        Please don't add any additional information. Also only extract information from documents which are in the form of tax documents/bank statments etc instead of just plain text.
                        Also I want you to process the output in the form of a json schema with as many fields as possible with values. 
                        There is no defined schema you need to extract as much info as you can in a json schema."""
                    },
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": "Extract all the info from this and give me back a json and not a string"},
                            {
                                "type": "image_url",
                                "image_url": {
                                    # pdf_to_images writes PNG files, so the
                                    # data URL must declare image/png.
                                    "url": f"data:image/png;base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                # NOTE(review): a reply longer than this limit would presumably be
                # cut off and no longer be valid JSON - confirm 1000 is enough.
                max_tokens=1000
            )

            page_text = response.choices[0].message.content
            if page_text is not None:
                responses.append(page_text)
        except Exception as e:
            print(f"Error processing image {image_path}: {str(e)}")
    return responses


def extract_text_from_pdf(pdf_path):
    """Extract the content of a PDF by sending each page image to the model.

    The PDF is rendered to temporary PNG files with ``pdf_to_images``; each
    page is sent to the chat-completions API and the per-page replies are
    joined with a blank line between them.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        The replies of all successfully processed pages joined by ``"\\n\\n"``
        (an empty string when no page produced a reply).

    Raises:
        Whatever ``pdf_to_images`` or ``encode_image`` raise. API errors are
        printed per page and do not propagate.
    """
    # Convert PDF to images
    images_array = pdf_to_images(pdf_path)

    try:
        responses = _extract_page_texts(images_array)
    finally:
        # Clean up temporary image files, also when processing failed part-way.
        for image_path in images_array:
            try:
                os.remove(image_path)
            except Exception as e:
                print(f"Error removing temporary file {image_path}: {str(e)}")

    # Join text from all pages
    return "\n\n".join(responses)

In [6]:
# Sample document processed by default; set TAX_PDF_PATH to use another file
# without editing this cell.
pdf_path = os.getenv(
    "TAX_PDF_PATH",
    "/Users/abhyudaygoyal/Desktop/HACKLYTICS/taxerino/backend/extraction/W2_New.pdf",
)
extracted_text = extract_text_from_pdf(pdf_path)

# The reply is expected to be JSON wrapped in a Markdown code fence
# (```json ... ```). Strip the fence by structure instead of fixed character
# offsets, so an unfenced reply or surrounding whitespace also parses.
# NOTE: extract_text_from_pdf joins one reply per page, so a multi-page PDF
# yields several fenced blocks and will not parse as a single JSON document.
json_text = extracted_text.strip()
if json_text.startswith("```"):
    json_text = json_text.partition("\n")[2]   # drop the opening ``` / ```json line
    json_text = json_text.rsplit("```", 1)[0]  # drop the closing fence
data = json.loads(json_text)
print(data)

{'employee_social_security_number': 'XXX-XX-8758', 'employer_identification_number': '52-6002033', 'employer_name_address_zip_code': {'name': 'State of Maryland Central Payroll Bureau', 'address': 'P.O. Box 2396', 'city': 'Annapolis', 'state': 'MD', 'zip_code': '21404-2396'}, 'social_security_number_suffix': '69-0520001L', 'control_number': 'MD126640', 'employee_name': 'RITESH R THIPPARTHI', 'employee_address': {'street': '4243 Valley Drive', 'apt_number': '2208', 'city': 'College Park', 'state': 'MD', 'zip_code': '20742'}, 'wages_tips_other_compensation': 6232.85, 'federal_income_tax_withheld': 28.89, 'tax_year': 2024, 'state': {'state_abbreviation': 'MD', 'employer_state_id_number': '52-6002033', 'state_wages_tips_etc': 6232.85, 'state_income_tax': 275.94}, 'form': 'W-2 Wage and Tax Statement', 'status': 'REISSUED STATEMENT'}


## Inserting the extracted document into MongoDB

In [None]:
db = mongo_client['mydatabase']         # Replace with your actual database name
collection = db['mycollection']         # Replace with your actual collection name
# insert_one adds an `_id` key to the dict it is given. Insert a shallow copy so
# `data` stays as parsed and re-running this cell does not fail with a
# duplicate-key error on the `_id` left over from the previous run.
# Note that every run of this cell stores one more copy of the document.
result = collection.insert_one(dict(data))
print("Inserted document with id:", result.inserted_id)

## Reading documents back from MongoDB

In [8]:
# Look up the database and collection again so this cell can run on its own,
# then print every document stored in the collection.
db = mongo_client["mydatabase"]  # Replace with your actual database name
collection = db["mycollection"]  # Replace with your actual collection name
for document in collection.find():
    print(document)

{'_id': ObjectId('67b960ebdada2be8bc02b3d1'), 'employee': {'social_security_number': 'XXX-XX-8758', 'first_name': 'RITESH', 'middle_initial': 'R', 'last_name': 'THIPPARTHI', 'address': {'street': '4243 VALLEY DRIVE', 'unit': '2208', 'city': 'COLLEGE PARK', 'state': 'MD', 'zip_code': '20742'}}, 'employer': {'identification_number': '52-6002033', 'name': 'STATE OF MARYLAND CENTRAL PAYROLL BUREAU', 'address': {'address_line_1': 'P.O. BOX 2396', 'city': 'ANNAPOLIS', 'state': 'MD', 'zip_code': '21404-2396', 's_s': 'S.S.# 69-0520001L'}}, 'details': {'control_number': 'MD126640', 'wages_tips_other_compensation': 6232.85, 'federal_income_tax_withheld': 28.89, 'state': {'state_abbr': 'MD', 'employer_state_id_number': '52-6002033', 'state_wages_tips_etc': 6232.85, 'state_income_tax': 275.94}}, 'form_type': 'W-2', 'tax_year': 2024, 'reissued_statement': True}
