In [201]:
import os
from openai import OpenAI
from pdf2image import convert_from_path
from dotenv import load_dotenv
import base64
import json
import ssl
import certifi
#from pymongo import MongoClient

In [202]:
# Load environment variables via python-dotenv first, so the API key lookup
# below can see values that only exist in the dotenv file.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))



In [230]:
from anthropic import Anthropic
import os
import base64
from pdf2image import convert_from_path

def pdf_to_images(pdf_path, dpi=200, output_folder="temp_images"):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF to rasterize; passed to ``convert_from_path``.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
        output_folder: Directory that receives the PNG files. It is created
            (including missing parent directories) when absent. An empty
            string means "write into the current working directory".

    Returns:
        A list of file paths named ``page_<n>.png`` (n starts at 1), in page
        order. The names depend only on the page number, so a later call into
        the same folder reuses the same file names.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    # The guard skips makedirs(""), which raises for the empty path.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # convert_from_path returns one image object per PDF page
    pages = convert_from_path(pdf_path, dpi=dpi)
    image_files = []
    for page_number, page in enumerate(pages, start=1):
        image_path = os.path.join(output_folder, f"page_{page_number}.png")
        page.save(image_path, "PNG")
        image_files.append(image_path)
    return image_files

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    The file is opened in binary mode; the base64 output is decoded to a
    ``str`` using UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def extract_text_from_pdf(pdf_path):
    """Send each page of a PDF to Claude as an image and return the replies.

    The PDF is rendered to temporary PNG files with ``pdf_to_images``; each
    page is base64-encoded and sent in its own ``messages.create`` request
    together with the extraction prompt.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        The text of the first content block of every successful reply, joined
        with a blank line (``"\\n\\n"``) in page order. A page whose request
        fails is reported with ``print`` and left out of the result.
        NOTE(review): for a multi-page PDF the result is several replies
        concatenated, which a single ``json.loads`` call presumably cannot
        parse -- confirm how callers handle that.

    Raises:
        Whatever ``pdf_to_images`` or ``encode_image`` raise. The temporary
        images are removed before such an error propagates.
    """
    # Build the client before rendering so a construction failure leaves no
    # temporary images behind.
    anthropic = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
    images_array = pdf_to_images(pdf_path)
    responses = []

    try:
        for image_path in images_array:
            # Deliberately outside the per-page except: an unreadable image is
            # a real error, but the finally below still cleans up.
            base64_image = encode_image(image_path)

            try:
                response = anthropic.messages.create(
                    model="claude-3-5-sonnet-20241022",
                    max_tokens=1000,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": """You are a tax expert. You will be provided with a document image, and your task is to extract all the text from it. 
                                Please don't add any additional information. Also only extract information from documents which are in the form of tax documents/bank statements etc instead of just plain text.
                                Also I want you to process the output in the form of a json schema with as many fields as possible with values. 
                                There is no defined schema you need to extract as much info as you can in a json schema. Only return the information
                                that exists as a NUMERICAL VALUE. If the value is not a number, then don't include it.
                                BE AS PRECISE AS POSSIBLE.
                                Take your time, the decision is yours, extract all the info CORRECTLY from this and give me back a json and not a string"""
                                },
                                {
                                    "type": "image",
                                    "source": {
                                        "type": "base64",
                                        "media_type": "image/png",
                                        "data": base64_image
                                    }
                                }
                            ]
                        }
                    ]
                )

                # Only the first content block of the reply is kept
                responses.append(response.content[0].text)

            except Exception as e:
                # Best effort: report the failed page and continue with the rest
                print(f"Error processing image {image_path}: {str(e)}")
    finally:
        # Always remove the rendered pages, including when the loop above is
        # aborted by an error or an interrupt.
        for image_path in images_array:
            try:
                os.remove(image_path)
            except Exception as e:
                print(f"Error removing temporary file {image_path}: {str(e)}")

    # Join text from all pages
    extracted_text = "\n\n".join(responses)
    return extracted_text

In [231]:
# Accumulator for the parsed documents; later cells add one entry per file.
loda = dict()

In [232]:
# Extract the payslip PDF and store the parsed result under 'file1'.
# NOTE(review): the path is absolute and machine-specific.
pdf_path = "/Users/mohak/Desktop/Hacklytics/taxerino/backend/extraction/payslip-1740283884.pdf"
extracted_text = extract_text_from_pdf(pdf_path)
# json.loads needs one JSON document. extract_text_from_pdf joins one reply per
# page with blank lines, so this presumably only parses for one-page PDFs -- verify.
data2 = json.loads(extracted_text)
loda['file1'] = data2


In [233]:
# Extract the W2 PDF and store the parsed result under 'file2'.
# NOTE(review): the path is absolute and machine-specific; this cell repeats
# the previous one with a different file and key.
pdf_path = "/Users/mohak/Desktop/Hacklytics/taxerino/backend/extraction/edited_W2 Form.pdf"
extracted_text = extract_text_from_pdf(pdf_path)
# Raises if the reply text is not a single valid JSON document.
data = json.loads(extracted_text)
loda['file2'] = data

In [234]:
# Display the documents collected so far (a cell's last expression is shown).
loda

{'file1': {'employer_identification_number': 19202020,
  'wages_and_compensation': 94900,
  'federal_income_tax_withheld': 27450,
  'tax_year': 2025},
 'file2': {'employer_identification_number': 19202021,
  'wages_and_compensation': 94900,
  'federal_income_tax_withheld': 37450,
  'tax_year': 2025}}

In [229]:
import os
import json
from openai import OpenAI

# Module-level OpenAI client used by the functions in the cells below; the key
# is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def anomaly_detection(json_data):
    '''
    Ask the chat model to find mismatches between the extracted documents.

    The data collected from the uploaded forms is sent to the model, which is
    instructed to compare similar fields across documents and report only the
    fields that do not tally. Using the model avoids comparing every field by
    hand.

    Args:
        json_data: The extracted documents (for example a dict with one entry
            per file). It is serialized with ``json.dumps``; values that are
            not JSON-serializable are converted with ``str``.

    Returns:
        The model's reply parsed from JSON into Python objects. The prompt asks
        for one key per anomalous field.

    Raises:
        ValueError: if the model returns no content or content that is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    '''
    # Send real JSON rather than the Python repr, which uses single quotes and
    # True/None spellings the prompt does not describe.
    try:
        payload = json.dumps(json_data, default=str)
    except (TypeError, ValueError):
        # e.g. unsupported key types or circular references: fall back to the repr
        payload = str(json_data)

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        # JSON mode: the reply must be a valid JSON object. Without it the
        # model may copy the prompt example's single quotes and 100_000-style
        # numbers, or wrap the reply in a code fence, and json.loads would fail.
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "system", 
                "content": 
                """
                    You are the smartest, most witty, sharp eyed and sharp minded 
                    accountant, with the brain speed of a super computer and expertise far beyond 
                    any human or other computer. Your job is to analyse the given json data containing
                    financial information from the user, which may either just be a singular file, or 
                    multiple. Your job is to check data across similar fields and check if they match/tally up.

                    AS MERELY AN EXAMPLE, if the user has entered 2 financial docs with their salary, and if the 
                    salaries are not the same, then you should raise this anomaly. ANOTHER MERE EXAMPLE MAY BE 
                    if the user has summed up how much tax they filed in a year but the individual tax fields do 
                    not match with the amount filed, then you should raise this anomaly.

                    When it comes to returning the data, it should be in a json format, with the key being the umbrella
                    field that sees the anomaly, and the value should contain a dictionary of as much information 
                    concerning the anomaly as possible.

                    A final example with the proper format is as follows. keep in mind all these values are just examples, 
                    the output must be in a similar format but json file.:
                    {
                        'salary':
                            {
                            "salary_from_file_1": {
                                "salary_1": 100_000,
                                "tax_1": 10_000,
                                "salary_after_tax_1": 90_000
                            },
                            "salary_from_file_2": {
                                "salary_2": 120_000,
                                "tax_2": 10_000,
                                "salary_after_tax_2": 110_000
                            },
                            "salary_anomaly":{
                                "salary_1": 100_000,
                                "salary_2": 120_000,
                                "salary_anomaly": 20_000
                            },
                            "salary_after_tax_anomaly":{
                                "salary_after_tax_1": 90_000,
                                "salary_after_tax_2": 110_000,
                                "salary_after_tax_anomaly": 20_000
                            }
                        },
                        'other_field':
                            {
                                "field_1": "value_1",
                                "field_2": "value_2",
                                "field_anomaly": "value_anomaly"
                            }
                    }
                #NOTE: ONLY RETURN THE FIELDS THAT ACTUALLY HAVE ANOMALIES, NOT ALL FIELDS
                #ANOTHER NOTE: for quantifable numerical data find the differences but for values like EIN that are not quantifable, just return the string "mismatch detected"
                """
            },
            {
                "role": "user",
                "content": payload
            }
        ]
    )
    output = completion.choices[0].message.content
    if output is None:
        # json.loads(None) would raise an unhelpful TypeError
        raise ValueError("anomaly_detection: the model returned no content")
    return json.loads(output)
# Cross-check the documents collected in `loda`; the returned dict is displayed as the cell output.
anomaly_detection(loda)

{'employer_identification_number': {'ein_from_file_1': 19202020,
  'ein_from_file_2': 192020021,
  'ein_anomaly': 'mismatch detected'},
 'federal_income_tax_withheld': {'tax_withheld_from_file_1': 27450,
  'tax_withheld_from_file_2': 37450,
  'tax_withheld_anomaly': 10000}}

In [8]:
def extract_insights(json_data):
    """Ask the chat model for two lists that can be used for visualization.

    Args:
        json_data: Finance/tax data to analyse; its ``str()`` form is sent as
            the user message.

    Returns:
        The content of the first choice of the model's reply, unparsed.
    """
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": """You are a helpful finance/tax assistant who recieves json formatted data.
         The data will be related to finance/tax and your job is to provide me with the data that I can use to perfrom some nice visualizations
         give me the result in the form of 2 lists on which I can peform some data analysis/viz"""},
            {
                "role": "user",
                # Use the argument, not a notebook global, so the function
                # analyses whatever data the caller passes in.
                "content": str(json_data)
            }
        ]
    )
    return completion.choices[0].message.content
    
# NOTE(review): `combined_json` is not defined in any cell visible here --
# presumably built in another cell; confirm it exists before a fresh Run All.
print(extract_insights(combined_json))

Based on the provided data, I will create two lists: one for the transaction dates and one for the corresponding transaction amounts. These can be used for further data analysis or visualization.

1. **Transaction Dates:**
   - Deposit: '05-15'
   - ATM Withdrawal: '05-18'
   - Check Paid: '05-12'
   - Check Paid: '05-18'
   - Check Paid: '05-24'

2. **Transaction Amounts:**
   - Deposit: 3615.08
   - ATM Withdrawal: 20.00
   - Check Paid: 75.00
   - Check Paid: 30.00
   - Check Paid: 200.00

These lists show transactions across different days in May, with the corresponding financial amounts. You can use these lists to create plots or graphs to visualize cash flow or expenditure over the timeframe of the bank statement.


In [None]:
# Folder that is scanned for uploaded PDFs.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
upload_folder = '/Users/abhyudaygoyal/Desktop/HACKLYTICS/taxerino/backend/uploads'
# Empty here; presumably filled per file by later cells -- verify.
all_data = {}
# Match the extension case-insensitively so "scan.PDF" is not skipped, and sort
# because os.listdir returns entries in arbitrary order.
pdf_files = sorted(f for f in os.listdir(upload_folder) if f.lower().endswith('.pdf'))
print(pdf_files)