In [None]:
import os
import time

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
# Load settings from a local .env file (if any) into the process environment so that
# the os.getenv lookup for the API key in the next cell can find it.
load_dotenv()

In [None]:
# The API key is read from the environment rather than being written into the notebook.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Single client instance shared by every API call in the cells below.
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Location of the Kaiser invoice PDF to upload.
# Set KAISER_INVOICE_PDF (in the environment or the .env file) to point at another
# invoice; when it is unset the original local path is used, so existing runs behave
# exactly as before.
file_path = os.getenv(
    "KAISER_INVOICE_PDF",
    "C:\\Users\\dk\\Downloads\\IHA Partners Kaiser Invoice for Activity through 11_01_2023.pdf",
)

In [None]:
# Upload the invoice PDF so it can be attached to the assistant and to thread messages.
# The context manager closes the local file handle as soon as the upload call returns
# (or raises), instead of leaving it open for the lifetime of the kernel.
with open(file_path, "rb") as pdf_file:
    file_object = client.files.create(
        file=pdf_file,
        purpose="assistants",
    )

In [None]:
# Show the object returned by the upload call (its `id` is referenced in later cells).
file_object

In [None]:
# Create the extraction assistant: it is given the retrieval tool, the uploaded invoice
# file, and a prompt telling it to answer only with JSON rows for active employees.
assistant = client.beta.assistants.create(
    name="Kaiser Bill Extraction Assistant",
    model="gpt-4-1106-preview",
    tools=[{"type": "retrieval"}],
    file_ids=[file_object.id],
    # The prompt text is sent verbatim to the model, including its leading indentation.
    instructions="""
    You are a helpful assistant that is an expert at extracting structured data from Kaiser PDFs. 
    Specifically, you are a steeped in expertise as it comes to extracting information from a PDF and converting them into JSON. 
    You only respond in JSON.
    You will extract the employee_name (Titled "Name"), plan_type (this will be Medical) plan_name (titled "Medical Plan"), coverage_tier (titled "Coverage") and total_rate (titled "Total Due") for all active employees.
    We only want active employees rows of coverage returned. Employee Status can be found in the "Status" column.
    please use the instructions provided above to extract the bill data for each employee in the attached bill. There may be multiple tables of active employees if there are multiple plan types or multiple employee types. please make sure you check for other tables of active employees!
    """,
)

In [None]:
# Show the assistant object returned by the create call (its `id` is used to start runs).
assistant

In [None]:
# Open a conversation thread seeded with one user message that asks for the extraction
# and references the uploaded invoice file.
thread = client.beta.threads.create(
    messages=[
        {
            "role": "user",
            "content": "extract the following pdf for active employees",
            "file_ids": [file_object.id],
        },
    ],
)

In [None]:
# Start a run of the assistant against the thread created above.
run = client.beta.threads.runs.create(
    assistant_id=assistant.id,
    thread_id=thread.id,
)

In [None]:
# Show the run object returned by the create call.
run

In [None]:
# A run executes asynchronously, so a single retrieve right after creation normally
# returns it while it is still "queued" or "in_progress", and the thread then has no
# assistant reply for the later cells to parse. Re-fetch the run until it leaves those
# transient states.
while True:
    run = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id,
    )
    # Every other status (completed, failed, cancelled, expired, requires_action)
    # will not advance on its own, so waiting longer would not help.
    if run.status not in ("queued", "in_progress", "cancelling"):
        break
    time.sleep(1)  # brief pause between polls to avoid hammering the API
print(run)

In [None]:
# Alternative one-call form: create a thread from the given message and start a run of
# the assistant on it.
# NOTE(review): this overwrites `run` with a run that belongs to a different thread
# than `thread`, and the message here carries no "file_ids" (unlike the message in
# `thread`). The cells below list messages from `thread.id`, so this run's output is
# never read — confirm whether this cell is still needed.
run = client.beta.threads.create_and_run(
  assistant_id=assistant.id,
  thread={
    "messages": [
      {"role": "user", "content": "extract the following pdf for active employees"}
    ]
  }
)
print(run)

In [None]:
# Fetch the messages currently stored on the original thread and print the raw listing.
messages = client.beta.threads.messages.list(
    thread_id=thread.id,
)
print(messages)

In [None]:
import json
def show_json(obj):
    """Display an API object as parsed JSON in the notebook output.

    `obj` must provide `model_dump_json()`; its JSON text is parsed back into
    plain Python containers and handed to the notebook's `display`.
    """
    parsed = json.loads(obj.model_dump_json())
    display(parsed)

In [None]:
# Render the message listing as JSON for easier inspection than the printed repr.
show_json(messages)


In [None]:
# JSON-encode the text of the first content part of the first message in the listing.
# NOTE(review): this presumes messages.data[0] is the assistant's reply (i.e. the
# listing is newest-first and the run had already finished) — confirm.
dump = json.dumps(messages.data[0].content[0].text.value)

In [None]:
# `dump` is the reply text encoded as a JSON string literal. Decode it with json.loads
# rather than un-escaping by hand: the manual replaces only covered \n and \", so tabs,
# carriage returns, backslashes, already-escaped quotes, or a reply that begins/ends
# with a quote were left mangled and made the later json.loads(cleaned_data) fail.
cleaned_data = json.loads(dump)
# Remove the Markdown code fence the model wraps around its JSON answer, then trim
# the surrounding whitespace.
cleaned_data = cleaned_data.replace('```json\n', '').replace('```', '').strip()


In [None]:
# Parse the cleaned reply text into Python objects; raises json.JSONDecodeError if the
# reply is not valid JSON.
json_data = json.loads(cleaned_data)


In [None]:
# Show the parsed extraction result.
json_data