In [54]:
from google.cloud import documentai_v1 as documentai
from google.api_core.client_options import ClientOptions
from google.cloud import storage
import pandas as pd
import time

# === CONFIG ===
project_id = "vercillopersonal"
location = "us"
processor_id = "fe61eee8945a8018"

gcs_input_uri = "gs://vercillo_projects/transactions/amex/2025/2025-06-03.pdf"
gcs_output_uri = "gs://vercillo_projects/transactions/amex/exports/"

client = documentai.DocumentProcessorServiceClient(
    client_options=ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
)
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

# === GCS input/output config ===
input_config = documentai.BatchDocumentsInputConfig(
    gcs_documents=documentai.GcsDocuments(
        documents=[documentai.GcsDocument(gcs_uri=gcs_input_uri, mime_type="application/pdf")]
    )
)

output_config = documentai.DocumentOutputConfig(
    gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_output_uri
    )
)

# === Submit batch process ===
request = documentai.BatchProcessRequest(
    name=name,
    input_documents=input_config,
    document_output_config=output_config
)

operation = client.batch_process_documents(request)

print("Waiting for operation to finish...")
operation.result(timeout=300)

print("Document AI processing complete.")

Waiting for operation to finish...
Document AI processing complete.


In [64]:
import json

# === Locate the first JSON file in output path ===
storage_client = storage.Client()
output_bucket_name = gcs_output_uri.replace("gs://", "").split("/")[0]
output_prefix = "/".join(gcs_output_uri.replace("gs://", "").split("/")[1:])

blobs = list(storage_client.list_blobs(output_bucket_name, prefix=output_prefix))
json_blobs = [b for b in blobs if b.name.endswith(".json")]

if not json_blobs:
    raise ValueError("No JSON output found. Wait a few more seconds or check if the processor ran successfully.")

# Download and load the first JSON file
output_blob = json_blobs[0]
json_str = output_blob.download_as_text()
doc = json.loads(json_str)


In [79]:
full_text = doc.get("text", "")
pages = doc.get("pages", [])
rows = []

def find_y_from_tokens(start_idx):
    for page in pages:
        for token in page.get("tokens", []):
            segs = token["layout"]["textAnchor"].get("textSegments", [])
            if segs:
                token_start = int(segs[0].get("startIndex", -1))
                if token_start == start_idx:
                    return round(token["layout"]["boundingPoly"]["normalizedVertices"][0]["y"], 4), page["pageNumber"]
    return None, None

for entity in doc.get("entities", []):
    type_ = entity.get("type")
    value = entity.get("mentionText")
    confidence = round(entity.get("confidence", 0), 2)

    text_segments = entity.get("textAnchor", {}).get("textSegments", [{}])
    start_index = int(text_segments[0].get("startIndex", -1))
    end_index = int(text_segments[0].get("endIndex", -1))

    # Get Y and page by matching entity start index to token
    y_pos, page = find_y_from_tokens(start_index)

    rows.append({
        "type": type_,
        "value": value,
        "confidence": confidence,
        "page": page,
        "start_index": start_index,
        "end_index": end_index,
        "y_position": y_pos
    })

df = pd.DataFrame(rows)
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   type         208 non-null    object 
 1   value        208 non-null    object 
 2   confidence   208 non-null    float64
 3   page         208 non-null    int64  
 4   start_index  208 non-null    int64  
 5   end_index    208 non-null    int64  
 6   y_position   208 non-null    float64
dtypes: float64(2), int64(3), object(2)
memory usage: 11.5+ KB
None


In [80]:
df.head(10)

Unnamed: 0,type,value,confidence,page,start_index,end_index,y_position
0,Vendor,UBER EATS,0.99,2,2717,2726,0.549
1,Vendor,SECURITY NATIONAL INSUR,0.98,2,2745,2768,0.5667
2,Vendor,PTZ INSURANCE SERVICES,1.0,2,2797,2819,0.586
3,Vendor,HORNER ESSO 0303,0.82,3,3018,3034,0.2398
4,Vendor,AMZN MKTP CA*NI1XI4C60,0.98,3,3053,3075,0.2575
5,Vendor,DC03 A-OK COMMISSARY &,0.98,3,3114,3136,0.2768
6,Vendor,DC03 A-OK COMMISSARY &,0.98,3,3163,3185,0.2953
7,Vendor,DC03 A-OK COMMISSARY &,0.96,3,3217,3239,0.3138
8,Vendor,LITTLE CAESARS #4999-00,0.93,3,3253,3276,0.3319
9,Vendor,LITTLE CAESARS #4999-00,0.95,3,3303,3326,0.3504


In [None]:
target_types = [
    "Vendor", "amount", "location", "payment",
    "posting_date", "transaction_date"
]

# Split: dedup these
df_dedup_target = df[df["type"].isin(target_types)].copy()

# Keep all other types untouched
df_other = df[~df["type"].isin(target_types)].copy()

# Deduplicate only target types
df_dedup_target = df_dedup_target.drop_duplicates(subset=["type", "page", "y_position"])

# Combine both
df_cleaned = pd.concat([df_dedup_target, df_other], ignore_index=True)

df_cleaned

Unnamed: 0,type,value,confidence,page,start_index,end_index,y_position
0,Vendor,UBER EATS,0.99,2,2717,2726,0.5490
1,Vendor,SECURITY NATIONAL INSUR,0.98,2,2745,2768,0.5667
2,Vendor,PTZ INSURANCE SERVICES,1.00,2,2797,2819,0.5860
3,Vendor,HORNER ESSO 0303,0.82,3,3018,3034,0.2398
4,Vendor,AMZN MKTP CA*NI1XI4C60,0.98,3,3053,3075,0.2575
...,...,...,...,...,...,...,...
202,transaction_date,May 18,1.00,4,5137,5143,0.3298
203,closing_date,"Jun 03, 2025",0.84,1,445,457,0.1611
204,opening_date,"May 04, 2025",0.95,1,419,431,0.1611
205,points_earned,2297,0.99,7,12994,12999,0.2705


In [92]:
# Step 1: Isolate transaction_date anchors
start_df = df_cleaned[df_cleaned["type"] == "transaction_date"].sort_values(by=["page", "start_index"]).reset_index(drop=True)
start_df["row_id"] = range(len(start_df))

start_df["range_start"] = start_df["start_index"]

start_df.head(10)

Unnamed: 0,type,value,confidence,page,start_index,end_index,y_position,row_id,range_start
0,transaction_date,May 4,1.0,2,2689,2694,0.549,0,2689
1,transaction_date,May 4,1.0,2,2733,2738,0.5675,1,2733
2,transaction_date,May 5,1.0,2,2785,2790,0.5856,2,2785
3,transaction_date,May 6,1.0,3,3035,3040,0.2398,3,3035
4,transaction_date,May 6,1.0,3,3090,3095,0.2583,4,3090
5,transaction_date,May 7,1.0,3,3108,3113,0.2768,5,3108
6,transaction_date,May 7,1.0,3,3157,3162,0.2953,6,3157
7,transaction_date,May 8,1.0,3,3205,3210,0.3138,7,3205
8,transaction_date,May 8,1.0,3,3285,3290,0.3324,8,3285
9,transaction_date,May 8,1.0,3,3335,3340,0.3513,9,3335


In [93]:
# Step 1: Isolate transaction_date end anchors
end_df = df_cleaned[df_cleaned["type"] == "amount"].sort_values(by=["page", "end_index"]).reset_index(drop=True)
end_df["row_id"] = range(len(end_df))

end_df["range_end"] = end_df["end_index"]

end_df.head(10)

Unnamed: 0,type,value,confidence,page,start_index,end_index,y_position,row_id,range_end
0,amount,13.3,1.0,2,2727,2732,0.5494,0,2732
1,amount,111.64,1.0,2,2778,2784,0.5679,1,2784
2,amount,21.64,1.0,2,2829,2834,0.5869,2,2834
3,amount,49.85,1.0,3,3047,3052,0.2402,3,3052
4,amount,86.82,1.0,3,3102,3107,0.2596,4,3107
5,amount,14.69,1.0,3,3151,3156,0.2772,5,3156
6,amount,5.65,1.0,3,3200,3204,0.2958,6,3204
7,amount,5.65,1.0,3,3248,3252,0.3138,7,3252
8,amount,9.03,1.0,3,3298,3302,0.3328,8,3302
9,amount,1.68,1.0,3,3348,3352,0.3513,9,3352


In [112]:
# Isolate transaction_date anchors
anchor_df = df_cleaned[df_cleaned["type"] == "transaction_date"].sort_values(by=["page", "start_index"]).reset_index(drop=True)
anchor_df["row_id"] = range(len(anchor_df))
anchor_df["range_start"] = anchor_df["start_index"]

# range_end = start_index of next transaction_date (on same page or next pages)
anchor_df["range_end"] = anchor_df["start_index"].shift(-1)
anchor_df["range_end"] = anchor_df["range_end"].fillna(1e9).astype(int)  # large number for last row

anchor_df.head()


Unnamed: 0,type,value,confidence,page,start_index,end_index,y_position,row_id,range_start,range_end
0,transaction_date,May 4,1.0,2,2689,2694,0.549,0,2689,2733
1,transaction_date,May 4,1.0,2,2733,2738,0.5675,1,2733,2785
2,transaction_date,May 5,1.0,2,2785,2790,0.5856,2,2785,3035
3,transaction_date,May 6,1.0,3,3035,3040,0.2398,3,3035,3090
4,transaction_date,May 6,1.0,3,3090,3095,0.2583,4,3090,3108


In [111]:
exclude_from_row_id = ["closing_date", "opening_date", "points_earned", "points_redeemed", "payment"]


def assign_row_id(row):
    if row["type"] in exclude_from_row_id:
        return None  # Skip assignment
    for _, anchor in anchor_df.iterrows():
        if anchor["range_start"] <= row["start_index"] < anchor["range_end"]:
            return anchor["row_id"]
    return None


df_cleaned["row_id"] = df_cleaned.apply(assign_row_id, axis=1)
df_cleaned.head()


Unnamed: 0,type,value,confidence,page,start_index,end_index,y_position,row_id
0,Vendor,UBER EATS,0.99,2,2717,2726,0.549,0.0
1,Vendor,SECURITY NATIONAL INSUR,0.98,2,2745,2768,0.5667,1.0
2,Vendor,PTZ INSURANCE SERVICES,1.0,2,2797,2819,0.586,2.0
3,Vendor,HORNER ESSO 0303,0.82,3,3018,3034,0.2398,2.0
4,Vendor,AMZN MKTP CA*NI1XI4C60,0.98,3,3053,3075,0.2575,3.0


In [108]:
# Local export path (make sure this folder exists)
local_csv = r"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\2025-06-03_entitiesv4.csv"

# Save locally
df_cleaned.to_csv(local_csv, index=False)
print(f"Exported CSV saved to: {local_csv}")

Exported CSV saved to: C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\2025-06-03_entitiesv4.csv
