In [None]:
import pandas as pd
import pdfplumber
from pdfplumber.utils.pdfinternals import resolve_and_decode, resolve
from pprint import pprint
import os



In [None]:
def parse_field_helper(form_data, field, prefix=None):
    """ appends any PDF AcroForm field/value pairs in `field` to provided `form_data` list

        if `field` has child fields, those will be parsed recursively.
    """
    resolved_field = field.resolve()
    field_name = '.'.join(filter(lambda x: x, [prefix, resolve_and_decode(resolved_field.get("T"))]))
    if "Kids" in resolved_field:
        for kid_field in resolved_field["Kids"]:
            parse_field_helper(form_data, kid_field, prefix=field_name)
    if "T" in resolved_field or "TU" in resolved_field:
        # "T" is a field-name, but it's sometimes absent.
        # "TU" is the "alternate field name" and is often more human-readable
        # your PDF may have one, the other, or both.
        alternate_field_name  = resolve_and_decode(resolved_field.get("TU")) if resolved_field.get("TU") else None
        field_value = resolve_and_decode(resolved_field["V"]) if 'V' in resolved_field else None
        form_data.append([field_name, alternate_field_name, field_value])

# Define filtering criteria 
def find_important_tuples(tuple, search_condition):
    # Check if the first element of the tuple is in the search condition list
    return tuple[0] in search_condition


### Select relevant data:

In [26]:
'''These are all of the fields which I think are relevant: Operator, time and place, vehicle details, damage report, 
other party details, description, mode, conditions. NOTE: many fields which are positive are marked with a BLANK " ", 
while negatives marked with "None".'''

all_fields = ['MANufACTuRERS NAME','BuSINESS NAME',
              'DATE Of ACCIDENT','Time of Accident','AM','PM',
              'VEhICLE YEAR','MAkE','MODEL',
              'section 2  accident infoRmation.0','section 2  accident infoRmation.1.0','section 2  accident infoRmation.1.1.0','section 2  accident infoRmation.1.1.1.0','section 2  accident infoRmation.1.1.1.1',
              'Moving', 'Stopped in Traffic', 'Pedestrian', 'Bicyclist', 'undefined', 'Other',
              'NuMBER Of VEhICLES INVOLVED',
              'Unknown','None','minor','Moderate','major',
              'Left Rear 1','Rear Bumper','Right Rear 1','Left Rear 2','Left Rear 3','Right Rear 2','Right Rear 3',
              'Left Rear Passenger 1','Left Rear Passenger 2','Right Rear Passenger 1','Right Rear Passenger 2',
              'Left Rear Passenger 3','Left Rear Passenger 4','Right Rear Passenger 3','Right Rear Passenger 4',
              'Front Driver Side 1','Front Driver Side 2','Front Passenger Side 1','Front Passenger Side 2',
              'Front Driver Side 3','Front Driver Side 4','Front Passenger Side 3','Front Passenger Side 4',
              'Left Front Corner 1','Left Front Corner 2','Right Front Corner 1', 'Right Front Corner 2',
              'Left Front Corner 3','Front Bumper','Right Front Corner 3',
              'Moving_2', 'Stopped in Traffic_2','Pedestrian_2','Bicyclist_2','undefined_2','Other_2',
              'ADDRESS_2.1.0.1','Autonomous Mode','Conventional Mode',
              'WEATHER A 1','WEATHER A 2','WEATHER B 1','WEATHER B 2','WEATHER C 1','WEATHER C 2',
              'WEATHER D 1','WEATHER D 2','WEATHER E 1','WEATHER E 2','WEATHER F 1','WEATHER F 2','WEATHER G 1','WEATHER G 2', 
              'LIGHTING A 1','LIGHTING A 2','LIGHTING B 1','LIGHTING B 2','LIGHTING C 1','LIGHTING C 2',
              'LIGHTING D 1','LIGHTING D 2','LIGHTING E 1','LIGHTING E 2',
              'ROADWAY A 1','ROADWAY A 2','ROADWAY B 1','ROADWAY B 2','ROADWAY C 1', 'ROADWAY C 2','ROADWAY D 1','ROADWAY D 2', 
              'ROAD CONDITIONS A 1','ROAD CONDITIONS A 2','ROAD CONDITIONS B 1','ROAD CONDITIONS B 2', 'ROAD CONDITIONS C 1','ROAD CONDITIONS C 2',
              'ROAD CONDITIONS D 1','ROAD CONDITIONS D 2','ROAD CONDITIONS E 1', 'ROAD CONDITIONS E 2', 'ROAD CONDITIONS F 1', 'ROAD CONDITIONS F 2', 
              'ROAD CONDITIONS G 1', 'ROAD CONDITIONS G 2', 'ROAD CONDITIONS H 1', 'ROAD CONDITIONS H 2',
              'MOVEMENT A 1','MOVEMENT A 2','MOVEMENT  B 1', 'MOVEMENT  B 2','MOVEMENT C 1','MOVEMENT C 2', 'MOVEMENT  D 1','MOVEMENT  D 2', 
              'MOVEMENT  E 1','MOVEMENT  E 2', 'MOVEMENT  F 1', 'MOVEMENT  F 2', 'MOVEMENT  G 1','MOVEMENT  G 2', 'MOVEMENT  H 1','MOVEMENT  H 2', 
              'MOVEMENT  I 1', 'MOVEMENT  I 2', 'MOVEMENT J 1', 'MOVEMENT J 2', 'MOVEMENT  K 1', 'MOVEMENT  K 2', 'MOVEMENT  L 1', 'MOVEMENT  L 2',
              'MOVEMENT  M 1', 'MOVEMENT  M 2','MOVEMENT  N 1', 'MOVEMENT  N 2', 'MOVEMENT  O 1', 'MOVEMENT  O 2',
              'MOVEMENT  P 1', 'MOVEMENT  P 2', 'MOVEMENT  Q 1', 'MOVEMENT  Q 2', 'MOVEMENT  R 1', 'MOVEMENT  R 2',
              'TYPE A 1', 'TYPE A 2', 'TYPE B 1', 'TYPE B 2','TYPE C 1','TYPE C 2','TYPE D 1','TYPE D 2','TYPE E 1','TYPE E 2','TYPE F 1','TYPE F 2','TYPE G 1','TYPE G 2','TYPE H 1','TYPE H 2',
              'OTHER A YES','OTHER A NO','OTHER B','OTHER C','OTHER D','OTHER E','OTHER F','OTHER G',
              'OTHER H YES','OTHER H NO','OTHER I','OTHER J','OTHER K','OTHER L']

conditions_only = ['WEATHER A 1','WEATHER A 2','WEATHER B 1','WEATHER B 2','WEATHER C 1','WEATHER C 2',
              'WEATHER D 1','WEATHER D 2','WEATHER E 1','WEATHER E 2','WEATHER F 1','WEATHER F 2','WEATHER G 1','WEATHER G 2', 
              'LIGHTING A 1','LIGHTING A 2','LIGHTING B 1','LIGHTING B 2','LIGHTING C 1','LIGHTING C 2',
              'LIGHTING D 1','LIGHTING D 2','LIGHTING E 1','LIGHTING E 2',
              'ROADWAY A 1','ROADWAY A 2','ROADWAY B 1','ROADWAY B 2','ROADWAY C 1', 'ROADWAY C 2','ROADWAY D 1','ROADWAY D 2', 
              'ROAD CONDITIONS A 1','ROAD CONDITIONS A 2','ROAD CONDITIONS B 1','ROAD CONDITIONS B 2', 'ROAD CONDITIONS C 1','ROAD CONDITIONS C 2',
              'ROAD CONDITIONS D 1','ROAD CONDITIONS D 2','ROAD CONDITIONS E 1', 'ROAD CONDITIONS E 2', 'ROAD CONDITIONS F 1', 'ROAD CONDITIONS F 2', 
              'ROAD CONDITIONS G 1', 'ROAD CONDITIONS G 2', 'ROAD CONDITIONS H 1', 'ROAD CONDITIONS H 2',
              'MOVEMENT A 1','MOVEMENT A 2','MOVEMENT  B 1', 'MOVEMENT  B 2','MOVEMENT C 1','MOVEMENT C 2', 'MOVEMENT  D 1','MOVEMENT  D 2', 
              'MOVEMENT  E 1','MOVEMENT  E 2', 'MOVEMENT  F 1', 'MOVEMENT  F 2', 'MOVEMENT  G 1','MOVEMENT  G 2', 'MOVEMENT  H 1','MOVEMENT  H 2', 
              'MOVEMENT  I 1', 'MOVEMENT  I 2', 'MOVEMENT J 1', 'MOVEMENT J 2', 'MOVEMENT  K 1', 'MOVEMENT  K 2', 'MOVEMENT  L 1', 'MOVEMENT  L 2',
              'MOVEMENT  M 1', 'MOVEMENT  M 2','MOVEMENT  N 1', 'MOVEMENT  N 2', 'MOVEMENT  O 1', 'MOVEMENT  O 2',
              'MOVEMENT  P 1', 'MOVEMENT  P 2', 'MOVEMENT  Q 1', 'MOVEMENT  Q 2', 'MOVEMENT  R 1', 'MOVEMENT  R 2',
              'TYPE A 1', 'TYPE A 2', 'TYPE B 1', 'TYPE B 2','TYPE C 1','TYPE C 2','TYPE D 1','TYPE D 2','TYPE E 1','TYPE E 2','TYPE F 1','TYPE F 2','TYPE G 1','TYPE G 2','TYPE H 1','TYPE H 2',
              'OTHER A YES''OTHER A NO','OTHER B','OTHER C','OTHER D','OTHER E','OTHER F','OTHER G',
              'OTHER H YES','OTHER H NO','OTHER I','OTHER J','OTHER K','OTHER L']

damage_only = ['Unknown','None','minor','Moderate','major',
              'Left Rear 1','Rear Bumper','Right Rear 1','Left Rear 2','Left Rear 3','Right Rear 2','Right Rear 3',
              'Left Rear Passenger 1','Left Rear Passenger 2','Right Rear Passenger 1','Right Rear Passenger 2',
              'Left Rear Passenger 3','Left Rear Passenger 4','Right Rear Passenger 3','Right Rear Passenger 4',
              'Front Driver Side 1','Front Driver Side 2','Front Passenger Side 1','Front Passenger Side 2',
              'Front Driver Side 3','Front Driver Side 4','Front Passenger Side 3','Front Passenger Side 4',
              'Left Front Corner 1','Left Front Corner 2','Right Front Corner 1', 'Right Front Corner 2',
              'Left Front Corner 3','Front Bumper','Right Front Corner 3',]

geo_only = ['MANufACTuRERS NAME','BuSINESS NAME',
              'DATE Of ACCIDENT','Time of Accident','AM','PM',
              'section 2  accident infoRmation.0','section 2  accident infoRmation.1.0','section 2  accident infoRmation.1.1.0',
              'section 2  accident infoRmation.1.1.1.0','section 2  accident infoRmation.1.1.1.1']




In [54]:
# Row-wise approach
# Files location
collisions_path = "data/collisions"
# Use this to loop over all pdfs and add entries.  Should rename columns, as many are only clearly described by their "alt_text" dropped here.

# List of collisions
collisions = []
# Create an empty dictionary to store data for DataFrame

for filename in os.listdir(collisions_path):
    print(f'{collisions_path}/{filename}')
    # Check if the current item is a file (not a subdirectory)
    if os.path.isfile(os.path.join(collisions_path, filename)):
        data_dict = {}

        # Open each pdf
        pdf = pdfplumber.open(f'{collisions_path}/{filename}')
        # initialize fom data list
        form_data = []
        # identify fields
        fields = resolve(pdf.doc.catalog["AcroForm"])["Fields"]

        # For each field, run the pdf parsing function to extract adta and add it to form_data list
        for field in fields:
            parse_field_helper(form_data, field)
            
        # Filter the long list of tuples [all_fields, geo_only, conditions_only, damage_only]
        filtered_list = [tuple for tuple in form_data if find_important_tuples(tuple, geo_only)]

        # Set df sturcture so each pdf is one row - alt_text is column name, value is vlaue]
        # Populate the dictionary with values from filtered_list
        for tuple in filtered_list:
            column_name = tuple[1]
            row_value = tuple[2]
            data_dict[column_name] = row_value

        # Create DataFrame from the dictionary
        collision_report = pd.DataFrame([data_dict])
        collisions.append(collision_report)

collisions_report = pd.concat(collisions)    
# Reset the index of the DataFrame
collisions_report.reset_index(drop=True, inplace=True)




data/collisions/Apollo-OL316-101623-Redacted.pdf
data/collisions/Apple-OL316-021524-Redacted.pdf
data/collisions/Apple-OL316-101323-Redacted.pdf
data/collisions/Apple-OL316-102323-Redacted-1.pdf
data/collisions/Apple_012624.pdf
data/collisions/Cruise-OL316-100623-Redacted.pdf
data/collisions/Cruise-OL316-102723-Redacted-1.pdf
data/collisions/Ghost-OL316-011124-Redacted.pdf
data/collisions/Mercedes-Benz-OL316-112923-Redacted.pdf
data/collisions/Nuro-OL316-011224-Redacted.pdf
data/collisions/Nuro-OL316-031224-Redacted.pdf
data/collisions/Nuro-OL316-112723-Redacted.pdf
data/collisions/Nuro_013124.pdf
data/collisions/Waymo-OL316-010824-Redacted.pdf
data/collisions/Waymo-OL316-020524-Redacted.pdf
data/collisions/Waymo-OL316-021824-Redacted.pdf
data/collisions/Waymo-OL316-030224-Redacted.pdf
data/collisions/Waymo-OL316-030824-Redacted.pdf
data/collisions/Waymo-OL316-031424-Redacted.pdf
data/collisions/Waymo-OL316-101723-Redacted.pdf
data/collisions/Waymo-OL316-102623-Redacted-1.pdf
data/coll

In [55]:


# Something wrong with the Cruise values metadata in pdf which causes failure. Can add manually. Hopefully this isn't for all?

# Rename columns:
rename_dict = {"Section 1. Manufacturers information. Enter manufacturer's name": 'Operator',
               "enter BUSINESS NAME": 'Business',
               "Section 2. Accident information vehicle one enter DATE Of ACCIDENT": "Date_of_Accident",
               "enter time of accident": "Time_of_accident",
               "Time of accident. Mark if Ay M.":"AM",
               "Mark if P M.":"PM",
               "enter address and location of accident.":"address",
               "enter city of accident":"city",
               "enter county of accident.":"county",
               "enter state of accident.":"state",
               "enter zip code of accident.":"zip"}

collisions_report.rename(columns=rename_dict, inplace=True)

# Fix date times
collisions_report['Date_of_Accident'] = pd.to_datetime(collisions_report['Date_of_Accident'], errors='coerce')

# Convert 'Time_of_accident' column to string type
collisions_report['Time_of_accident'] = collisions_report['Time_of_accident'].astype(str)

# Update time format based on AM and PM columns
for index, row in collisions_report.iterrows():
    if row['AM'] == ' ':
        collisions_report.at[index, 'Time_of_accident'] += ' AM'
    elif row['PM'] == ' ':
        collisions_report.at[index, 'Time_of_accident'] += ' PM'

# Remove whitespace from time column
collisions_report['Time_of_accident'] = collisions_report['Time_of_accident'].str.strip()
collisions_report.drop(columns=['AM', 'PM'], inplace=True)




In [56]:

collisions_report.head(40)


Unnamed: 0,Operator,Business,Date_of_Accident,Time_of_accident,address,city,county,state,zip
0,Apollo Autonomous Driving USA LLC,Apollo Autonomous Driving USA LLC,2023-10-16,12:02 PM,1901 46th Avenue,San Francisco,San Francisco,CA,94116.0
1,Apple Inc.,Apple Inc.,2024-02-15,12:03 PM,US 101N at Ellis St Exit,Sunnyvale,Santa Clara,CA,94089.0
2,Apple Inc.,Apple Inc.,2023-10-13,1:52 PM,I-880N near Brokaw Rd Exit,San Jose,Santa Clara,CA,95131.0
3,Apple Inc.,Apple Inc.,2023-10-23,05:06 PM,Lawrence Expressway and Kifer Road,Santa Clara,Santa Clara,CA,95051.0
4,Apple Inc.,Apple Inc.,2024-01-26,11:06 AM,I-580E and I-238 Junction,Castro Valley,Alameda,CA,94546.0
5,Cruise LLC,Cruise,NaT,nan AM,,,,,
6,Cruise LLC,Cruise,NaT,nan AM,,,,,
7,Ghost Autonomy Inc.,Ghost Autonomy Inc.,2024-01-11,4:48 PM,HWY 17 Northbound near Hamman Park,San Jose,Santa Clara,CA,95128.0
8,MERCEDES-BENZ RESEARCH & DEVELOPMENT NORTH AME...,MERCEDES-BENZ RESEARCH & DEVELOPMENT NORTH AME...,2023-11-29,5:00 PM,"405S, BELLFLOWER EXIT",LONG BEACH,LOS ANGELES,CA,90810.0
9,Nuro Inc,Nuro Inc,2024-01-12,2:30 PM,Cross street: Terra Bella and Shoreline,Mountain View,Santa Clara,CA,94043.0


In [None]:
# TODO: Need to change " "s into True and "None" into False
# TODO: Change the conditions column names (light, weather, etc) into the alt_filed name (tuple[1]) for human readability.


In [None]:
# TODO: @Kyle - create descriptions dataframe. Save and read into main for wordcloud
# columns (descriptions only) 
# TODO: @kyle - create dataframe with: date, address, manufacturer for geo mapping

In [None]:
# @JackP TODO: Collision heatmap and severity

In [None]:
# TODO: Create Address/Geo dataframe, save and read into main for geographic heat map
# TODO: Geo scatter a) color by year, b) color by mfg

In [None]:
# TODO: 
# TODO: Collisions & disengagements by mfg (on main)
