In [1]:
import pandas as pd
import pdfplumber
from pdfplumber.utils.pdfinternals import resolve_and_decode, resolve
from pprint import pprint
import os
import io 
import zipfile
import plotly.express as px


### Select relevant data:

In [2]:
'''These are all of the fields which I think are relevant: Operator, time and place, vehicle details, damage report, 
other party details, description, mode, conditions. NOTE: many fields which are positive are marked with a BLANK " ", 
while negatives marked with "None".'''





# Every PDF form field name to pull from a collision report. The names are
# compared verbatim (`in`) against the extracted field name, so the irregular
# capitalisation and spacing below (e.g. the double space in 'MOVEMENT  B 1')
# must be kept exactly as written.
all_fields = ['MANufACTuRERS NAME','BuSINESS NAME',
              'DATE Of ACCIDENT','Time of Accident','AM','PM',
              'VEhICLE YEAR','MAkE','MODEL',
              'section 2  accident infoRmation.0','section 2  accident infoRmation.1.0','section 2  accident infoRmation.1.1.0','section 2  accident infoRmation.1.1.1.0','section 2  accident infoRmation.1.1.1.1',
              'Moving', 'Stopped in Traffic', 'Pedestrian', 'Bicyclist', 'undefined', 'Other',
              'NuMBER Of VEhICLES INVOLVED',
              'Unknown','None','minor','Moderate','major',
              'Left Rear 1','Rear Bumper','Right Rear 1','Left Rear 2','Left Rear 3','Right Rear 2','Right Rear 3',
              'Left Rear Passenger 1','Left Rear Passenger 2','Right Rear Passenger 1','Right Rear Passenger 2',
              'Left Rear Passenger 3','Left Rear Passenger 4','Right Rear Passenger 3','Right Rear Passenger 4',
              'Front Driver Side 1','Front Driver Side 2','Front Passenger Side 1','Front Passenger Side 2',
              'Front Driver Side 3','Front Driver Side 4','Front Passenger Side 3','Front Passenger Side 4',
              'Left Front Corner 1','Left Front Corner 2','Right Front Corner 1', 'Right Front Corner 2',
              'Left Front Corner 3','Front Bumper','Right Front Corner 3',
              'Moving_2', 'Stopped in Traffic_2','Pedestrian_2','Bicyclist_2','undefined_2','Other_2',
              'ADDRESS_2.1.0.1','Autonomous Mode','Conventional Mode',
              'WEATHER A 1','WEATHER A 2','WEATHER B 1','WEATHER B 2','WEATHER C 1','WEATHER C 2',
              'WEATHER D 1','WEATHER D 2','WEATHER E 1','WEATHER E 2','WEATHER F 1','WEATHER F 2','WEATHER G 1','WEATHER G 2', 
              'LIGHTING A 1','LIGHTING A 2','LIGHTING B 1','LIGHTING B 2','LIGHTING C 1','LIGHTING C 2',
              'LIGHTING D 1','LIGHTING D 2','LIGHTING E 1','LIGHTING E 2',
              'ROADWAY A 1','ROADWAY A 2','ROADWAY B 1','ROADWAY B 2','ROADWAY C 1', 'ROADWAY C 2','ROADWAY D 1','ROADWAY D 2', 
              'ROAD CONDITIONS A 1','ROAD CONDITIONS A 2','ROAD CONDITIONS B 1','ROAD CONDITIONS B 2', 'ROAD CONDITIONS C 1','ROAD CONDITIONS C 2',
              'ROAD CONDITIONS D 1','ROAD CONDITIONS D 2','ROAD CONDITIONS E 1', 'ROAD CONDITIONS E 2', 'ROAD CONDITIONS F 1', 'ROAD CONDITIONS F 2', 
              'ROAD CONDITIONS G 1', 'ROAD CONDITIONS G 2', 'ROAD CONDITIONS H 1', 'ROAD CONDITIONS H 2',
              'MOVEMENT A 1','MOVEMENT A 2','MOVEMENT  B 1', 'MOVEMENT  B 2','MOVEMENT C 1','MOVEMENT C 2', 'MOVEMENT  D 1','MOVEMENT  D 2', 
              'MOVEMENT  E 1','MOVEMENT  E 2', 'MOVEMENT  F 1', 'MOVEMENT  F 2', 'MOVEMENT  G 1','MOVEMENT  G 2', 'MOVEMENT  H 1','MOVEMENT  H 2', 
              'MOVEMENT  I 1', 'MOVEMENT  I 2', 'MOVEMENT J 1', 'MOVEMENT J 2', 'MOVEMENT  K 1', 'MOVEMENT  K 2', 'MOVEMENT  L 1', 'MOVEMENT  L 2',
              'MOVEMENT  M 1', 'MOVEMENT  M 2','MOVEMENT  N 1', 'MOVEMENT  N 2', 'MOVEMENT  O 1', 'MOVEMENT  O 2',
              'MOVEMENT  P 1', 'MOVEMENT  P 2', 'MOVEMENT  Q 1', 'MOVEMENT  Q 2', 'MOVEMENT  R 1', 'MOVEMENT  R 2',
              'TYPE A 1', 'TYPE A 2', 'TYPE B 1', 'TYPE B 2','TYPE C 1','TYPE C 2','TYPE D 1','TYPE D 2','TYPE E 1','TYPE E 2','TYPE F 1','TYPE F 2','TYPE G 1','TYPE G 2','TYPE H 1','TYPE H 2',
              'OTHER A YES','OTHER A NO','OTHER B','OTHER C','OTHER D','OTHER E','OTHER F','OTHER G',
              'OTHER H YES','OTHER H NO','OTHER I','OTHER J','OTHER K','OTHER L']

# PDF field names for the conditions section only (weather, lighting, roadway,
# road conditions, movement, collision type, other factors). Names are matched
# verbatim, so the irregular spacing (e.g. 'MOVEMENT  B 1') is intentional.
# A comma was missing between 'OTHER A YES' and 'OTHER A NO', which made Python
# fuse them into the single bogus name 'OTHER A YESOTHER A NO'.
conditions_only = ['WEATHER A 1','WEATHER A 2','WEATHER B 1','WEATHER B 2','WEATHER C 1','WEATHER C 2',
              'WEATHER D 1','WEATHER D 2','WEATHER E 1','WEATHER E 2','WEATHER F 1','WEATHER F 2','WEATHER G 1','WEATHER G 2',
              'LIGHTING A 1','LIGHTING A 2','LIGHTING B 1','LIGHTING B 2','LIGHTING C 1','LIGHTING C 2',
              'LIGHTING D 1','LIGHTING D 2','LIGHTING E 1','LIGHTING E 2',
              'ROADWAY A 1','ROADWAY A 2','ROADWAY B 1','ROADWAY B 2','ROADWAY C 1', 'ROADWAY C 2','ROADWAY D 1','ROADWAY D 2',
              'ROAD CONDITIONS A 1','ROAD CONDITIONS A 2','ROAD CONDITIONS B 1','ROAD CONDITIONS B 2', 'ROAD CONDITIONS C 1','ROAD CONDITIONS C 2',
              'ROAD CONDITIONS D 1','ROAD CONDITIONS D 2','ROAD CONDITIONS E 1', 'ROAD CONDITIONS E 2', 'ROAD CONDITIONS F 1', 'ROAD CONDITIONS F 2',
              'ROAD CONDITIONS G 1', 'ROAD CONDITIONS G 2', 'ROAD CONDITIONS H 1', 'ROAD CONDITIONS H 2',
              'MOVEMENT A 1','MOVEMENT A 2','MOVEMENT  B 1', 'MOVEMENT  B 2','MOVEMENT C 1','MOVEMENT C 2', 'MOVEMENT  D 1','MOVEMENT  D 2',
              'MOVEMENT  E 1','MOVEMENT  E 2', 'MOVEMENT  F 1', 'MOVEMENT  F 2', 'MOVEMENT  G 1','MOVEMENT  G 2', 'MOVEMENT  H 1','MOVEMENT  H 2',
              'MOVEMENT  I 1', 'MOVEMENT  I 2', 'MOVEMENT J 1', 'MOVEMENT J 2', 'MOVEMENT  K 1', 'MOVEMENT  K 2', 'MOVEMENT  L 1', 'MOVEMENT  L 2',
              'MOVEMENT  M 1', 'MOVEMENT  M 2','MOVEMENT  N 1', 'MOVEMENT  N 2', 'MOVEMENT  O 1', 'MOVEMENT  O 2',
              'MOVEMENT  P 1', 'MOVEMENT  P 2', 'MOVEMENT  Q 1', 'MOVEMENT  Q 2', 'MOVEMENT  R 1', 'MOVEMENT  R 2',
              'TYPE A 1', 'TYPE A 2', 'TYPE B 1', 'TYPE B 2','TYPE C 1','TYPE C 2','TYPE D 1','TYPE D 2','TYPE E 1','TYPE E 2','TYPE F 1','TYPE F 2','TYPE G 1','TYPE G 2','TYPE H 1','TYPE H 2',
              'OTHER A YES','OTHER A NO','OTHER B','OTHER C','OTHER D','OTHER E','OTHER F','OTHER G',
              'OTHER H YES','OTHER H NO','OTHER I','OTHER J','OTHER K','OTHER L']

# PDF field names used for the damage map: driving mode flag, damage severity
# check boxes, then the damaged-area check boxes from rear to front.
damage_only = [
    # driving mode + severity
    "Autonomous Mode",
    "Unknown", "None", "minor", "Moderate", "major",
    # rear of vehicle
    "Left Rear 1", "Rear Bumper", "Right Rear 1",
    "Left Rear 2", "Left Rear 3", "Right Rear 2", "Right Rear 3",
    # rear passenger doors
    "Left Rear Passenger 1", "Left Rear Passenger 2",
    "Right Rear Passenger 1", "Right Rear Passenger 2",
    "Left Rear Passenger 3", "Left Rear Passenger 4",
    "Right Rear Passenger 3", "Right Rear Passenger 4",
    # front doors
    "Front Driver Side 1", "Front Driver Side 2",
    "Front Passenger Side 1", "Front Passenger Side 2",
    "Front Driver Side 3", "Front Driver Side 4",
    "Front Passenger Side 3", "Front Passenger Side 4",
    # front of vehicle
    "Left Front Corner 1", "Left Front Corner 2",
    "Right Front Corner 1", "Right Front Corner 2",
    "Left Front Corner 3", "Front Bumper", "Right Front Corner 3",
]

# Renamed columns kept for the locations table: who, when, where.
geo_only = [
    "Operator", "Business",
    "Date_of_Accident", "Time_of_accident", "AM", "PM",
    "location", "city", "county", "state", "zip",
]

# Renamed columns kept for the free-text descriptions table.
description_cols = ["location", "In_autonomy", "Description"]


# Map each PDF alternate-text label (used as the raw column header) to the
# short column name used in the rest of the notebook.
rename_dict = {
    "Section 1 Manufacturers information. enter manufacturer's NAME": "Operator",
    "enter BUSINESS NAME": "Business",
    "Section 2. Accident information vehicle one enter DATE Of ACCIDENT": "Date_of_Accident",
    "enter time of accident": "Time_of_accident",
    "Time of accident. Mark if Ay M.": "AM",
    "Mark if P M.": "PM",
    "enter address and location of accident.": "location",
    "enter city of accident": "city",
    "enter county of accident.": "county",
    "enter state of accident.": "state",
    "enter zip code of accident.": "zip",
    "describe accident details.": "Description",
    "section 5. accident details description. Mark if Autonomous Mode.": "In_autonomy",
    "Mark if Conventional Mode": "Conventional_mode",
}



In [None]:
# This function reads the field name, alternate field name, and field value
def parse_field_helper(form_data, field, prefix=None):
    """Collect AcroForm field entries from `field` into `form_data`.

    Child fields ("Kids") are visited recursively first, with this field's
    dotted name passed down as their prefix. If the field carries a "T" or
    "TU" key, a `[field_name, alternate_field_name, field_value]` entry is
    appended to `form_data` (modified in place; nothing is returned).
    """
    def _printable(text):
        # Strip non-printable characters and surrounding whitespace
        # (this affects every Cruise file).
        return ''.join(ch for ch in text if ch.isprintable()).strip()

    node = field.resolve()

    # Dotted name: parent prefix (if any) followed by this node's own "T" name.
    name_parts = [part for part in (prefix, resolve_and_decode(node.get("T"))) if part]
    field_name = '.'.join(name_parts)

    if "Kids" in node:
        for child in node["Kids"]:
            parse_field_helper(form_data, child, prefix=field_name)

    # "T" is the field name (sometimes absent); "TU" is the alternate field
    # name and is often more human-readable. A PDF may have either or both.
    if "T" not in node and "TU" not in node:
        return

    alternate_field_name = None
    if node.get("TU"):
        alternate_field_name = resolve_and_decode(node.get("TU"))

    field_value = None
    if 'V' in node:
        field_value = resolve_and_decode(node["V"])

    field_name = _printable(field_name)
    if alternate_field_name:
        alternate_field_name = _printable(alternate_field_name)
    else:
        alternate_field_name = None
    if field_value:
        field_value = _printable(field_value)
    else:
        field_value = None

    form_data.append([field_name, alternate_field_name, field_value])

# Define filtering criteria 
def find_important_tuples(tuple, search_condition):
    """Return True when the entry's first element is one of `search_condition`."""
    leading_item = tuple[0]
    return leading_item in search_condition

# Loop over all PDFs and add entries. `keyword` restricts the search to files whose name contains it.
def extract_from_zip(zip_file_path, list_of_pdf_fields, keyword=None):
    """Extract selected AcroForm fields from every PDF inside a zip archive.

    Args:
        zip_file_path: Path (or file-like object) handed to `zipfile.ZipFile`.
        list_of_pdf_fields: Field names to keep; each is compared against the
            field name (first element) of every parsed form entry.
        keyword: Optional substring; when given, only archive members whose
            name contains it are processed.

    Returns:
        A DataFrame with one row per processed PDF. Columns are the alternate
        field names (second element of each entry) and cells are the field
        values. An empty DataFrame is returned when no PDF matched, instead
        of letting `pd.concat` raise on an empty list.

    Raises:
        KeyError: if a processed PDF has no "AcroForm" entry in its catalog.
    """
    collisions = []
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        for filename in zip_ref.namelist():
            # Only PDF members, optionally restricted by keyword.
            if not filename.endswith('.pdf'):
                continue
            if keyword is not None and keyword not in filename:
                continue

            print(f'Extracting: {filename}...')
            # Pull the member fully into memory so pdfplumber gets a seekable stream.
            with zip_ref.open(filename, 'r') as pdf_file:
                pdf_data = io.BytesIO(pdf_file.read())

            form_data = []
            # The context manager closes the PDF once its fields are parsed;
            # previously every opened document was left open.
            with pdfplumber.open(pdf_data) as pdf:
                fields = resolve(pdf.doc.catalog["AcroForm"])["Fields"]
                for field in fields:
                    parse_field_helper(form_data, field)

            # One row per PDF: alternate name -> value for the requested fields.
            # entry = [field name, alternate name, value]; switching the key to
            # entry[0] is sometimes helpful but may require changing the field lists.
            data_dict = {
                entry[1]: entry[2]
                for entry in form_data
                if find_important_tuples(entry, list_of_pdf_fields)
            }
            collisions.append(pd.DataFrame([data_dict]))
            print('Done.')

    print(f"Extracted data from {len(collisions)} collision reports.")
    if not collisions:
        return pd.DataFrame()
    return pd.concat(collisions, ignore_index=True)

def clean_bools(df):
    """Recode check-box markers in every column of `df` as 1/0.

    Empty strings and "Yes" become 1; None and "Off" become 0. Other values
    are left alone. `df` is updated column by column and the same object is
    returned.
    """
    marker_codes = {"": 1, None: 0, "Yes": 1, "Off": 0}
    for name in df.columns:
        recoded = df[name].replace(marker_codes)
        df[name] = recoded
    return df


In [None]:
# Archive holding every collision-report PDF.
path_to_zip = "data/collisions/Collision_PDFs.zip"
# Parse the archive a single time, keeping every field named in all_fields.
all_collisions_df = extract_from_zip(zip_file_path=path_to_zip, list_of_pdf_fields=all_fields)
all_collisions_df.head()


In [None]:
# Rename the alternate-text headers to the short column names.
all_collisions_df = all_collisions_df.rename(columns=rename_dict)

# Fill missing manufacturer names from the business name.
all_collisions_df['Operator'] = all_collisions_df['Operator'].fillna(all_collisions_df['Business'])

# Normalise the raw form markers ("" / "Yes" -> 1, None / "Off" -> 0).
# extract_from_zip returns strings/None, so without this step the numeric
# comparisons below and in later cells (== 1, == 0, != 0) and the mode-count
# sum would never match anything.
all_collisions_df = clean_bools(all_collisions_df)

all_rows = all_collisions_df.shape[0]

# Drop rows not in autonomy. `.copy()` gives later cells an independent frame
# to modify rather than a slice of all_collisions_df.
auto_collisions_df = all_collisions_df[all_collisions_df['Conventional_mode'] != 1].copy()

# Calculate the number of rows dropped
non_autonomy = all_rows - auto_collisions_df.shape[0]
print(f"Number of rows not in autonomy dropped: {non_autonomy}")


# TODO: Change the conditions column names (light, weather, etc.) to the alternate field name (tuple[1]) for human readability. NOTE: partially solved by using the alternate text instead of the field name; still needs cleaning.



In [None]:
# Count the checked boxes per driving mode.
# NOTE(review): assumes both columns already hold 1/0 flags (see clean_bools) - confirm.
mode_counts = all_collisions_df[['In_autonomy', 'Conventional_mode']].sum()

# Create a bar plot using Plotly Express
fig = px.bar(x=mode_counts.index, y=mode_counts.values, labels={'x': 'Autonomy', 'y': 'Count'},
             title='Collision Records in Autonomy', color=mode_counts.index,
             color_discrete_sequence=px.colors.sequential.Viridis)

# write_image does not create missing folders, so make sure the target exists.
os.makedirs("Images", exist_ok=True)
fig.write_image("Images/In_autonomy.jpg")
fig.show()

In [None]:
# Preview the first ten rows kept after filtering on 'Conventional_mode'.
auto_collisions_df.head(10)

In [None]:
# Fresh locations dataframe. `.copy()` makes it independent of
# auto_collisions_df, so the in-place edits in the following cells do not
# write to a slice (SettingWithCopyWarning) or touch the source frame.
collision_locations = auto_collisions_df[geo_only].copy()
collision_locations.head()

In [None]:
# Rename columns. geo_only already uses the short names, so this normally
# changes no labels; the non-in-place form also returns a new frame, which
# keeps the assignments below from writing to a slice of auto_collisions_df.
collision_locations = collision_locations.rename(columns=rename_dict)

# Parse dates; unparseable values become NaT.
collision_locations['Date_of_Accident'] = pd.to_datetime(collision_locations['Date_of_Accident'], errors='coerce')

# Get the initial number of rows
initial_rows = collision_locations.shape[0]

# Drop rows whose 'Time_of_accident' is missing (NaN/None).
collision_locations = collision_locations.dropna(subset=['Time_of_accident'])
# Drop Zoox 06_08_2022 where no time is entered.
collision_locations = collision_locations[collision_locations['Time_of_accident'] != 0].copy()

# Calculate the number of rows dropped
rows_dropped = initial_rows - collision_locations.shape[0]
print(f"Number of rows dropped: {rows_dropped}")

# Fix Woven Planet empty fields
collision_locations.loc[collision_locations['Operator'] == 0, ["Operator", "Business"]] = "Woven Planet"

# Fix Ghost Autonomy fields
collision_locations.loc[collision_locations['Business'] == 'same', ["Operator", "Business"]] = "Ghost Autonomy"




In [None]:

# Split the free-text time into numeric hour / minute parts. Values that do
# not look like H:MM become NaN here instead of crashing an integer cast.
time_parts = collision_locations['Time_of_accident'].astype(str).str.extract(r'(\d+):(\d+)')
hour = pd.to_numeric(time_parts[0], errors='coerce')
minute = pd.to_numeric(time_parts[1], errors='coerce')

is_pm = collision_locations['PM'] == 1
is_am = collision_locations['AM'] == 1

# 12-hour -> 24-hour: add 12 to PM hours below 12, and map 12 AM to hour 0
# (previously 12:xx AM was left as 12:xx, i.e. noon).
hour = hour.mask(is_pm & (hour < 12), hour + 12)
hour = hour.mask(is_am & ~is_pm & (hour == 12), 0)

# Rejoin hours and minutes into a 24hr time; anything unparseable becomes NaT.
clock_text = hour.astype('Int64').astype(str) + ':' + minute.astype('Int64').astype(str)
clock = pd.to_datetime(clock_text, format='%H:%M', errors='coerce')

unparsed_times = int(clock.isna().sum())
if unparsed_times:
    print(f"Rows with an unparseable time of accident: {unparsed_times}")

# Add the 24hr column and drop the fields it was calculated from.
collision_locations = (
    collision_locations
    .assign(Time_of_accident_24hr=clock.dt.time)
    .drop(columns=['Time_of_accident', 'AM', 'PM'])
)

collision_locations.head(10)





In [None]:
# Export as CSV.
# Full path of the CSV file to write.
csv_file_path = 'data/collisions/dataframes/locations/crash_locations.csv'

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

# Save DataFrame to CSV (without the index column).
collision_locations.to_csv(csv_file_path, index=False)
collision_locations.tail(10)


In [None]:
# Fresh descriptions dataframe. `.copy()` makes it independent of
# auto_collisions_df, so later edits do not write to a slice of it.
descriptions_df = auto_collisions_df[description_cols].copy()
descriptions_df.head()

In [None]:
# Rename any remaining alternate-text headers. Non-in-place, so this never
# writes to a slice of auto_collisions_df.
descriptions_df = descriptions_df.rename(columns=rename_dict)

# to_csv does not create missing folders, so make sure the target exists.
descriptions_csv_path = 'data/collisions/dataframes/descriptions/descriptions.csv'
os.makedirs(os.path.dirname(descriptions_csv_path), exist_ok=True)
descriptions_df.to_csv(descriptions_csv_path)


In [None]:
# @JackP TODO: Collision heatmap and severity

# Re-read the archive, keeping only the damage fields (matched on the field name, entry[0]).
damage_df = extract_from_zip(zip_file_path=path_to_zip, list_of_pdf_fields=damage_only)
damage_df.head()

In [None]:
# TODO: Collisions & disengagements grouped bar by mfg (on main)
# Recode the check-box markers as 1/0.
damage_df = clean_bools(damage_df)

# to_csv does not create missing folders, so make sure the target exists.
damage_csv_path = 'data/collisions/dataframes/damage_map/damage.csv'
os.makedirs(os.path.dirname(damage_csv_path), exist_ok=True)
damage_df.to_csv(damage_csv_path)

# Preview last, so the cell actually displays it.
damage_df.head()
