In [3]:
import pandas as pd
import pdfplumber
from pdfplumber.utils.pdfinternals import resolve_and_decode, resolve
from pprint import pprint
import os
import io 
import zipfile
import plotly.express as px


### Select relevant data:

In [4]:
'''These are all of the fields which I think are relevant: Operator, time and place, vehicle details, damage report, 
other party details, description, mode, conditions. NOTE: many fields which are positive are marked with a BLANK " ", 
while negatives marked with "None".'''





all_fields = ['MANufACTuRERS NAME','BuSINESS NAME',
              'DATE Of ACCIDENT','Time of Accident','AM','PM',
              'VEhICLE YEAR','MAkE','MODEL',
              'section 2  accident infoRmation.0','section 2  accident infoRmation.1.0','section 2  accident infoRmation.1.1.0','section 2  accident infoRmation.1.1.1.0','section 2  accident infoRmation.1.1.1.1',
              'Moving', 'Stopped in Traffic', 'Pedestrian', 'Bicyclist', 'undefined', 'Other',
              'NuMBER Of VEhICLES INVOLVED',
              'Unknown','None','minor','Moderate','major',
              'Left Rear 1','Rear Bumper','Right Rear 1','Left Rear 2','Left Rear 3','Right Rear 2','Right Rear 3',
              'Left Rear Passenger 1','Left Rear Passenger 2','Right Rear Passenger 1','Right Rear Passenger 2',
              'Left Rear Passenger 3','Left Rear Passenger 4','Right Rear Passenger 3','Right Rear Passenger 4',
              'Front Driver Side 1','Front Driver Side 2','Front Passenger Side 1','Front Passenger Side 2',
              'Front Driver Side 3','Front Driver Side 4','Front Passenger Side 3','Front Passenger Side 4',
              'Left Front Corner 1','Left Front Corner 2','Right Front Corner 1', 'Right Front Corner 2',
              'Left Front Corner 3','Front Bumper','Right Front Corner 3',
              'Moving_2', 'Stopped in Traffic_2','Pedestrian_2','Bicyclist_2','undefined_2','Other_2',
              'ADDRESS_2.1.0.1','Autonomous Mode','Conventional Mode',
              'WEATHER A 1','WEATHER A 2','WEATHER B 1','WEATHER B 2','WEATHER C 1','WEATHER C 2',
              'WEATHER D 1','WEATHER D 2','WEATHER E 1','WEATHER E 2','WEATHER F 1','WEATHER F 2','WEATHER G 1','WEATHER G 2', 
              'LIGHTING A 1','LIGHTING A 2','LIGHTING B 1','LIGHTING B 2','LIGHTING C 1','LIGHTING C 2',
              'LIGHTING D 1','LIGHTING D 2','LIGHTING E 1','LIGHTING E 2',
              'ROADWAY A 1','ROADWAY A 2','ROADWAY B 1','ROADWAY B 2','ROADWAY C 1', 'ROADWAY C 2','ROADWAY D 1','ROADWAY D 2', 
              'ROAD CONDITIONS A 1','ROAD CONDITIONS A 2','ROAD CONDITIONS B 1','ROAD CONDITIONS B 2', 'ROAD CONDITIONS C 1','ROAD CONDITIONS C 2',
              'ROAD CONDITIONS D 1','ROAD CONDITIONS D 2','ROAD CONDITIONS E 1', 'ROAD CONDITIONS E 2', 'ROAD CONDITIONS F 1', 'ROAD CONDITIONS F 2', 
              'ROAD CONDITIONS G 1', 'ROAD CONDITIONS G 2', 'ROAD CONDITIONS H 1', 'ROAD CONDITIONS H 2',
              'MOVEMENT A 1','MOVEMENT A 2','MOVEMENT  B 1', 'MOVEMENT  B 2','MOVEMENT C 1','MOVEMENT C 2', 'MOVEMENT  D 1','MOVEMENT  D 2', 
              'MOVEMENT  E 1','MOVEMENT  E 2', 'MOVEMENT  F 1', 'MOVEMENT  F 2', 'MOVEMENT  G 1','MOVEMENT  G 2', 'MOVEMENT  H 1','MOVEMENT  H 2', 
              'MOVEMENT  I 1', 'MOVEMENT  I 2', 'MOVEMENT J 1', 'MOVEMENT J 2', 'MOVEMENT  K 1', 'MOVEMENT  K 2', 'MOVEMENT  L 1', 'MOVEMENT  L 2',
              'MOVEMENT  M 1', 'MOVEMENT  M 2','MOVEMENT  N 1', 'MOVEMENT  N 2', 'MOVEMENT  O 1', 'MOVEMENT  O 2',
              'MOVEMENT  P 1', 'MOVEMENT  P 2', 'MOVEMENT  Q 1', 'MOVEMENT  Q 2', 'MOVEMENT  R 1', 'MOVEMENT  R 2',
              'TYPE A 1', 'TYPE A 2', 'TYPE B 1', 'TYPE B 2','TYPE C 1','TYPE C 2','TYPE D 1','TYPE D 2','TYPE E 1','TYPE E 2','TYPE F 1','TYPE F 2','TYPE G 1','TYPE G 2','TYPE H 1','TYPE H 2',
              'OTHER A YES','OTHER A NO','OTHER B','OTHER C','OTHER D','OTHER E','OTHER F','OTHER G',
              'OTHER H YES','OTHER H NO','OTHER I','OTHER J','OTHER K','OTHER L']

conditions_only = ['WEATHER A 1','WEATHER A 2','WEATHER B 1','WEATHER B 2','WEATHER C 1','WEATHER C 2',
              'WEATHER D 1','WEATHER D 2','WEATHER E 1','WEATHER E 2','WEATHER F 1','WEATHER F 2','WEATHER G 1','WEATHER G 2', 
              'LIGHTING A 1','LIGHTING A 2','LIGHTING B 1','LIGHTING B 2','LIGHTING C 1','LIGHTING C 2',
              'LIGHTING D 1','LIGHTING D 2','LIGHTING E 1','LIGHTING E 2',
              'ROADWAY A 1','ROADWAY A 2','ROADWAY B 1','ROADWAY B 2','ROADWAY C 1', 'ROADWAY C 2','ROADWAY D 1','ROADWAY D 2', 
              'ROAD CONDITIONS A 1','ROAD CONDITIONS A 2','ROAD CONDITIONS B 1','ROAD CONDITIONS B 2', 'ROAD CONDITIONS C 1','ROAD CONDITIONS C 2',
              'ROAD CONDITIONS D 1','ROAD CONDITIONS D 2','ROAD CONDITIONS E 1', 'ROAD CONDITIONS E 2', 'ROAD CONDITIONS F 1', 'ROAD CONDITIONS F 2', 
              'ROAD CONDITIONS G 1', 'ROAD CONDITIONS G 2', 'ROAD CONDITIONS H 1', 'ROAD CONDITIONS H 2',
              'MOVEMENT A 1','MOVEMENT A 2','MOVEMENT  B 1', 'MOVEMENT  B 2','MOVEMENT C 1','MOVEMENT C 2', 'MOVEMENT  D 1','MOVEMENT  D 2', 
              'MOVEMENT  E 1','MOVEMENT  E 2', 'MOVEMENT  F 1', 'MOVEMENT  F 2', 'MOVEMENT  G 1','MOVEMENT  G 2', 'MOVEMENT  H 1','MOVEMENT  H 2', 
              'MOVEMENT  I 1', 'MOVEMENT  I 2', 'MOVEMENT J 1', 'MOVEMENT J 2', 'MOVEMENT  K 1', 'MOVEMENT  K 2', 'MOVEMENT  L 1', 'MOVEMENT  L 2',
              'MOVEMENT  M 1', 'MOVEMENT  M 2','MOVEMENT  N 1', 'MOVEMENT  N 2', 'MOVEMENT  O 1', 'MOVEMENT  O 2',
              'MOVEMENT  P 1', 'MOVEMENT  P 2', 'MOVEMENT  Q 1', 'MOVEMENT  Q 2', 'MOVEMENT  R 1', 'MOVEMENT  R 2',
              'TYPE A 1', 'TYPE A 2', 'TYPE B 1', 'TYPE B 2','TYPE C 1','TYPE C 2','TYPE D 1','TYPE D 2','TYPE E 1','TYPE E 2','TYPE F 1','TYPE F 2','TYPE G 1','TYPE G 2','TYPE H 1','TYPE H 2',
              'OTHER A YES''OTHER A NO','OTHER B','OTHER C','OTHER D','OTHER E','OTHER F','OTHER G',
              'OTHER H YES','OTHER H NO','OTHER I','OTHER J','OTHER K','OTHER L']

damage_only = ['Unknown','None','minor','Moderate','major',
              'Left Rear 1','Rear Bumper','Right Rear 1','Left Rear 2','Left Rear 3','Right Rear 2','Right Rear 3',
              'Left Rear Passenger 1','Left Rear Passenger 2','Right Rear Passenger 1','Right Rear Passenger 2',
              'Left Rear Passenger 3','Left Rear Passenger 4','Right Rear Passenger 3','Right Rear Passenger 4',
              'Front Driver Side 1','Front Driver Side 2','Front Passenger Side 1','Front Passenger Side 2',
              'Front Driver Side 3','Front Driver Side 4','Front Passenger Side 3','Front Passenger Side 4',
              'Left Front Corner 1','Left Front Corner 2','Right Front Corner 1', 'Right Front Corner 2',
              'Left Front Corner 3','Front Bumper','Right Front Corner 3',]

geo_only = ['Operator','Business',
              'Date_of_Accident','Time_of_accident','AM','PM',
              'location', 'city', 'county',
              'state','zip']

description_cols = ["location", "In_autonomy", 'Description']


# Rename columns:
rename_dict = {"Section 1 Manufacturers information. enter manufacturer's NAME": 'Operator',
               "enter BUSINESS NAME": 'Business',
               "Section 2. Accident information vehicle one enter DATE Of ACCIDENT": "Date_of_Accident",
               "enter time of accident": "Time_of_accident",
               "Time of accident. Mark if Ay M.":"AM",
               "Mark if P M.":"PM",
               "enter address and location of accident.":"location",
               "enter city of accident":"city",
               "enter county of accident.":"county",
               "enter state of accident.":"state",
               "enter zip code of accident.":"zip",
               "describe accident details.":"Description",
               "section 5. accident details description. Mark if Autonomous Mode.":"In_autonomy",
               "Mark if Conventional Mode":"Conventional_mode"}



In [5]:
# This function read the filed name, alternate field name, and field values
def parse_field_helper(form_data, field, prefix=None):
    """ appends any PDF AcroForm field/value pairs in `field` to provided `form_data` list

        if `field` has child fields, those will be parsed recursively.
    """
    resolved_field = field.resolve()
    field_name = '.'.join(filter(lambda x: x, [prefix, resolve_and_decode(resolved_field.get("T"))]))
    if "Kids" in resolved_field:
        for kid_field in resolved_field["Kids"]:
            parse_field_helper(form_data, kid_field, prefix=field_name)
    if "T" in resolved_field or "TU" in resolved_field:
        # "T" is a field-name, but it's sometimes absent.
        # "TU" is the "alternate field name" and is often more human-readable
        # your PDF may have one, the other, or both.
        alternate_field_name = resolve_and_decode(resolved_field.get("TU")) if resolved_field.get("TU") else None
        field_value = resolve_and_decode(resolved_field["V"]) if 'V' in resolved_field else None

        # Remove non-printable characters and trailing spaces - This affects every Cruise file.
        field_name = ''.join(char for char in field_name if char.isprintable()).strip()
        alternate_field_name = ''.join(char for char in alternate_field_name if char.isprintable()).strip() if alternate_field_name else None
        field_value = ''.join(char for char in field_value if char.isprintable()).strip() if field_value else None
        
        form_data.append([field_name, alternate_field_name, field_value])

# Define filtering criteria 
def find_important_tuples(tuple, search_condition):
    # Check if the first element of the tuple is in the search condition list
    return tuple[0] in search_condition

# Loop over all pdfs and add entries.  Keyword can search foles for specific name
def extract_from_zip(zip_file_path, list_of_pdf_fields, keyword=None):
    collisions = []
    counter = 0
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        for filename in zip_ref.namelist():
            # Check if the current item is a pdf file
            if filename.endswith('.pdf') and (keyword is None or keyword in filename):
                print(f'Extracting: {filename}...')
                # Read PDF from zip file
                with zip_ref.open(filename, 'r') as pdf_file:
                    pdf_data = io.BytesIO(pdf_file.read())

                # Open each pdf
                pdf = pdfplumber.open(pdf_data)
                form_data = []
                # identify fields
                fields = resolve(pdf.doc.catalog["AcroForm"])["Fields"]
                # For each field, run the pdf parsing function to extract adta and add it to form_data list
                for field in fields:
                    parse_field_helper(form_data, field)

                # Filter the long list of tuples [all_fields, geo_only, conditions_only, damage_only]
                filtered_list = [tuple for tuple in form_data if find_important_tuples(tuple, list_of_pdf_fields)]
                
                data_dict = {}
                # Set df sturcture so each pdf is one row - alt_text is column name, value is vlaue]
                # Populate the dictionary with values from filtered_list
                for tuple in filtered_list:
                    column_name = tuple[1]
                    row_value = tuple[2]
                    data_dict[column_name] = row_value
                
                # Create DataFrame from the dictionary
                collision_report = pd.DataFrame([data_dict])
                collisions.append(collision_report)
                print('Done.')
                counter += 1
    
    print(f"Extracted data from {counter} collision reports.")
    df = pd.concat(collisions)
    df.reset_index(drop=True, inplace=True)
    return df


In [6]:
path_to_zip = "data/collisions/Collision_PDFs.zip"
# read in all the data once.
all_collisions_df = extract_from_zip(path_to_zip, all_fields)
all_collisions_df.head()


Extracting: Aimotive_091619.pdf...
Done.
Extracting: Apollo-OL316-062923-Redacted.pdf...
Done.
Extracting: Apollo-OL316-090623-Redacted.pdf...
Done.
Extracting: Apollo-OL316-101623-Redacted.pdf...
Done.
Extracting: Apple_012624.pdf...
Done.
Extracting: Apple_022123.pdf...
Done.
Extracting: apple_081921.pdf...
Done.
Extracting: apple_082321.pdf...
Done.
Extracting: Apple_091919.pdf...
Done.
Extracting: Apple_101022.pdf...
Done.
Extracting: Apple_120621.pdf...
Done.
Extracting: Apple_122022.pdf...
Done.
Extracting: Apple_OL316_061422_Redacted.pdf...
Done.
Extracting: apple_OL316_092721_Redacted.pdf...
Done.
Extracting: Apple-OL316_021722_Redacted.pdf...
Done.
Extracting: Apple-OL316_022222_Redacted.pdf...
Done.
Extracting: Apple-OL316_031522_Redacted.pdf...
Done.
Extracting: Apple-OL316_102022_Redacted.pdf...
Done.
Extracting: Apple-OL316_111121_Redacted.pdf...
Done.
Extracting: Apple-OL316-021524-Redacted.pdf...
Done.
Extracting: Apple-OL316-051123-Redacted.pdf...
Done.
Extracting: Appl

Unnamed: 0,Section 1 Manufacturers information. enter manufacturer's NAME,enter BUSINESS NAME,Section 2. Accident information vehicle one enter DATE Of ACCIDENT,enter time of accident,Time of accident. Mark if Ay M.,Mark if P M.,Enter vehicle year.,enter make.,enter MODEL,enter address and location of accident.,...,"ROADWAY SURFACE. VEHICLE 2, MARK IF DRY","ROADWAY CONDITIONS, MARK ONE TO TWO ITEMS. VEHICLE 1, MARK IF HOLES DEEP RUT ASTERISK","ROADWAY CONDITIONS, MARK ONE TO TWO ITEMS. VEHICLE 2, MARK IF HOLES DEEP RUT ASTERISK","MOVEMENT PRECEDING COLLISION. VEHICLE 1, MARK IF STOPPED","MOVEMENT PRECEDING COLLISION. VEHICLE 2, MARK IF STOPPED","TYPE OF COLLISION. VEHICLE 1, MARK IF HEAD ON","TYPE OF COLLISION. VEHICLE 2, MARK IF HEAD ON","OTHER ASSOCIATED FACTOR OR FACTORS. MARK ALL APPLICABLE. C V C SECTIONS VIOLATED. CITED, MARK IF YES","DEFECTIVE, W E H EQUIPMENT. CITED, MARK IF YES","MARK IF OTHER, ASTERISK"
0,"Aimotive, Inc","Aimotive, Inc.",09/16/2019,10:00,,,2010,Toyota,Prius,"SB, 101 US Freeway, Old Middlefield Rd x Shorline",...,,,,,,,,,,
1,,Apollo Autonomous Driving USA LLC,06/29/2023,3:39,,,2020,Chrysler,Pacifica,Moraga St near 34th Ave,...,Yes,,,,,,Yes,,,
2,,Apollo Autonomous Driving USA LLC,9/6/2023,12:25,,,2017,Lincoln,MKZ Hybrid,2037 Rivera Street,...,Yes,,,,Yes,,,,,
3,,Apollo Autonomous Driving USA LLC,10/16/2023,12:02,,,2016,Lincoln,MKZ Hybrid,1901 46th Avenue,...,,,,,,,,,,
4,,Apple Inc.,01/26/2024,11:06,,,2017,Lexus,RX 450h,I-580E and I-238 Junction,...,,,,,,,,,,


In [7]:
# Rename columns
all_collisions_df.rename(columns=rename_dict, inplace=True)

# Fill Mfg name
all_collisions_df['Operator'] = all_collisions_df['Operator'].fillna(all_collisions_df['Business'])

# TODO: Need to change " "s into True and "None" into False 
# Iterate over each column 
for column in all_collisions_df.columns:
    # Replace empty strings and None with specific values
    all_collisions_df[column] = all_collisions_df[column].replace({"": 1, None: 0, "Yes": 1, "Off": 0})



all_rows = all_collisions_df.shape[0]

# Drop rows not in autonomy
auto_collisions_df = all_collisions_df[all_collisions_df['Conventional_mode'] != 1]

# Calculate the number of rows dropped
non_autonomy = all_rows - auto_collisions_df.shape[0]
print(f"Number of rows not in autonomy dropped: {non_autonomy}")



# TODO: Change the conditions column names (light, weather, etc) into the alt_filed name (tuple[1]) for human readability. - NOTE: partially solved by using Alt_text instead of text. Still should be cleaned



Number of rows not in autonomy dropped: 261


  all_collisions_df[column] = all_collisions_df[column].replace({"": 1, None: 0, "Yes": 1, "Off": 0})


In [136]:
# Count the occurrences of True values in each column
mode_counts = all_collisions_df[['In_autonomy', 'Conventional_mode']].sum()

# Create a bar plot using Plotly Express
fig = px.bar(x=mode_counts.index, y=mode_counts.values, labels={'x': 'Autonomy', 'y': 'Count'}, 
             title='Collision Records in Autonomy', color=mode_counts.index,
             color_discrete_sequence=px.colors.sequential.Viridis)
fig.write_image("Images/In_autonomy.jpg")
fig.show()

In [None]:
# Category chart
# Plot data using parallel_categories
px.parallel_categories(
    all_collisions_df,
    dimensions=["type", "region", "prop_size"],
    color="sold",
    color_continuous_scale=px.colors.sequential.Inferno,
    labels={
        "type": "Type of Dwelling",
        "region": "Region",
        "prop_size": "Property Size",
    },
)

In [137]:
auto_collisions_df.head(10)

Unnamed: 0,Operator,Business,Date_of_Accident,Time_of_accident,AM,PM,Enter vehicle year.,enter make.,enter MODEL,location,...,"ROADWAY SURFACE. VEHICLE 2, MARK IF DRY","ROADWAY CONDITIONS, MARK ONE TO TWO ITEMS. VEHICLE 1, MARK IF HOLES DEEP RUT ASTERISK","ROADWAY CONDITIONS, MARK ONE TO TWO ITEMS. VEHICLE 2, MARK IF HOLES DEEP RUT ASTERISK","MOVEMENT PRECEDING COLLISION. VEHICLE 1, MARK IF STOPPED","MOVEMENT PRECEDING COLLISION. VEHICLE 2, MARK IF STOPPED","TYPE OF COLLISION. VEHICLE 1, MARK IF HEAD ON","TYPE OF COLLISION. VEHICLE 2, MARK IF HEAD ON","OTHER ASSOCIATED FACTOR OR FACTORS. MARK ALL APPLICABLE. C V C SECTIONS VIOLATED. CITED, MARK IF YES","DEFECTIVE, W E H EQUIPMENT. CITED, MARK IF YES","MARK IF OTHER, ASTERISK"
1,Apollo Autonomous Driving USA LLC,Apollo Autonomous Driving USA LLC,06/29/2023,3:39,0,1,2020,Chrysler,Pacifica,Moraga St near 34th Ave,...,1,0,0,0,0,0,1,0,0,0
4,Apple Inc.,Apple Inc.,01/26/2024,11:06,1,0,2017,Lexus,RX 450h,I-580E and I-238 Junction,...,0,0,0,0,0,0,0,0,0,0
13,Apple Inc.,Apple Inc.,09/27/2021,10:53,1,0,2017,Lexus,RX 450h,N Mathilda Ave and Del Rey Ave,...,0,0,0,0,0,0,0,0,0,0
21,Apple Inc.,Apple Inc.,09/26/2023,11:33,1,0,2017,Lexus,RX 450h,I-80 Westbound,...,1,0,0,1,0,0,1,0,0,0
22,Apple Inc.,Apple Inc.,09/29/2023,2:58,0,1,2017,Lexus,RX 450h,US 101S near Capitol Expressway,...,0,0,0,0,0,0,0,0,0,0
25,Argo AI,Argo AI,07/08/2021,09:18,1,0,2019,Ford,Fusion,Stanford Ave/Bowdoin St,...,0,0,0,1,1,0,0,0,0,0
27,Argo AI,Argo AI,9/22/2022,5:24,0,1,2020,Ford,Escape,SE Park Avenue,...,0,0,0,0,0,0,0,0,0,0
28,Argo AI,Argo AI,10/19/2021,2:25,0,1,2020,Ford,Escape,Grant Ave/El Camino Real,...,1,0,0,1,0,0,0,0,0,0
31,"Beep, Inc.","Beep, Inc.",8/23/2023,5:54,0,1,2018,Navya,ARMA,"9th St at Seven Seas Ave, Treasure Island",...,1,0,0,0,0,1,0,0,0,0
34,Cruise LLC,Cruise,01/13/2021,5:32,0,1,2020,Chevrolet,Bolt,201 Grant Av,...,0,0,0,0,0,0,0,0,0,0


In [138]:
# Fresh locations dataframe
collision_locations = auto_collisions_df[geo_only]
collision_locations.head()

Unnamed: 0,Operator,Business,Date_of_Accident,Time_of_accident,AM,PM,location,city,county,state,zip
1,Apollo Autonomous Driving USA LLC,Apollo Autonomous Driving USA LLC,06/29/2023,3:39,0,1,Moraga St near 34th Ave,San Francisco,San Francisco,CA,94122
4,Apple Inc.,Apple Inc.,01/26/2024,11:06,1,0,I-580E and I-238 Junction,Castro Valley,Alameda,CA,94546
13,Apple Inc.,Apple Inc.,09/27/2021,10:53,1,0,N Mathilda Ave and Del Rey Ave,Sunnyvale,Santa Clara,CA,94085
21,Apple Inc.,Apple Inc.,09/26/2023,11:33,1,0,I-80 Westbound,Emeryville,Alameda,CA,94608
22,Apple Inc.,Apple Inc.,09/29/2023,2:58,0,1,US 101S near Capitol Expressway,San Jose,Santa Clara,CA,95121


In [139]:
# Rename columns:
collision_locations.rename(columns=rename_dict, inplace=True)

# Fix date times
collision_locations['Date_of_Accident'] = pd.to_datetime(collision_locations['Date_of_Accident'], errors='coerce')

# Get the initial number of rows
initial_rows = collision_locations.shape[0]

# Drop NaN values in 'Time_of_accident' column with an empty string
collision_locations.dropna(subset=['Time_of_accident'], inplace=True)
# Find additional bad time data
# unique_times = collision_locations['Time_of_accident'].astype(str).unique()
# Sort the unique times and print them
# sorted_unique_times = sorted(unique_times)
# print(sorted_unique_times)
# Drop Zoox 06_08_2022 where no time is entered.
collision_locations = collision_locations[collision_locations['Time_of_accident'] != 0]

# Calculate the number of rows dropped
rows_dropped = initial_rows - collision_locations.shape[0]
print(f"Number of rows dropped: {rows_dropped}")


# Fix Woven Planet Empty fields
collision_locations.loc[collision_locations['Operator'] == 0, ["Operator", "Business"]] = "Woven Planet"

# Fix Ghost Autonomy fields
collision_locations.loc[collision_locations['Business'] == 'same', ["Operator", "Business"]] = "Ghost Autonomy"






Number of rows dropped: 1




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [140]:

# Convert 'Time_of_accident' column to string type
collision_locations['Time_of_accident'] = collision_locations['Time_of_accident'].astype(str)
collision_locations[['hour', 'minute']] = collision_locations['Time_of_accident'].str.extract(r'(\d+):(\d+)')
collision_locations[['hour', 'minute']] = collision_locations[['hour', 'minute']].astype(int)

# Add 12 hours to hour component if PM is True and hour is less than 12
collision_locations['hour'] = collision_locations.apply(lambda row: row['hour'] + 12 if row['PM'] == 1 and row['hour'] < 12 else row['hour'], axis=1)

#Rehoin hours and mins into 24hr time
collision_locations['Time_of_accident_24hr'] = pd.to_datetime(collision_locations[['hour', 'minute']].astype(str).agg(':'.join, axis=1), format='%H:%M').dt.time
# Drop calc fileds
collision_locations.drop(columns=(['Time_of_accident', 'AM', 'PM', 'hour', 'minute']),inplace=True)

collision_locations.head(10)






Unnamed: 0,Operator,Business,Date_of_Accident,location,city,county,state,zip,Time_of_accident_24hr
1,Apollo Autonomous Driving USA LLC,Apollo Autonomous Driving USA LLC,2023-06-29,Moraga St near 34th Ave,San Francisco,San Francisco,CA,94122,15:39:00
4,Apple Inc.,Apple Inc.,2024-01-26,I-580E and I-238 Junction,Castro Valley,Alameda,CA,94546,11:06:00
13,Apple Inc.,Apple Inc.,2021-09-27,N Mathilda Ave and Del Rey Ave,Sunnyvale,Santa Clara,CA,94085,10:53:00
21,Apple Inc.,Apple Inc.,2023-09-26,I-80 Westbound,Emeryville,Alameda,CA,94608,11:33:00
22,Apple Inc.,Apple Inc.,2023-09-29,US 101S near Capitol Expressway,San Jose,Santa Clara,CA,95121,14:58:00
25,Argo AI,Argo AI,2021-07-08,Stanford Ave/Bowdoin St,Palo Alto,Santa Clara,CA,94304,09:18:00
27,Argo AI,Argo AI,2022-09-22,SE Park Avenue,Palo Alto,Santa Clara,CA,94306,17:24:00
28,Argo AI,Argo AI,2021-10-19,Grant Ave/El Camino Real,Palo Alto,Santa Clara,CA,94306,14:25:00
31,"Beep, Inc.","Beep, Inc.",2023-08-23,"9th St at Seven Seas Ave, Treasure Island",San Francisco,San Francisco,CA,94130,17:54:00
34,Cruise LLC,Cruise,2021-01-13,201 Grant Av,San Francisco,San Francisco,CA,94108,17:32:00


In [141]:
# export as csv
# Specify the full path where you want to save the CSV file
csv_file_path = 'data/collisions/dataframes/locations/crash_locations.csv'

# Save DataFrame to CSV
collision_locations.to_csv(csv_file_path, index=False)
collision_locations.tail(10)


Unnamed: 0,Operator,Business,Date_of_Accident,location,city,county,state,zip,Time_of_accident_24hr
552,Zoox,Zoox,2022-11-14,Folsom Street and 8th Street,San Francisco,San Francisco,CA,94103,20:22:00
553,Zoox,Zoox,2022-11-28,11th St. and Harrison St. Intersection,San Francisco,San Francisco,CA,94103,22:18:00
554,Zoox,Zoox,2024-01-01,On Capp Street at 24th Street,San Francisco,San Francisco,CA,94110,21:13:00
555,Zoox,Zoox,2024-01-01,On Capp Street at 24th Street,San Francisco,San Francisco,CA,94110,21:13:00
558,Zoox,Zoox,2024-01-15,On Stockton St near Union St,San Francisco,San Francisco,CA,94133,14:12:00
559,Zoox,Zoox,2024-02-22,On Mission St at 9th St,San Francisco,San Francisco,CA,94103,11:00:00
561,Zoox,Zoox,2023-04-11,Kearny Street and Commercial Street Intersection,San Francisco,San Francisco,CA,94108,11:19:00
562,Zoox Inc.,Zoox Inc.,2023-05-17,13th Street and Harrison Street,San Francisco,San Francisco,CA,94103,11:40:00
564,Zoox,Zoox,2022-06-01,"Pacific Ave., Kearny St., Columbus Ave. Inters...",San Francisco,San Francisco,CA,94113,13:15:00
566,Zoox,Zoox,2023-09-11,At 7th Street and Folsom Street,San Francisco,San Francisco,CA,94103,15:20:00


In [142]:
# Fresh locations dataframe
descriptions_df = auto_collisions_df[description_cols]
descriptions_df.head()

Unnamed: 0,location,In_autonomy,Description
1,Moraga St near 34th Ave,1,"On June 29, 2023 at 3:39 PM PDT, an Apollo Aut..."
4,I-580E and I-238 Junction,1,"On January 26th, a test vehicle operating with..."
13,N Mathilda Ave and Del Rey Ave,1,"A test vehicle, operating in autonomous mode i..."
21,I-80 Westbound,1,"On September 26th, a test vehicle operating wi..."
22,US 101S near Capitol Expressway,1,"On September 29th, a test vehicle operating wi..."


In [143]:
descriptions_df.rename(columns=rename_dict, inplace=True)

descriptions_df.to_csv('data/collisions/dataframes/descriptions/descriptions.csv')




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# @JackP TODO: Collision heatmap and severity

In [None]:
# TODO: Collisions & disengagements grouped bar by mfg (on main)
