# NYC traffic accidents over a 4 year period 
## Filter and Subset

Download <a href="https://www.dropbox.com/s/585wrgl08djzlyt/accidents-nyc.csv?dl=0">this dataset</a> stored on dropbox.

In [5]:
## import necessary libraries
import pandas as pd

In [6]:
## read the dataset into notebook
accidents = pd.read_csv("accidents-nyc.csv")
accidents.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2
0,4/13/21,21:35,BROOKLYN,1,0,1,0,0,0,0,0,Unspecified,,4407147,Sedan,
1,4/13/21,16:00,BROOKLYN,0,0,0,0,0,0,0,0,Following Too Closely,Unspecified,4407811,Sedan,
2,4/13/21,17:30,QUEENS,0,0,0,0,0,0,0,0,Driver Inattention/Distraction,Unspecified,4408019,Sedan,Sedan
3,4/11/21,21:06,BROOKLYN,1,0,1,0,0,0,0,0,Passing Too Closely,,4406488,Taxi,
4,4/15/21,20:00,STATEN ISLAND,0,0,0,0,0,0,0,0,Unspecified,,4408310,Sedan,


In [7]:
## see the overall info about this dataset
accidents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282873 entries, 0 to 282872
Data columns (total 16 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   CRASH DATE                     282873 non-null  object
 1   CRASH TIME                     282873 non-null  object
 2   BOROUGH                        282873 non-null  object
 3   NUMBER OF PERSONS INJURED      282873 non-null  int64 
 4   NUMBER OF PERSONS KILLED       282873 non-null  int64 
 5   NUMBER OF PEDESTRIANS INJURED  282873 non-null  int64 
 6   NUMBER OF PEDESTRIANS KILLED   282873 non-null  int64 
 7   NUMBER OF CYCLIST INJURED      282873 non-null  int64 
 8   NUMBER OF CYCLIST KILLED       282873 non-null  int64 
 9   NUMBER OF MOTORIST INJURED     282873 non-null  int64 
 10  NUMBER OF MOTORIST KILLED      282873 non-null  int64 
 11  CONTRIBUTING FACTOR VEHICLE 1  281489 non-null  object
 12  CONTRIBUTING FACTOR VEHICLE 2  224591 non-nu

In [8]:
## create a series of crash dates.
accidents["CRASH DATE"]


0         4/13/21
1         4/13/21
2         4/13/21
3         4/11/21
4         4/15/21
           ...   
282868     1/1/19
282869     1/1/19
282870     1/1/19
282871     1/1/19
282872     1/1/19
Name: CRASH DATE, Length: 282873, dtype: object

In [9]:
## Which borough had the most crashes?
accidents ["BOROUGH"].value_counts().head()

BROOKLYN         95099
QUEENS           80085
BRONX            50123
MANHATTAN        48864
STATEN ISLAND     8702
Name: BOROUGH, dtype: int64

In [10]:
## which type of vehicle was primary vehicle involved in crashes?
## SHOW ONLY THE TOP 7

accidents["VEHICLE TYPE CODE 1"].value_counts().head(7)

Sedan                                  129987
Station Wagon/Sport Utility Vehicle    102850
Taxi                                    10647
Pick-up Truck                            7183
Box Truck                                5504
Bus                                      4697
Bike                                     3177
Name: VEHICLE TYPE CODE 1, dtype: int64

In [11]:
## What were a FIVE unusual primary vehicles to get into a crash?
accidents["VEHICLE TYPE CODE 1"].value_counts().tail(5) 

SLINGSHOT     1
CHEVY EXPR    1
Go kart       1
FDNY Engin    1
MAC T         1
Name: VEHICLE TYPE CODE 1, dtype: int64

In [12]:
## create a subset of data for only Queens
## place it in a dataframe called df_q

df_q = accidents["BOROUGH"] == "QUEENS"
df_q

0         False
1         False
2          True
3         False
4         False
          ...  
282868    False
282869    False
282870    False
282871    False
282872    False
Name: BOROUGH, Length: 282873, dtype: bool

In [23]:
## CHALLENGE (as in you have to google this)
## How many people were killed in Queens in accidents?

my_q_killed1 = accidents["NUMBER OF PERSONS KILLED"] >=1
my_q_killed2 = accidents ["BOROUGH"] == "QUEENS"
killed_queens= accidents [my_q_killed1 & my_q_killed2]

killed_queens['NUMBER OF PERSONS KILLED'].sum()      

120

In [25]:
## Same
## how many cyclists were killed in Queens?

my_q_cy_killed1 = accidents["NUMBER OF CYCLIST KILLED"] >=1
my_q_cy_killed2 = accidents ["BOROUGH"] == "QUEENS"
cy_killed_queens = accidents [my_q_cy_killed1 & my_q_cy_killed2]

cy_killed_queens['NUMBER OF CYCLIST KILLED'].sum()

8

In [31]:
## Filter and subset 
## create a dataset for Manhattan that involved taxi cabs as the primary vehicle cause

manhattan1 = accidents["BOROUGH"] == "MANHATTAN"
manhattan2 = accidents ["VEHICLE TYPE CODE 1"] == "Taxi"

taxi_manhattan = accidents [manhattan1 & manhattan2]
taxi_manhattan


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2
67,4/14/21,8:03,MANHATTAN,1,0,1,0,0,0,0,0,Driver Inattention/Distraction,,4407277,Taxi,
144,4/14/21,0:42,MANHATTAN,0,0,0,0,0,0,0,0,Unspecified,Unspecified,4407278,Taxi,Sedan
159,4/16/21,19:54,MANHATTAN,0,0,0,0,0,0,0,0,Unspecified,,4407959,Taxi,
283,4/16/21,21:04,MANHATTAN,0,0,0,0,0,0,0,0,Driver Inattention/Distraction,,4408288,Taxi,
326,4/16/21,16:15,MANHATTAN,0,0,0,0,0,0,0,0,Driver Inattention/Distraction,Unspecified,4408069,Taxi,Bus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282632,1/1/19,1:30,MANHATTAN,0,0,0,0,0,0,0,0,Other Vehicular,Driver Inattention/Distraction,4060445,Taxi,Station Wagon/Sport Utility Vehicle
282684,1/1/19,16:00,MANHATTAN,2,0,0,0,0,0,2,0,Traffic Control Disregarded,Unspecified,4061524,Taxi,Station Wagon/Sport Utility Vehicle
282802,1/1/19,16:15,MANHATTAN,0,0,0,0,0,0,0,0,Passenger Distraction,Passing Too Closely,4060796,Taxi,Sedan
282819,1/1/19,20:30,MANHATTAN,0,0,0,0,0,0,0,0,Unspecified,,4060662,Taxi,


In [33]:
accidents[accidents["BOROUGH"]=="MANHATTAN"]
accidents[accidents["VEHICLE TYPE CODE 1"]=="Taxi"]

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2
3,4/11/21,21:06,BROOKLYN,1,0,1,0,0,0,0,0,Passing Too Closely,,4406488,Taxi,
67,4/14/21,8:03,MANHATTAN,1,0,1,0,0,0,0,0,Driver Inattention/Distraction,,4407277,Taxi,
128,4/16/21,15:21,BROOKLYN,0,0,0,0,0,0,0,0,Unspecified,Unspecified,4407997,Taxi,
139,4/14/21,10:40,BROOKLYN,1,0,0,0,0,0,1,0,Failure to Yield Right-of-Way,Unsafe Speed,4407548,Taxi,Sedan
144,4/14/21,0:42,MANHATTAN,0,0,0,0,0,0,0,0,Unspecified,Unspecified,4407278,Taxi,Sedan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282684,1/1/19,16:00,MANHATTAN,2,0,0,0,0,0,2,0,Traffic Control Disregarded,Unspecified,4061524,Taxi,Station Wagon/Sport Utility Vehicle
282796,1/1/19,3:50,BROOKLYN,0,0,0,0,0,0,0,0,Following Too Closely,Unspecified,4060770,Taxi,Sedan
282802,1/1/19,16:15,MANHATTAN,0,0,0,0,0,0,0,0,Passenger Distraction,Passing Too Closely,4060796,Taxi,Sedan
282819,1/1/19,20:30,MANHATTAN,0,0,0,0,0,0,0,0,Unspecified,,4060662,Taxi,


In [34]:
## What were the top 5 causes of accidents across all the boroughs?
## by primary vehicle cause

accidents["CONTRIBUTING FACTOR VEHICLE 1"].value_counts().head()


Unspecified                       78494
Driver Inattention/Distraction    70615
Failure to Yield Right-of-Way     20691
Following Too Closely             14407
Backing Unsafely                  13348
Name: CONTRIBUTING FACTOR VEHICLE 1, dtype: int64

In [35]:
## What were the top 5 causes of accidents across all the boroughs?
## by secondary vehicle cause

accidents["CONTRIBUTING FACTOR VEHICLE 2"].value_counts().head()

Unspecified                       190456
Driver Inattention/Distraction     14186
Other Vehicular                     3529
Failure to Yield Right-of-Way       2233
Passing or Lane Usage Improper      2171
Name: CONTRIBUTING FACTOR VEHICLE 2, dtype: int64

In [46]:
## What were the 5 RAREST causes for primary vehicles causing the accident
rare_accidents= accidents["CONTRIBUTING FACTOR VEHICLE 1"].value_counts().tail()
rare_accidents


Shoulders Defective/Improper    13
Texting                          8
Cell Phone (hands-free)          8
Windshield Inadequate            3
Listening/Using Headphones       2
Name: CONTRIBUTING FACTOR VEHICLE 1, dtype: int64

In [62]:
#THIS IS A PERSONAL NOTE, NOT INCLUDED IN THE HOMEWORK
# This is a LIST of the CONTRIBUTING FACTOR VEHICLE 1

list_single_causes = accidents["CONTRIBUTING FACTOR VEHICLE 1"]
list_single_causes



0                            Unspecified
1                  Following Too Closely
2         Driver Inattention/Distraction
3                    Passing Too Closely
4                            Unspecified
                       ...              
282868    Driver Inattention/Distraction
282869                  Steering Failure
282870                       Unspecified
282871               Passing Too Closely
282872    Driver Inattention/Distraction
Name: CONTRIBUTING FACTOR VEHICLE 1, Length: 282873, dtype: object

In [61]:
## list ALL the causes as unique values (in other words, create a list of the causes)
## WHAT ARE SOME UNUSUAL REASONS FOR ACCIDENTS?

# A set is a list that only accepts unique values 

single_causes = set()
for a in accidents["CONTRIBUTING FACTOR VEHICLE 1"]:
    single_causes.add (a)
    
single_causes

{'Accelerator Defective',
 'Aggressive Driving/Road Rage',
 'Alcohol Involvement',
 'Animals Action',
 'Backing Unsafely',
 'Brakes Defective',
 'Cell Phone (hand-Held)',
 'Cell Phone (hands-free)',
 'Driver Inattention/Distraction',
 'Driver Inexperience',
 'Driverless/Runaway Vehicle',
 'Drugs (illegal)',
 'Eating or Drinking',
 'Failure to Keep Right',
 'Failure to Yield Right-of-Way',
 'Fatigued/Drowsy',
 'Fell Asleep',
 'Following Too Closely',
 'Glare',
 'Headlights Defective',
 'Illnes',
 'Lane Marking Improper/Inadequate',
 'Listening/Using Headphones',
 'Lost Consciousness',
 'Obstruction/Debris',
 'Other Electronic Device',
 'Other Lighting Defects',
 'Other Vehicular',
 'Outside Car Distraction',
 'Oversized Vehicle',
 'Passenger Distraction',
 'Passing Too Closely',
 'Passing or Lane Usage Improper',
 'Pavement Defective',
 'Pavement Slippery',
 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion',
 'Physical Disability',
 'Prescription Medication',
 'Reaction to Uninvol