In [375]:
import pandas as pd
import numpy as np
from scipy import stats

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt

In [376]:
# Read the raw spreadsheet into a DataFrame, then preview a single row
# to confirm the load worked and to see the column layout.
df = pd.read_excel(io='data.xlsx')
df.head(n=1)

Unnamed: 0,Victim's name,Victim's age,Victim's gender,Victim's race,URL of image of victim,Date of Incident (month/day/year),Street Address of Incident,City,State,Zipcode,...,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48
0,Kalon Horton,29,Male,Black,,2021-05-09,,Lancaster,TX,,...,,,,,,,,,,


## First Steps of Cleaning:
- Columns 35–48 are unnamed and full of nulls: drop them
- The original column names are long and contain spaces and punctuation, which makes them awkward to use in Python: rename the remaining columns to short snake_case names


In [377]:
# Keep only the first 35 columns. Everything from position 35 onward is
# described in the notes above as unnamed and full of nulls.
# Reassign instead of using inplace=True: the cell then reads as a plain
# transformation, and the Index slice can be passed to drop() directly
# without wrapping it in list().
df = df.drop(columns=df.columns[35:])

In [378]:
# Show the remaining original headers as a plain Python list.
df.columns.tolist()

["Victim's name",
 "Victim's age",
 "Victim's gender",
 "Victim's race",
 'URL of image of victim',
 'Date of Incident (month/day/year)',
 'Street Address of Incident',
 'City',
 'State',
 'Zipcode',
 'County',
 'Agency responsible for death',
 'ORI Agency Identifier (if available)',
 'Cause of death',
 'A brief description of the circumstances surrounding the death',
 'Official disposition of death (justified or other)',
 'Criminal Charges?',
 'Link to news article or photo of official document',
 'Symptoms of mental illness?',
 'Armed/Unarmed Status',
 'Alleged Weapon (Source: WaPo and Review of Cases Not Included in WaPo Database)',
 'Alleged Threat Level (Source: WaPo)',
 'Fleeing (Source: WaPo)',
 'Body Camera (Source: WaPo)',
 'WaPo ID (If included in WaPo database)',
 'Off-Duty Killing?',
 'Geography (via Trulia methodology based on zipcode population density: http://jedkolko.com/wp-content/uploads/2015/05/full-ZCTA-urban-suburban-rural-classification.xlsx )',
 'MPV ID',
 'Fatal

In [379]:
# Short snake_case replacements for the 35 remaining spreadsheet headers.
# Order matters: the list is assigned to df.columns positionally, so each
# entry must line up with the original header in the same position.
columns = [
    # victim
    'name', 'age', 'gender', 'race', 'img_url',
    # when and where
    'date', 'address', 'city', 'state', 'zipcode', 'county',
    # agency, cause and outcome
    'agency_responsible', 'ori_agency_identifier', 'cause_of_death',
    'description_of_circumstances', 'official_disposition',
    'criminal_charges_filed', 'news_article_or_photo_of_official_document',
    # circumstances of the encounter
    'mental_illness', 'armed_unarmed_status', 'alleged_weapon',
    'alleged_threat_lvl', 'fleeing', 'body_camera', 'wapo_id',
    'off_duty_killing', 'geography',
    # cross-database identifiers
    'mpv_id', 'fatal_encounters_id',
    # fields marked as draft in the source spreadsheet names
    'encounter_type_draft', 'initial_reported_reason_for_encounter_draft',
    'names_of_officers_involved_draft', 'race_of_officers_involved_draft',
    'known_past_shootings_of_Officer_draft', 'call_for_service_draft',
]


In [380]:
# Assign the new names positionally: `columns` must hold exactly one entry
# per DataFrame column, in the same order as the original headers.
df.columns = columns

In [381]:
# Preview the first five rows to confirm the new column names took effect.
df.head(n=5)

Unnamed: 0,name,age,gender,race,img_url,date,address,city,state,zipcode,...,off_duty_killing,geography,mpv_id,fatal_encounters_id,encounter_type_draft,initial_reported_reason_for_encounter_draft,names_of_officers_involved_draft,race_of_officers_involved_draft,known_past_shootings_of_Officer_draft,call_for_service_draft
0,Kalon Horton,29.0,Male,Black,,2021-05-09,,Lancaster,TX,,...,,,,,Part 1 Violent Crime,gunshots,,,,Yes
1,Name withheld by police,,Male,Unknown Race,,2021-05-09,90 S. Main St.,Leicester,MA,1524.0,...,,Suburban,,,Other,deliberate car crash,,,,Yes
2,Jeffrey Mark Murray,62.0,Male,Unknown Race,,2021-05-09,,Greenville,SC,,...,,,,,Part 1 Violent Crime,murder,,,,Yes
3,Everton Brown,56.0,Male,Unknown Race,,2021-05-08,,Baltimore,MD,,...,,,,,Part 1 Violent Crime,murder,,,,Yes
4,Felix Jerry Marquez,34.0,Male,Hispanic,,2021-05-08,,Riverside,CA,,...,,,,,Person with a gun,person with a gun,,,,No


In [382]:
# Summarize the renamed frame: row count plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9147 entries, 0 to 9146
Data columns (total 35 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   name                                         9147 non-null   object        
 1   age                                          9138 non-null   object        
 2   gender                                       9137 non-null   object        
 3   race                                         9147 non-null   object        
 4   img_url                                      4882 non-null   object        
 5   date                                         9147 non-null   datetime64[ns]
 6   address                                      9042 non-null   object        
 7   city                                         9138 non-null   object        
 8   state                                        9147 non-null   object        
 9

## Next obvious data types:
- `age` is stored as `object`: convert it to int
    - inspect the `age` value counts first... why are some values null?

In [383]:
# List every record whose age is missing.
df[df['age'].isna()]

Unnamed: 0,name,age,gender,race,img_url,date,address,city,state,zipcode,...,off_duty_killing,geography,mpv_id,fatal_encounters_id,encounter_type_draft,initial_reported_reason_for_encounter_draft,names_of_officers_involved_draft,race_of_officers_involved_draft,known_past_shootings_of_Officer_draft,call_for_service_draft
1,Name withheld by police,,Male,Unknown Race,,2021-05-09,90 S. Main St.,Leicester,MA,1524.0,...,,Suburban,,,Other,deliberate car crash,,,,Yes
53,Name withheld by police,,Male,Unknown Race,,2021-04-17,1100 McVicar Ave.,Kingman,AZ,86409.0,...,,Rural,,30061.0,Traffic Stop,traffic stop,,,,No
313,Harmony Wolfgram,,Female,Unknown Race,,2021-01-26,E. 25th Ave. and Imboden Rd.,Aurora,CO,80137.0,...,,Rural,8929.0,29530.0,Other Non-Violent Offense,Suspect (stolen vehicle),,,,No
320,Steven Verdone,,Male,White,https://fatalencounters.org/wp-content/uploads...,2021-01-22,6241 W. Cardinal St.,Homosassa,FL,34446.0,...,,Suburban,8933.0,29525.0,Mental Health/Welfare Check,Erratic behavior,,,,Yes
375,Amanda Faulkner,,Female,White,,2021-01-04,1400 Mooney Rd.,Columbiana,AL,35051.0,...,,Rural,8988.0,29432.0,Domestic Disturbance,Domestic Disturbance,,,,Yes
383,Rodolfo Caraballo Moreno,,Male,Hispanic,,2020-12-31,Southwest 38th Avenue and 28th Street,Miami,FL,33133.0,...,,Urban,8996.0,,Part 1 Violent Crime,Shooting,,,,Yes
409,John Moreno,,Male,Hispanic,,2020-12-25,1030 Abanico Court,Rio Rico,AZ,85648.0,...,,Suburban,9019.0,29370.0,Part 1 Violent Crime,"Home intrusion, vehicular assault, aggravated ...",,,,Yes
516,Name withheld by police,,Male,Black,,2020-11-15,Van Wick St. and Van Ness Ave.,Inglewood,CA,90303.0,...,,Urban,9052.0,29175.0,Person with a Weapon (gun),"Person with a gun, erratic behavior",,,,Yes
525,Tracey Leon McKinney,,Male,Black,,2020-11-13,Glover Ave and Adams Ave,Gulfport,MS,39507.0,...,,Suburban,9053.0,29169.0,Part 1 Violent Crime,Shooting,,,,Yes


In [390]:
# Row 313 (Harmony Wolfgram): age 41, according to Westword News.
df.at[313, 'age'] = 41

In [391]:
# Row 383 (Rodolfo Caraballo Moreno): age 57, according to his obituary.
df.at[383, 'age'] = 57

In [392]:
# Row 409 (John Moreno): age 32, according to Nogales International.
df.at[409, 'age'] = 32

In [393]:
# Row 525 (Tracey Leon McKinney): age 41, according to his obituary.
df.at[525, 'age'] = 41

In [396]:
# Re-check which records still have no age after the manual fills.
df[df['age'].isna()]

Unnamed: 0,name,age,gender,race,img_url,date,address,city,state,zipcode,...,off_duty_killing,geography,mpv_id,fatal_encounters_id,encounter_type_draft,initial_reported_reason_for_encounter_draft,names_of_officers_involved_draft,race_of_officers_involved_draft,known_past_shootings_of_Officer_draft,call_for_service_draft
1,Name withheld by police,,Male,Unknown Race,,2021-05-09,90 S. Main St.,Leicester,MA,1524.0,...,,Suburban,,,Other,deliberate car crash,,,,Yes
53,Name withheld by police,,Male,Unknown Race,,2021-04-17,1100 McVicar Ave.,Kingman,AZ,86409.0,...,,Rural,,30061.0,Traffic Stop,traffic stop,,,,No
320,Steven Verdone,,Male,White,https://fatalencounters.org/wp-content/uploads...,2021-01-22,6241 W. Cardinal St.,Homosassa,FL,34446.0,...,,Suburban,8933.0,29525.0,Mental Health/Welfare Check,Erratic behavior,,,,Yes
375,Amanda Faulkner,,Female,White,,2021-01-04,1400 Mooney Rd.,Columbiana,AL,35051.0,...,,Rural,8988.0,29432.0,Domestic Disturbance,Domestic Disturbance,,,,Yes
516,Name withheld by police,,Male,Black,,2020-11-15,Van Wick St. and Van Ness Ave.,Inglewood,CA,90303.0,...,,Urban,9052.0,29175.0,Person with a Weapon (gun),"Person with a gun, erratic behavior",,,,Yes


In [403]:
# Row 1 (Leicester, MA): the D.A. released the victim's name and age,
# so replace the "Name withheld by police" placeholder and fill the age.
df.at[1, 'name'] = 'Zachary Richardson'
df.at[1, 'age'] = 24

In [404]:
# Row 53 (Kingman, AZ): Kingman PD identified the victim as 29-year-old
# Kingman resident Bradley Michael Rose; fill in both name and age.
df.at[53, 'name'] = 'Bradley Michael Rose'
df.at[53, 'age'] = 29

In [405]:
# Re-check which records still have no age.
df[df['age'].isna()]

Unnamed: 0,name,age,gender,race,img_url,date,address,city,state,zipcode,...,off_duty_killing,geography,mpv_id,fatal_encounters_id,encounter_type_draft,initial_reported_reason_for_encounter_draft,names_of_officers_involved_draft,race_of_officers_involved_draft,known_past_shootings_of_Officer_draft,call_for_service_draft
320,Steven Verdone,,Male,White,https://fatalencounters.org/wp-content/uploads...,2021-01-22,6241 W. Cardinal St.,Homosassa,FL,34446.0,...,,Suburban,8933.0,29525.0,Mental Health/Welfare Check,Erratic behavior,,,,Yes
375,Amanda Faulkner,,Female,White,,2021-01-04,1400 Mooney Rd.,Columbiana,AL,35051.0,...,,Rural,8988.0,29432.0,Domestic Disturbance,Domestic Disturbance,,,,Yes
516,Name withheld by police,,Male,Black,,2020-11-15,Van Wick St. and Van Ness Ave.,Inglewood,CA,90303.0,...,,Urban,9052.0,29175.0,Person with a Weapon (gun),"Person with a gun, erratic behavior",,,,Yes


In [406]:
# Row 320 (Steven Verdone): age set to 57.
# NOTE(review): no source was recorded for this value — confirm and cite it.
df.at[320, 'age'] = 57

In [407]:
# Row 375 (Amanda Faulkner): she was 48 years old.
df.at[375, 'age'] = 48

In [408]:
# Final check: only records still missing an age are shown.
df[df['age'].isna()]

Unnamed: 0,name,age,gender,race,img_url,date,address,city,state,zipcode,...,off_duty_killing,geography,mpv_id,fatal_encounters_id,encounter_type_draft,initial_reported_reason_for_encounter_draft,names_of_officers_involved_draft,race_of_officers_involved_draft,known_past_shootings_of_Officer_draft,call_for_service_draft
516,Name withheld by police,,Male,Black,,2020-11-15,Van Wick St. and Van Ness Ave.,Inglewood,CA,90303.0,...,,Urban,9052.0,29175.0,Person with a Weapon (gun),"Person with a gun, erratic behavior",,,,Yes


In [409]:
# Row 516: the name was not released and the age is only known as a
# 40-45 range, so use 43 as an approximate midpoint.
df.at[516, 'age'] = 43

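## Next steps:
- explore value counts of features to potentially clear nulls, combine values, etc.
- convert `age` to an integer dtype now that its nulls are filled (first check for non-numeric entries, since the column is currently `object`)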

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=258e71fc-cf2b-48c3-8461-70ecd9787aa1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>