# Analysis of Deaths Involving Police in the US

This notebook analyzes police-related fatalities in the United States since January 1, 2015, 
and explores relationships with various US Census demographic data (poverty rate, high school 
graduation rate, median household income, and racial demographics).

In [None]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide presentation defaults.
sns.set(style='whitegrid')  # seaborn theme with a light background grid
pd.options.display.float_format = '{:,.2f}'.format  # floats print with thousands separators, 2 decimals

In [None]:
# Load data: four census extracts plus the fatalities log, all read as Windows-1252.
(
    df_hh_income,
    df_pct_poverty,
    df_pct_completed_hs,
    df_share_race_city,
    df_fatalities,
) = (
    pd.read_csv(csv_path, encoding="windows-1252")
    for csv_path in (
        'Median_Household_Income_2015.csv',
        'Pct_People_Below_Poverty_Level.csv',
        'Pct_Over_25_Completed_High_School.csv',
        'Share_of_Race_By_City.csv',
        'Deaths_by_Police_US.csv',
    )
)

In [None]:
# Preliminary Data Exploration
# One row per dataset: (shape label, null-report label, duplicate-report label, frame).
dataset_reports = [
    ("Median Household Income:", "Income nulls:\n", "Income duplicates:", df_hh_income),
    ("Poverty Rate:", "Poverty nulls:\n", "Poverty duplicates:", df_pct_poverty),
    ("High School Completion Rate:", "HS Completion nulls:\n", "HS Completion duplicates:", df_pct_completed_hs),
    ("Share of Race by City:", "Race Share nulls:\n", "Race Share duplicates:", df_share_race_city),
    ("Police Fatalities:", "Fatalities nulls:\n", "Fatalities duplicates:", df_fatalities),
]

print("Shapes of DataFrames:")
for shape_label, null_label, dup_label, frame in dataset_reports:
    print(shape_label, frame.shape)

# Preview the first rows of the income and fatalities tables only
for heading, frame in (
    ("\nHead of Median Household Income:", df_hh_income),
    ("\nHead of Police Fatalities:", df_fatalities),
):
    print(heading)
    print(frame.head())

# Per-column count of missing values
print("\nMissing values in each dataset:")
for shape_label, null_label, dup_label, frame in dataset_reports:
    print(null_label, frame.isnull().sum())

# Number of fully duplicated rows
print("\nDuplicate entries:")
for shape_label, null_label, dup_label, frame in dataset_reports:
    print(dup_label, frame.duplicated().sum())

In [None]:
# Data Cleaning - Missing Values and Duplicates
# Identifier columns (original and derived) that must stay textual. Every other
# census column is treated as a measurement.
# NOTE(review): assumes all non-identifier census columns are meant to be numeric - confirm.
census_id_columns = ['Geographic Area', 'Geographic area', 'City', 'city_clean', 'state']

for df in [df_hh_income, df_pct_poverty, df_pct_completed_hs, df_share_race_city]:
    # Drop exact duplicate rows first, while the raw values are still intact.
    df.drop_duplicates(inplace=True)
    measure_cols = df.columns.difference(census_id_columns)
    # Coerce measurements to numbers; anything unparseable becomes NaN.
    # Missing values are deliberately NOT filled with 0: a 0 would be read as a
    # genuine 0% rate / $0 income and bias every mean computed later, whereas
    # pandas aggregations simply skip NaN.
    df[measure_cols] = df[measure_cols].apply(pd.to_numeric, errors='coerce')

# The fatalities log only needs de-duplication here
df_fatalities.drop_duplicates(inplace=True)

# Standardize city names in census data: remove suffixes like ' city', ' town', ' CDP'
def clean_city_name(name):
    """Normalise a census place name so it can be matched across datasets.

    The value is converted with ``str()``, lower-cased and trimmed, and then at
    most one trailing place-type designation (' city', ' town', ' cdp',
    ' village', ' municipality') is removed.

    Args:
        name: Raw place name; any object is accepted and passed through ``str()``.

    Returns:
        The lower-cased name without its trailing designation and without
        surrounding whitespace.
    """
    name = str(name).lower().strip()
    for suffix in (' city', ' town', ' cdp', ' village', ' municipality'):
        if name.endswith(suffix):
            # Slice off only the trailing designation. str.replace() would also
            # delete interior matches ('oklahoma city city' -> 'oklahoma'), and
            # continuing the loop could strip a word that belongs to the real
            # name ('mountain village town' -> 'mountain').
            name = name[:-len(suffix)]
            break
    return name.strip()

# Add normalised 'city_clean' and lower-cased 'state' columns to every census frame.
# The race-share file names its state column 'Geographic area' (lower-case "area"),
# unlike the other three files.
for census_frame, state_column in (
    (df_hh_income, 'Geographic Area'),
    (df_pct_poverty, 'Geographic Area'),
    (df_pct_completed_hs, 'Geographic Area'),
    (df_share_race_city, 'Geographic area'),
):
    census_frame['city_clean'] = census_frame['City'].apply(clean_city_name)
    census_frame['state'] = census_frame[state_column].str.lower()

# Fatalities city names are only lower-cased here (no suffix stripping)
df_fatalities['city_clean'] = df_fatalities['city'].str.lower()

In [None]:
# Chart the Poverty Rate in each US State
# Average the place-level rates within each state, then rank from highest to lowest.
state_poverty = (
    df_pct_poverty
    .groupby('Geographic Area')['poverty_rate']
    .mean()
    .reset_index()
)
state_poverty_sorted = state_poverty.sort_values('poverty_rate', ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=state_poverty_sorted,
    x='poverty_rate',
    y='Geographic Area',
    palette='viridis',
    ax=ax,
)
ax.set_xlabel('Poverty Rate (%)')
ax.set_ylabel('State')
ax.set_title('Poverty Rate by State (2015)')
fig.tight_layout()
plt.show()

In [None]:
# Chart the High School Graduation Rate by US State
# Average the place-level completion rates within each state, then rank from highest to lowest.
state_hs = (
    df_pct_completed_hs
    .groupby('Geographic Area')['percent_completed_hs']
    .mean()
    .reset_index()
)
state_hs_sorted = state_hs.sort_values('percent_completed_hs', ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=state_hs_sorted,
    x='percent_completed_hs',
    y='Geographic Area',
    palette='magma',
    ax=ax,
)
ax.set_xlabel('High School Graduation Rate (%)')
ax.set_ylabel('State')
ax.set_title('High School Graduation Rate by State (2015)')
fig.tight_layout()
plt.show()

In [None]:
# Visualise the Relationship between Poverty Rates and High School Graduation Rates
# Merge the state-level poverty and HS tables and order states by poverty rate
state_merged = (
    pd.merge(state_poverty, state_hs, on='Geographic Area')
    .sort_values('poverty_rate')
)

# Dual-axis line chart: poverty on the left axis, graduation rate on the right
fig, ax1 = plt.subplots(figsize=(12, 6))
ax2 = ax1.twinx()
ax1.plot(state_merged['Geographic Area'], state_merged['poverty_rate'], color='red', marker='o', label='Poverty Rate')
ax2.plot(state_merged['Geographic Area'], state_merged['percent_completed_hs'], color='blue', marker='s', label='HS Graduation Rate')

ax1.set_xlabel('State')
ax1.set_ylabel('Poverty Rate (%)', color='red')
ax2.set_ylabel('HS Graduation Rate (%)', color='blue')
# Rotate the labels on ax1 explicitly: after twinx() the current axes is ax2,
# whose x-axis is hidden, so plt.xticks(rotation=90) never reached the visible labels.
ax1.tick_params(axis='x', labelrotation=90)
ax1.set_title('Poverty Rate vs. HS Graduation Rate by State')
# The two lines live on different axes; merge their handles so both labels are shown.
line_handles = ax1.get_legend_handles_labels()[0] + ax2.get_legend_handles_labels()[0]
ax2.legend(handles=line_handles, loc='best')
fig.tight_layout()
plt.show()

# Jointplot and Regression Plot between Poverty and HS Graduation
sns.jointplot(data=state_merged, x='poverty_rate', y='percent_completed_hs', kind='scatter', height=8)
plt.suptitle('Scatter Plot of Poverty Rate vs HS Graduation Rate', y=1.02)
plt.show()

# `fill=True` replaces the deprecated `shade=True` keyword of seaborn's KDE plots
sns.jointplot(data=state_merged, x='poverty_rate', y='percent_completed_hs', kind='kde', height=8, fill=True)
plt.suptitle('KDE Plot of Poverty Rate vs HS Graduation Rate', y=1.02)
plt.show()

sns.lmplot(data=state_merged, x='poverty_rate', y='percent_completed_hs', height=6)
plt.title('Regression of HS Graduation Rate on Poverty Rate')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Create a Bar Chart with Subsections Showing the Racial Makeup of Each US State
race_cols = ['share_white', 'share_black', 'share_native_american', 'share_asian', 'share_hispanic']
# Average the place-level shares within each state
state_race = df_share_race_city.groupby('Geographic area')[race_cols].mean().reset_index()
state_race_sorted = state_race.sort_values('share_white', ascending=False)

# For clarity, show a stacked bar chart of the top 10 states by white share.
# Take the rows from the *sorted* frame so the bars keep the white-share ranking;
# building a fresh frame with set_index() also avoids mutating a filtered slice in place.
top_states = state_race_sorted.head(10)['Geographic area'].tolist()
state_race_top = state_race_sorted.head(10).set_index('Geographic area')

state_race_top[race_cols].plot(kind='bar', stacked=True, figsize=(12, 8))
plt.xlabel('State')
plt.ylabel('Population Share (%)')
plt.title('Racial Makeup by State (Top 10 by White Share)')
plt.legend(title='Race', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Create Donut Chart of People Killed by Race
race_counts = df_fatalities['race'].value_counts()
# One pastel colour per race category
colors = sns.color_palette('pastel')[0:len(race_counts)]

fig, ax = plt.subplots(figsize=(8, 8))
# A wedge width below 1 leaves the centre empty, turning the pie into a donut
ax.pie(
    race_counts,
    labels=race_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=colors,
    wedgeprops={'width': 0.4},
)
ax.set_title('Distribution of Police Killings by Race')
plt.show()

In [None]:
# Create a Chart Comparing the Total Number of Deaths of Men and Women
gender_counts = df_fatalities['gender'].value_counts()

fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(
    x=gender_counts.index,
    y=gender_counts.values,
    palette='coolwarm',
    ax=ax,
)
ax.set_xlabel('Gender')
ax.set_ylabel('Number of Fatalities')
ax.set_title('Number of Fatalities by Gender')
fig.tight_layout()
plt.show()

In [None]:
# Create a Box Plot Showing the Age and Manner of Death
# One box per manner of death, split by gender
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df_fatalities,
    x='manner_of_death',
    y='age',
    hue='gender',
    ax=ax,
)
ax.set_xlabel('Manner of Death')
ax.set_ylabel('Age')
ax.set_title('Distribution of Age by Manner of Death and Gender')
fig.tight_layout()
plt.show()

In [None]:
# Were People Armed?
# Share of each 'armed' category, expressed as a percentage of non-missing records
armed_counts = df_fatalities['armed'].value_counts(normalize=True) * 100

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=armed_counts.index,
    y=armed_counts.values,
    palette='Set2',
    ax=ax,
)
ax.set_xlabel('Armed Status')
ax.set_ylabel('Percentage of Fatalities')
ax.set_title('Percentage of Fatalities by Armed Status')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# How Old Were the People Killed?
# Percentage under 25, measured among records whose age is known. Rows with a
# missing age can never satisfy `age < 25`, so keeping them in the denominator
# would understate the share.
known_ages = df_fatalities['age'].dropna()
under_25 = df_fatalities[df_fatalities['age'] < 25]
pct_under_25 = len(under_25) / len(known_ages) * 100 if len(known_ages) else float('nan')
print(f"Percentage of fatalities under 25: {pct_under_25:.2f}%")

# Histogram and KDE of Age Distribution
plt.figure(figsize=(10, 6))
sns.histplot(known_ages, kde=True, bins=30, color='purple')
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Age Distribution of Police Fatalities')
plt.tight_layout()
plt.show()

# KDE plot for each race
plt.figure(figsize=(10, 6))
# dropna(): a NaN race never equals itself, so it would only yield an empty subset
for race in df_fatalities['race'].dropna().unique():
    subset = df_fatalities[df_fatalities['race'] == race]
    race_ages = subset['age'].dropna()
    if race_ages.nunique() < 2:
        # A density estimate needs at least two distinct ages
        continue
    # Unfilled curves are the default; the deprecated `shade=` keyword is not needed
    sns.kdeplot(race_ages, label=race)
plt.xlabel('Age')
plt.ylabel('Density')
plt.title('Age Distribution by Race')
plt.legend(title='Race')
plt.tight_layout()
plt.show()

In [None]:
# Race of People Killed
race_counts = df_fatalities['race'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x=race_counts.index,
    y=race_counts.values,
    palette='Set1',
    ax=ax,
)
ax.set_xlabel('Race')
ax.set_ylabel('Number of Fatalities')
ax.set_title('Number of Fatalities by Race')
fig.tight_layout()
plt.show()

In [None]:
# Mental Illness and Police Killings
# Relative frequency of each 'signs_of_mental_illness' value, as a percentage
mental_counts = (
    df_fatalities['signs_of_mental_illness']
    .value_counts(normalize=True)
    * 100
)
print("\nPercentage of fatalities with signs of mental illness:", mental_counts, sep="\n")

In [None]:
# In Which Cities Do the Most Police Killings Take Place?
# City names repeat across states, so key each record on "city, state"
df_fatalities['city_state'] = df_fatalities['city'] + ', ' + df_fatalities['state']

# value_counts() orders by frequency, so the first ten rows are the ten largest
city_counts = df_fatalities['city_state'].value_counts().reset_index()
city_counts.columns = ['city_state', 'count']
top10_cities = city_counts.head(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=top10_cities,
    x='count',
    y='city_state',
    palette='autumn',
    ax=ax,
)
ax.set_xlabel('Number of Fatalities')
ax.set_ylabel('City, State')
ax.set_title('Top 10 Cities by Police Fatalities')
fig.tight_layout()
plt.show()

In [None]:
# Rate of Death by Race in Top 10 Cities
df_top_cities = df_fatalities[df_fatalities['city_state'].isin(top10_cities['city_state'])]

# Fatalities per (city, race) pair, indexed by both keys
race_counts_by_city = df_top_cities.groupby(['city_state', 'race']).size()
# Per-city totals broadcast back onto the same index. transform() keeps the
# original (city_state, race) index, whereas groupby(...).apply(...) prepends the
# group key again on pandas >= 2.0 and makes reset_index() fail with
# "cannot insert city_state, already exists".
city_totals = race_counts_by_city.groupby(level='city_state').transform('sum')
# Share of each race within its city, in percent
share_by_race_city = (100 * race_counts_by_city / city_totals).reset_index(name='percentage')

# Grouped bars: one cluster per city, one bar per race
g = sns.catplot(
    data=share_by_race_city,
    x='city_state',
    y='percentage',
    hue='race',
    kind='bar',
    height=8,
    aspect=1.5
)
g.set_xticklabels(rotation=90)
g.set_axis_labels("City, State", "Percentage")
g.fig.suptitle("Share of Fatalities by Race in Top 10 Cities")
plt.tight_layout()
plt.show()

In [None]:
# Create a Choropleth Map of Police Killings by US State
# Tally fatalities per state code
state_counts = df_fatalities['state'].value_counts().reset_index()
state_counts.columns = ['state', 'count']
# Upper-case the codes before handing them to Plotly's 'USA-states' location mode
state_counts['state'] = state_counts['state'].str.upper()

choropleth_options = {
    'locations': 'state',
    'locationmode': 'USA-states',
    'color': 'count',
    'scope': 'usa',
    'title': 'Number of Police Fatalities by State',
}
fig = px.choropleth(state_counts, **choropleth_options)
fig.show()

In [None]:
# Number of Police Killings Over Time
# NOTE(review): the format string assumes month-first dates (mm/dd/yy) - confirm
# against the CSV; a day-first file would fail to parse or be misread.
df_fatalities['date_parsed'] = pd.to_datetime(df_fatalities['date'], format='%m/%d/%y')

# One row per calendar date that has at least one record
fatalities_over_time = (
    df_fatalities
    .groupby('date_parsed')
    .size()
    .reset_index(name='count')
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=fatalities_over_time,
    x='date_parsed',
    y='count',
    marker='o',
    ax=ax,
)
ax.set_xlabel('Date')
ax.set_ylabel('Number of Fatalities')
ax.set_title('Number of Police Fatalities Over Time')
fig.tight_layout()
plt.show()