Q3: For the visualization in Q1, we would like you to be able to select a state, and show the
detailed information on the counties of the state.

Ideas: Add the functionality that selecting a state prompts a side view with the counties of the state. This view should be a map with the counties highlighted, and with corresponding tooltips with information in them about the county in question.

In [1]:
import altair as alt
import pandas as pd
from tqdm import tqdm

# Load the data
data_shootings = pd.read_csv('data/gun_violence_processed.csv')

state_data = pd.read_csv('data/state_data.csv').rename(columns={'state': 'State'})
state_counties_data = pd.read_csv('data/State and County FIPS Codes.csv')

# Add extra state information to the shootings data
data_shootings = data_shootings.merge(state_data, on='State')

data_shootings['Incident Date'] = pd.to_datetime(data_shootings['Incident Date'])
data_shootings['Year'] = data_shootings['Incident Date'].dt.year
data_shootings['Month'] = data_shootings['Incident Date'].dt.month

data_shootings.loc[:,'count'] = 1  # Add a count column for aggregation

In [2]:
county_df = data_shootings.groupby(['FIPS', 'Year']).agg({
    'Victims Killed': 'sum',
    'Victims Injured': 'sum',
    'Population_per_county_2023': 'first',
    'count': 'sum'
}).rename(columns={
    'Victims Killed': 'Total Victims Killed',
    'Victims Injured': 'Total Victims Injured',
    'Population_per_county_2023': 'Population',
    'count': 'Total Incidents'
}).reset_index()

all_years = county_df[['Year']].drop_duplicates().sort_values('Year').reset_index(drop=True)

all_counties_years_df = pd.merge(state_counties_data, all_years, how='cross').rename(columns={'fips': 'FIPS'})

county_df['Count per 100k'] = county_df['Total Incidents'] / county_df['Population'] * 100000

# Merge the data to include all years and all counties
county_df = county_df.merge(all_counties_years_df, on=['FIPS', 'Year'], how='right').fillna(0)

county_df = county_df.rename(columns={'name': 'County'})

county_df

Unnamed: 0,FIPS,Year,Total Victims Killed,Total Victims Injured,Population,Total Incidents,Count per 100k,County,state
0,0,2014,0.0,0.0,0.0,0.0,0.0,UNITED STATES,0
1,0,2015,0.0,0.0,0.0,0.0,0.0,UNITED STATES,0
2,0,2016,0.0,0.0,0.0,0.0,0.0,UNITED STATES,0
3,0,2017,0.0,0.0,0.0,0.0,0.0,UNITED STATES,0
4,0,2018,0.0,0.0,0.0,0.0,0.0,UNITED STATES,0
...,...,...,...,...,...,...,...,...,...
31955,56045,2019,0.0,0.0,0.0,0.0,0.0,Weston County,WY
31956,56045,2020,0.0,0.0,0.0,0.0,0.0,Weston County,WY
31957,56045,2021,0.0,0.0,0.0,0.0,0.0,Weston County,WY
31958,56045,2022,0.0,0.0,0.0,0.0,0.0,Weston County,WY


In [3]:
# group by state and month
state_df = data_shootings.groupby(['State', 'Year']).agg({
    'region': 'first',
    'Victims Killed': 'sum',
    'Victims Injured': 'sum',
    'Population_per_state_2023': 'first',
    'count': 'sum',
    'FIPS': 'first'
}).rename(columns={
    'region': 'Region',
    'Victims Killed': 'Total Victims Killed',
    'Victims Injured': 'Total Victims Injured',
    'Population_per_state_2023': 'Population',
    'count': 'Total Incidents'
}).reset_index()

# fill empty years with 0
all_states = state_df['State'].unique()

for state in all_states:
    pop = state_df[state_df['State'] == state]['Population'].values[0]
    region = state_df[state_df['State'] == state]['Region'].values[0]
    fips = state_df[state_df['State'] == state]['FIPS'].values[0]
    current_years = state_df[state_df['State'] == state]['Year'].values
    for year in all_years:
        if year not in current_years:
            state_df = pd.concat([state_df, pd.DataFrame([{
                'State': state,
                'Year': year,
                'Region': region,
                'Total Victims Killed': 0,
                'Total Victims Injured': 0,
                'Population': pop,
                'Total Incidents': 0,
                'FIPS': fips
            }])], ignore_index=True)
        
# dataset used for the choropleth map
state_statistics_df = state_df.groupby(['State']).agg({
    'Region': 'first',
    'Total Victims Killed': 'sum',
    'Total Victims Injured': 'sum',
    'Population': 'first',
    'Total Incidents': 'sum',
    'FIPS': 'first'
}).reset_index()

county_statistics_df = county_df.groupby(['FIPS']).agg({
    'County': 'first',
    'Total Victims Killed': 'sum',
    'Total Victims Injured': 'sum',
    'Population': 'first',
    'Total Incidents': 'sum'
}).reset_index()


region_df = state_df.groupby(['Region', 'Year']).agg({
    'Total Victims Killed': 'sum',
    'Total Victims Injured': 'sum',
    'Population': 'sum',
    'Total Incidents': 'sum'
}).reset_index()

state_df["Type"] = "State"
region_df["Type"] = "Region"

state_df.rename(columns={"State": "StateOrRegion"}, inplace=True)
region_df.rename(columns={"Region": "StateOrRegion"}, inplace=True)


state_region_df = pd.concat([state_df, region_df])

state_region_df['Count per 100k'] = state_region_df['Total Incidents'] / state_region_df['Population'] * 100_000


state_statistics_df['StateFIPS'] = state_statistics_df['FIPS'].astype(str).str[:-3].astype(int)

state_statistics_df['Count per 100k'] = state_statistics_df['Total Incidents'] / state_statistics_df['Population'] * 100_000


# TODO: drop useless columns
state_region_df

Unnamed: 0,StateOrRegion,Year,Region,Total Victims Killed,Total Victims Injured,Population,Total Incidents,FIPS,Type,Count per 100k
0,Alabama,2014,Southeast,1,9,5108468,2,1039.0,State,0.039151
1,Alabama,2015,Southeast,4,13,5108468,4,1097.0,State,0.078301
2,Alabama,2016,Southeast,15,53,5108468,15,1045.0,State,0.293630
3,Alabama,2017,Southeast,6,38,5108468,7,1109.0,State,0.137027
4,Alabama,2018,Southeast,11,53,5108468,14,1091.0,State,0.274055
...,...,...,...,...,...,...,...,...,...,...
50,Southwest,2020,,41,154,44102840,42,,Region,0.095232
51,Southwest,2021,,81,260,44102840,65,,Region,0.147383
52,Southwest,2022,,120,301,44102840,75,,Region,0.170057
53,Southwest,2023,,109,332,44102840,84,,Region,0.190464


In [4]:
import json
import numpy as np

state_abbr_to_name = json.load(open('data/state_abbreviations.json'))
state_to_gun_law_strength_ranking = json.load(open('data/state_gun_law_strength_ranking.json'))

# Load the data
data_shootings = pd.read_csv('data/gun_violence_processed.csv')
fips_to_county_df = pd.read_csv('data/State and County FIPS Codes.csv')  # Load the new CSV for county names

data_shootings.loc[:,'count'] = 1  # Add a count column for aggregation
fips_to_county_df.rename(columns={'fips': 'FIPS', 'name':'county_name'}, inplace=True)  # Rename the FIPS column for consistency

# Group by FIPS code to calculate the number of mass shootings
shootings_df = data_shootings.groupby('FIPS').agg({
    'Victims Killed': 'sum',
    'Victims Injured': 'sum',
    'Population_per_county_2023': 'first',
    'count': 'sum',  # Number of occurrences as number of shootings
    'State': 'first'
}).rename(columns={
    'count': 'shooting_count',  # Rename the count column
    'Population_per_county_2023': 'population'
}).reset_index()

# Ensure FIPS codes are strings and padded to 5 digits
shootings_df['FIPS'] = shootings_df['FIPS'].astype(str).str.zfill(5)
fips_to_county_df['FIPS'] = fips_to_county_df['FIPS'].astype(str).str.zfill(5)  # Ensure consistent formatting

# rename the state column using state_abbr_to_name, and place the conty_name if the state is nan
fips_to_county_df['state'] = fips_to_county_df.apply(
    lambda x: x['county_name'] if pd.isnull(x['state']) else state_abbr_to_name.get(x['state'], x['state']), axis=1
)

# Merge with fips_to_county_df to include all counties, using a left join
shootings_df = pd.merge(fips_to_county_df[['FIPS', 'county_name','state']], shootings_df, on='FIPS', how='left')

shootings_df['State'] = shootings_df['state']

shootings_df['count_per_100k'] = (shootings_df['shooting_count'] / shootings_df['population']) * 100000
shootings_df['deaths_per_100k'] = (shootings_df['Victims Killed'] / shootings_df['population']) * 100000
shootings_df['injuries_per_100k'] = (shootings_df['Victims Injured'] / shootings_df['population']) * 100000

# Fill NaN values with 0
shootings_df.fillna(0, inplace=True)

# Clip the values at 99th percentile to avoid outliers
shootings_df['count_per_100k_clipped'] = shootings_df['count_per_100k'].clip(upper=shootings_df['count_per_100k'].quantile(0.99))

shootings_df['gun_law_strength_ranking'] = shootings_df['State'].map(state_to_gun_law_strength_ranking)

def getClipOrOver(x):
    if x > shootings_df['count_per_100k'].quantile(0.99):
        return f">{np.round(shootings_df['count_per_100k'].quantile(0.99),2)}"
    return str(np.round(x,2))
shootings_df['count_per_100k_text'] = shootings_df['count_per_100k'].map(lambda x: getClipOrOver(x))

# Define the URL to the TopoJSON file (counties-10m.json)
counties_topojson = 'https://cdn.jsdelivr.net/npm/us-atlas@3/counties-10m.json'

# Background layer for county outlines with gray and transparent borders
background = alt.Chart(alt.topo_feature(counties_topojson, 'counties')).mark_geoshape(
    fill='lightgray',
    stroke='white',
).project(
    type='albersUsa'
).properties(
    width=800,
    height=500
)

# Data layer for the number of shootings with a reddish color scheme and pale white for NaN values
chart = alt.Chart(alt.topo_feature(counties_topojson, 'counties')).mark_geoshape().encode(
    color=alt.Color('count_per_100k_clipped:Q', scale=alt.Scale(scheme='reds'), title='Number of Mass Shootings'),
    tooltip=[alt.Tooltip('county_name:N',title='County Name'), alt.Tooltip('count_per_100k_text:N', title='Count per 100k', format='.2f'), alt.Tooltip('deaths_per_100k:Q',title='Deaths per 100k',format='.2f'), alt.Tooltip('injuries_per_100k:Q',title='Injured per 100k',format='.2f'), alt.Tooltip('State:N', title='State'), alt.Tooltip('gun_law_strength_ranking:Q', title='Gun Law Strength Ranking')]
).transform_lookup(
    lookup='id',  # 'id' in us-10m.json corresponds to county FIPS code
    from_=alt.LookupData(shootings_df, 'FIPS', ['count_per_100k_clipped', 'county_name', 'State', 'count_per_100k_text', 'deaths_per_100k', 'injuries_per_100k', 'gun_law_strength_ranking']),
    default=[0]
).project(
    type='albersUsa'
).properties(
    width=800,
    height=500,
    title='Number of Mass Shootings by County'
)

# Overlap a highlight of the state boundaries in pale white
state_boundaries = alt.Chart(alt.topo_feature(counties_topojson, 'states')).mark_geoshape(
    fill=None,
    stroke='white',
).project(
    type='albersUsa'
)

chart

In [5]:
from vega_datasets import data as vega_data

# ======== Chart dimensions ========
MAP_DIM = (600, 400)
LINE_CHART_DIM = (600, 400)

# Define a selection for the state
state_selection = alt.selection_point(fields=['State'], name='SelectState', empty='none')
region_selection = alt.selection_point(fields=['Region'], name='SelectRegion', empty='none')

year_selection = alt.selection_interval(fields=['Year'], encodings=['x'], translate=False)

# Create the choropleth map
state_map = alt.Chart(alt.topo_feature(vega_data.us_10m.url, feature='states')).mark_geoshape(
    stroke='white',
    strokeWidth=1
).encode(
    color=alt.Color(
        'Region:N',
        scale=alt.Scale(
            domain=['Southeast', 'Northeast', 'Midwest', 'Northwest', 'Southwest'],
            range=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
        ),
    ),
    tooltip=['State:N', 'Count per 100k:Q']
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(state_statistics_df, 'StateFIPS', ['Count per 100k', 'State', 'Region'])
).add_params(
    state_selection, region_selection
).properties(
    width=MAP_DIM[0],
    height=MAP_DIM[1],
    title='Number of Mass Shootings per 100k'
).project(
    type='albersUsa'
)

# # generate a dataframe with all the possible titles to make the title dynamic
# title_df = state_data[["State", "region"]].copy()
# title_df["title"] = title_df.apply(
#     lambda x: f"Gun Violence in {x['State']} and {x['region']} by Year",
#     axis=1
# )

# line_chart_title = alt.Chart(title_df).mark_text(
#     align='left',
#     fontSize=16,
# ).encode(
#     text=alt.condition(
#         "datum.State == SelectState.State && datum.region == SelectRegion.Region",
#         'title:N',
#         alt.value('')
#     )
# )

counties_map = alt.Chart(alt.topo_feature(counties_topojson, 'counties')).mark_geoshape().encode(
    color=alt.Color('count_per_100k_clipped:Q', scale=alt.Scale(scheme='reds'), title='Number of Mass Shootings'),
    tooltip=[alt.Tooltip('county_name:N',title='County Name'), alt.Tooltip('count_per_100k_text:N', title='Count per 100k', format='.2f'), alt.Tooltip('deaths_per_100k:Q',title='Deaths per 100k',format='.2f'), alt.Tooltip('injuries_per_100k:Q',title='Injured per 100k',format='.2f'), alt.Tooltip('State:N', title='State'), alt.Tooltip('gun_law_strength_ranking:Q', title='Gun Law Strength Ranking')]
).transform_lookup(
    lookup='id',  # 'id' in us-10m.json corresponds to county FIPS code
    from_=alt.LookupData(shootings_df, 'FIPS', ['count_per_100k_clipped', 'county_name', 'State', 'count_per_100k_text', 'deaths_per_100k', 'injuries_per_100k', 'gun_law_strength_ranking']),
    default=[0]
).transform_filter(
    state_selection
).project(
    type='albersUsa', precision=.707
).properties(
    width=MAP_DIM[0],
    height=MAP_DIM[1],
)


alt.vconcat(
    state_map,
    counties_map,
).resolve_scale(color='independent')

Things to consider:
- Improve space usage (currently lots of space are wasted in the lower part of the plot).
- Add default behavior when no state is selected.