## Q1
How has the number of mass shootings evolved in the big US regions between two concrete years? For this, we need you to aggregate the data in the 5 regions (Southeast, Northeast, Midwest, Northwest, and Southwest), and let the user select the first and last year of the comparison. Same for states, both views coordinated.

In [1]:
import altair as alt
import pandas as pd

# Load the data
data_shootings = pd.read_csv('data/gun_violence_processed.csv')

state_data = pd.read_csv('data/state_data.csv').rename(columns={'state': 'State'})

# Add extra state information to the shootings data
data_shootings = data_shootings.merge(state_data, on='State')

data_shootings['Incident Date'] = pd.to_datetime(data_shootings['Incident Date'])
data_shootings['Year'] = data_shootings['Incident Date'].dt.year
data_shootings['Month'] = data_shootings['Incident Date'].dt.month

data_shootings.loc[:,'count'] = 1  # Add a count column for aggregation


# group by state and month
state_df = data_shootings.groupby(['State', 'Year']).agg({
    'region': 'first',
    'Victims Killed': 'sum',
    'Victims Injured': 'sum',
    'Population_per_state_2023': 'first',
    'count': 'sum',
    'FIPS': 'first'
}).rename(columns={
    'region': 'Region',
    'Victims Killed': 'Total Victims Killed',
    'Victims Injured': 'Total Victims Injured',
    'Population_per_state_2023': 'Population',
    'count': 'Total Incidents'
}).reset_index()

# fill empty years with 0
all_years = state_df['Year'].unique()
all_states = state_df['State'].unique()

for state in all_states:
    pop = state_df[state_df['State'] == state]['Population'].values[0]
    region = state_df[state_df['State'] == state]['Region'].values[0]
    fips = state_df[state_df['State'] == state]['FIPS'].values[0]
    current_years = state_df[state_df['State'] == state]['Year'].values
    for year in all_years:
        if year not in current_years:
            state_df = pd.concat([state_df, pd.DataFrame([{
                'State': state,
                'Year': year,
                'Region': region,
                'Total Victims Killed': 0,
                'Total Victims Injured': 0,
                'Population': pop,
                'Total Incidents': 0,
                'FIPS': fips
            }])], ignore_index=True)
        
# dataset used for the choropleth map
state_statistics_df = state_df.groupby(['State']).agg({
    'Region': 'first',
    'Total Victims Killed': 'sum',
    'Total Victims Injured': 'sum',
    'Population': 'first',
    'Total Incidents': 'sum',
    'FIPS': 'first'
}).reset_index()


region_df = state_df.groupby(['Region', 'Year']).agg({
    'Total Victims Killed': 'sum',
    'Total Victims Injured': 'sum',
    'Population': 'sum',
    'Total Incidents': 'sum'
}).reset_index()

state_df["Type"] = "State"
region_df["Type"] = "Region"

state_df.rename(columns={"State": "Area"}, inplace=True)
region_df.rename(columns={"Region": "Area"}, inplace=True)


area_df = pd.concat([state_df, region_df])

# add average evolution in the US
us_df = state_df.groupby(['Year']).agg({
    'Total Victims Killed': 'sum',
    'Total Victims Injured': 'sum',
    'Population': 'sum',
    'Total Incidents': 'sum'
}).reset_index()

us_df['Area'] = 'United States'
us_df['Type'] = 'Country'

area_df = pd.concat([area_df, us_df])

area_df['Count per 100k'] = area_df['Total Incidents'] / area_df['Population'] * 100_000


state_statistics_df['StateFIPS'] = state_statistics_df['FIPS'].astype(str).str[:-3].astype(int)

state_statistics_df['Count per 100k'] = state_statistics_df['Total Incidents'] / state_statistics_df['Population'] * 100_000


# TODO: drop useless columns
area_df

Unnamed: 0,Area,Year,Region,Total Victims Killed,Total Victims Injured,Population,Total Incidents,FIPS,Type,Count per 100k
0,Alabama,2014,Southeast,1,9,5108468,2,1039.0,State,0.039151
1,Alabama,2015,Southeast,4,13,5108468,4,1097.0,State,0.078301
2,Alabama,2016,Southeast,15,53,5108468,15,1045.0,State,0.293630
3,Alabama,2017,Southeast,6,38,5108468,7,1109.0,State,0.137027
4,Alabama,2018,Southeast,11,53,5108468,14,1091.0,State,0.274055
...,...,...,...,...,...,...,...,...,...,...
5,United States,2019,,428,1690,334914895,414,,Country,0.123613
6,United States,2020,,495,2526,334914895,611,,Country,0.182434
7,United States,2021,,668,2784,334914895,689,,Country,0.205724
8,United States,2022,,642,2647,334914895,644,,Country,0.192288


Load the preprocessed electoral data

In [2]:
area_electoral_df = pd.read_csv('data/area_electoral_df.csv')

area_electoral_df = area_electoral_df.merge(area_df[['Area', 'Year', 'Count per 100k']], on=['Area', 'Year'])

# keep only the party with the highest electoral votes in each area and year
area_electoral_highest_df = area_electoral_df.groupby(['Year', 'Area']).apply(
    lambda x: x.loc[x['Electoral votes'].idxmax()]
).reset_index(drop=True)

us_party_in_power = area_electoral_highest_df[area_electoral_highest_df['Area'] == 'United States'].reset_index(drop=True)

area_electoral_highest_df

  area_electoral_highest_df = area_electoral_df.groupby(['Year', 'Area']).apply(


Unnamed: 0,Year,Area,Party,Electoral votes,Count per 100k
0,2014,Alabama,Republican,9,0.039151
1,2014,Alaska,Republican,3,0.136350
2,2014,Arizona,Republican,11,0.013457
3,2014,Arkansas,Republican,6,0.065195
4,2014,California,Democrat,55,0.105222
...,...,...,...,...,...
565,2023,Virginia,Democrat,13,0.080315
566,2023,Washington,Democrat,12,0.191991
567,2023,West Virginia,Republican,5,0.000000
568,2023,Wisconsin,Democrat,10,0.152260


In [3]:
from vega_datasets import data as vega_data

# ======== Chart dimensions ========
MAP_DIM = (400, 250)
LINE_CHART_DIM = (450, 320)
TITLE_HEIGHT = 20

# Define categorical palette for the regions
palette = alt.Scale(
    domain=['Southeast', 'Northeast', 'Midwest', 'Northwest', 'Southwest'],
    range=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
)

# Define a selection for the state
state_selection = alt.selection_point(fields=['State'], name='SelectState', empty='none')
region_selection = alt.selection_point(fields=['Region'], name='SelectRegion', empty='none')

year_selection = alt.selection_interval(fields=['Year'], encodings=['x'], translate=False)

states_topo = alt.topo_feature(vega_data.us_10m.url, feature='states')

state_map = alt.Chart(states_topo).mark_geoshape(
    stroke='white',
    strokeWidth=1
).encode(
    color=alt.Color(
        'Region:N',
        scale=palette,
        legend=alt.Legend(orient='top', offset=-TITLE_HEIGHT)
    ),
    opacity={
        "condition": [
            {"test": "datum.State == SelectState.State", "value": 1},
            {"test": "datum.Region == SelectRegion.Region", "value": 0.8}
        ],
        "value": 0.6
    },
    tooltip=['State:N', 'Region:N']
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(state_statistics_df, 'StateFIPS', ['State', 'Region'])
).add_params(
    state_selection, region_selection
).properties(
    width=MAP_DIM[0],
    height=MAP_DIM[1],
).project(
    type='albersUsa'
)

# generate a dataframe with all the possible titles to make the title dynamic
title_df = state_data[["State", "region"]].copy()
title_df["title"] = title_df.apply(
    lambda x: f"Gun Violence evolution in {x['State']} and {x['region']}",
    axis=1
)

# add a base title where state and region are not selected
title_df = pd.concat([title_df, pd.DataFrame([{
    'State': pd.NA,
    'region': pd.NA,
    'title': 'Gun Violence evolution in the United States'
}])])

line_chart_title = alt.Chart(title_df).mark_text(
    align='left',
    fontSize=13,
    fontWeight='bold',
).encode(
    text=alt.condition(
        "datum.State == SelectState.State && datum.region == SelectRegion.Region",
        'title:N',
        alt.value('')
    )
).properties(
    height=TITLE_HEIGHT
)

line_chart = alt.Chart(area_df).mark_line(point=True).encode(
    x=alt.X('Year:O', title='', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('Count per 100k:Q'),
    color=alt.Color(
        'Area:N',
        scale=alt.Scale(scheme='category20'),
        sort=alt.Sort(field='Type', order='ascending')
    ),
    tooltip=['Year:O', 'Count per 100k:Q']
).transform_filter(
    "datum.Area == SelectState.State || datum.Area == SelectRegion.Region || datum.Area == 'United States'"
).transform_filter(
    year_selection
).properties(
    width=LINE_CHART_DIM[0],
    height=LINE_CHART_DIM[1] - TITLE_HEIGHT
)


year_selector = alt.Chart(us_party_in_power).mark_rect().encode(
    x=alt.X('Year:O', title='', axis=None),
    color=alt.Color(
        'Party:N',
        title='Party in power',
        scale=alt.Scale(
            domain=['Democrat', 'Republican'],
            range=['blue', 'red']
        ),
        legend=alt.Legend(orient='bottom')
    ),
    opacity=alt.condition(
        year_selection,
        alt.value(0.8),
        alt.value(0.4)
    ),
    tooltip=['Party']
).properties(
    width=MAP_DIM[0],
    height=15
).add_params(
    year_selection
)

year_text = alt.Chart(us_party_in_power).mark_text(
    align='center',
    baseline='middle',
    dy=1,
    fontWeight='bold',
).encode(
    x=alt.X('Year:O'),
    text=alt.Text('Year:O')
)

year_selector += year_text

# create an empty chart to make the final plot aligned
empty_chart = alt.Chart(height=TITLE_HEIGHT, width=MAP_DIM[0]).mark_circle(opacity=0)

state_map = empty_chart & state_map

def merged_Q1_chart(state_map, year_selector, line_chart_title, line_chart):
    return alt.hconcat(
        alt.vconcat(
            state_map,
            year_selector
        ).resolve_scale(
            color='independent',
        ),
        alt.vconcat(
            line_chart_title,
            line_chart
        ),
    ).resolve_scale(
        color='independent'
    ).configure_title(
        fontSize=16
    )

plot1 = merged_Q1_chart(state_map, year_selector, line_chart_title, line_chart)
plot1

In [4]:
# Add labels to the line chart to show the different states
labels = alt.Chart(area_df).mark_text(align='left', dx=5, dy=-5).encode(
    x=alt.X('Year:O', title="", aggregate='max'),
    y=alt.Y('Count per 100k:Q', title="Count per 100k", aggregate={'argmax': 'Year'}),
    text=alt.Text('Area:N'),
    color=alt.Color('Area:N'),
).transform_filter(
    "datum.Area == SelectState.State || datum.Area == SelectRegion.Region || datum.Area == 'United States'"
).transform_filter(
    year_selection
)

line_chart_labeled = line_chart.copy()

# remove the legend and padding to get a cleaner chart
line_chart_labeled.encoding.color.legend = None
line_chart_labeled.encoding.x.scale = alt.Scale(padding=0)

line_chart_labeled += labels


plot1 = merged_Q1_chart(state_map, year_selector, line_chart_title, line_chart_labeled)
plot1

For some of the states, the labels overlap, therefore we will keep the legend as before

In [5]:
# Add background color to the line chart in terms of the political party in power
RD_background = alt.Chart(us_party_in_power).mark_rect(opacity=0.2).encode(
    x=alt.X('Year:O', title='', scale=alt.Scale(padding=0)),
    color=alt.Color(
        'Party:N',
        scale=alt.Scale(domain=['Democrat', 'Republican'], range=['blue', 'red']),
        legend=None
    )
).transform_filter(
    year_selection
)


plot1 = merged_Q1_chart(state_map, year_selector, line_chart_title,
                        (line_chart + RD_background).resolve_scale(color='independent'))
plot1

Too much clutter. Poor data to ink ratio.

In [6]:
# Add colored points to the line chart in terms of the political party in power
party_points_chart = alt.Chart(area_electoral_highest_df).mark_point(filled=True).encode(
    x=alt.X('Year:O', title='', axis=alt.Axis(labelAngle=0), scale=alt.Scale(padding=0)),
    y=alt.Y('Count per 100k:Q'),
    color=alt.Color(
        'Party:N',
        scale=alt.Scale(domain=['Democrat', 'Republican'], range=['blue', 'red']),
    ),
    size=alt.value(40),
    opacity=alt.value(0.8),
    tooltip=['Year:O', alt.Tooltip('Count per 100k:Q',format='.2f'), 'Party:N', 'Electoral votes:Q']
).transform_filter(
    "datum.Area == SelectState.State || datum.Area == SelectRegion.Region || datum.Area == 'United States'"
).transform_filter(
    year_selection
).properties(
    width=LINE_CHART_DIM[0],
    height=LINE_CHART_DIM[1] - TITLE_HEIGHT
)


plot1 = merged_Q1_chart(state_map, year_selector, line_chart_title,
                        (line_chart.mark_line() + party_points_chart).resolve_scale(color='independent'))
plot1

In [7]:
# Define the fixed y-axis scale
max_state_count = area_df[area_df['Area'] != 'District of Columbia']['Count per 100k'].max()
fixed_y_scale = alt.Scale(domain=[0, max_state_count])

line_chart_scale_fix = line_chart.mark_line().copy()
party_points_chart_scale_fix = party_points_chart.copy()

line_chart_scale_fix.encoding.y.scale = fixed_y_scale
party_points_chart_scale_fix.encoding.y.scale = fixed_y_scale

plot1 = merged_Q1_chart(state_map, year_selector, line_chart_title,
                        (line_chart_scale_fix + party_points_chart_scale_fix).resolve_scale(color='independent',strokeDash='independent'))
plot1

In [8]:
line_chart_stroke = line_chart.mark_line().copy()
# add a stroke dash depending on the area
line_chart_stroke.encoding.strokeDash = alt.StrokeDash('Area:N', sort=alt.Sort(field='Type', order='ascending'))

plot1 = merged_Q1_chart(state_map, year_selector, line_chart_title,
                        (line_chart_stroke + party_points_chart).resolve_scale(color='independent',strokeDash='independent'))
plot1

Things to improve:
- Fix color legend of lineplot not to match with map colors.

Description:
The tasks to solve here are visualizing the evolution of mass shootings per region and state, allowing the user to selects year to compare. The default view is the evolution of incidents per 100k in the whole United States, which is important both to contextualize the evolution of the state/region with the overall national trend and to provide a reference point for the user as the y-axis scale is dynamic. The latter is done both to have better space utilization and less clutter. We also opted to add the party in power in the slider for the user to be able to respond queries through interaction such as "how did mass shootings evolve [in REGION] during the Trump administration?", which we believe speaks to the effects contributing to the evolution of mass shootings and therefore should have a place in the analysis and exploration. The map is relatively large with respect to its utility (a selector for a state/region) which is justified by its dual use as a selector for the state for Q3. The arrangement of the legends is done to maximize the space given to the lineplot, and keep the information at reach. We opted against portraying the party in power in the lineplot as it is already shown in the slider (redundant) and it introduces clutter. The lineplot also includes a dynamic title to help fixate the context for the analysis and avoid eye movement to the selected region to remind of the plot being shown. Finally, we opted to encode the different lines in the lineplot with line strokes as well as colors to make the plot more accessible to colorblind users.

## Q2
todo

## Q3
For the visualization in Q1, we would like you to be able to select a state, and show the
detailed information on the counties of the state.

In [9]:
import json

state_abbr_to_name = json.load(open('data/state_abbreviations.json'))

# Load the data
fips_to_county_df = pd.read_csv('data/State and County FIPS Codes.csv')  # Load the new CSV for county names

data_shootings.loc[:,'count'] = 1  # Add a count column for aggregation
fips_to_county_df.rename(columns={'fips': 'FIPS', 'name':'county_name'}, inplace=True)  # Rename the FIPS column for consistency

# Group by FIPS code to calculate the number of mass shootings
county_statistics_df = data_shootings.groupby('FIPS').agg({
    'Victims Killed': 'sum',
    'Victims Injured': 'sum',
    'Population_per_county_2023': 'first',
    'count': 'sum',  # Number of occurrences as number of shootings
}).rename(columns={
    'count': 'shooting_count',  # Rename the count column
    'Population_per_county_2023': 'population'
}).reset_index()

# Ensure FIPS codes are strings and padded to 5 digits
county_statistics_df['FIPS'] = county_statistics_df['FIPS'].astype(str).str.zfill(5)
fips_to_county_df['FIPS'] = fips_to_county_df['FIPS'].astype(str).str.zfill(5)  # Ensure consistent formatting

# rename the state column using state_abbr_to_name, and place the conty_name if the state is nan
fips_to_county_df['state'] = fips_to_county_df.apply(
    lambda x: x['county_name'] if pd.isnull(x['state']) else state_abbr_to_name.get(x['state'], x['state']), axis=1
)

# Merge with fips_to_county_df to include all counties, using a left join
county_statistics_df = pd.merge(fips_to_county_df[['FIPS', 'county_name','state']], county_statistics_df, on='FIPS', how='left')

county_statistics_df['Count per 100k'] = (county_statistics_df['shooting_count'] / county_statistics_df['population']) * 100_000
county_statistics_df['Deaths per 100k'] = (county_statistics_df['Victims Killed'] / county_statistics_df['population']) * 100_000
county_statistics_df['Injured per 100k'] = (county_statistics_df['Victims Injured'] / county_statistics_df['population']) * 100_000

# Fill NaN values with 0
county_statistics_df.fillna(0, inplace=True)

# rename county_name and state columns
county_statistics_df.rename(columns={'county_name': 'County', 'state': 'State'}, inplace=True)

county_statistics_df = county_statistics_df[['FIPS', 'State', 'County', 'Count per 100k', 'Deaths per 100k', 'Injured per 100k']]

county_statistics_df.columns

Index(['FIPS', 'State', 'County', 'Count per 100k', 'Deaths per 100k',
       'Injured per 100k'],
      dtype='object')

In [10]:
counties_topojson = 'https://cdn.jsdelivr.net/npm/us-atlas@3/counties-10m.json'

# Data layer for the number of shootings with a reddish color scheme and pale white for NaN values
chart = alt.Chart(alt.topo_feature(counties_topojson, 'counties')).mark_geoshape().encode(
    color=alt.Color(
        'Count per 100k:Q',
        scale=alt.Scale(
            scheme='reds',
            domain=[0, county_statistics_df['Count per 100k'].quantile(0.99)]
        ),
        title='Number of Mass Shootings'
    ),
    tooltip=[
        'County:N',
        'State:N',
        'Count per 100k:Q',
        'Deaths per 100k:Q',
        'Injured per 100k:Q'
    ]
).transform_lookup(
    lookup='id',  # 'id' in us-10m.json corresponds to county FIPS code
    from_=alt.LookupData(county_statistics_df, 'FIPS', ['Count per 100k', 'County', 'State', 'Deaths per 100k', 'Injured per 100k']),
    default=[0]
).project(
    type='albersUsa', precision=.707
).properties(
    width=800,
    height=500,
    title='Number of Mass Shootings by County'
)

chart