In [26]:
import altair as alt
import pandas as pd

# Per-state lookup table; its 'state' column is renamed so it shares the
# 'State' key with the incident records.
state_data = pd.read_csv('data/state_data.csv').rename(columns={'state': 'State'})

# Incident records, enriched with the state-level columns and with the
# calendar fields derived from the parsed incident date.
data_shootings = (
    pd.read_csv('data/gun_violence_processed.csv')
    .merge(state_data, on='State')
    .assign(**{'Incident Date': lambda frame: pd.to_datetime(frame['Incident Date'])})
    .assign(
        Year=lambda frame: frame['Incident Date'].dt.year,
        Month=lambda frame: frame['Incident Date'].dt.month,
        count=1,  # constant column so that summing it counts incidents
    )
)


# Aggregate the incidents per state and year. Each output column is declared
# as (source column, aggregation) so the final names are set in one step.
state_df = (
    data_shootings
    .groupby(['State', 'Year'])
    .agg(**{
        'Region': ('region', 'first'),
        'Total Victims Killed': ('Victims Killed', 'sum'),
        'Total Victims Injured': ('Victims Injured', 'sum'),
        'Population': ('Population_per_state_2023', 'first'),
        'Total Incidents': ('count', 'sum'),
        'FIPS': ('FIPS', 'first'),
    })
    .reset_index()
)

# Fill (state, year) combinations that have no incidents with zero-count rows,
# so every state contributes a row (and its population) to every year.
#
# `all_years` is sorted explicitly: `unique()` alone returns values in order of
# appearance, which is only chronological when the first state happens to have
# data for every year.
all_years = state_df['Year'].sort_values().unique()
all_states = state_df['State'].unique()

# Constant per-state attributes, read from each state's first existing row.
first_row_per_state = state_df.drop_duplicates(subset='State').set_index('State')
observed_pairs = set(zip(state_df['State'], state_df['Year']))

# Collect all missing rows first and concatenate once; calling pd.concat inside
# the loop would copy the whole frame for every added row.
missing_rows = [
    {
        'State': state,
        'Year': year,
        'Region': first_row_per_state.at[state, 'Region'],
        'Total Victims Killed': 0,
        'Total Victims Injured': 0,
        'Population': first_row_per_state.at[state, 'Population'],
        'Total Incidents': 0,
        'FIPS': first_row_per_state.at[state, 'FIPS'],
    }
    for state in all_states
    for year in all_years
    if (state, year) not in observed_pairs
]

# Leave the frame untouched when nothing is missing.
if missing_rows:
    state_df = pd.concat([state_df, pd.DataFrame(missing_rows)], ignore_index=True)
        
# Per-state totals over all years; this frame feeds the choropleth map.
state_statistics_df = (
    state_df
    .groupby('State', as_index=False)
    .agg({
        'Region': 'first',
        'Total Victims Killed': 'sum',
        'Total Victims Injured': 'sum',
        'Population': 'first',
        'Total Incidents': 'sum',
        'FIPS': 'first',
    })
)

# Per-region, per-year totals. Population is summed over the state rows of
# each region/year group.
region_df = (
    state_df
    .groupby(['Region', 'Year'], as_index=False)
    .agg({
        'Total Victims Killed': 'sum',
        'Total Victims Injured': 'sum',
        'Population': 'sum',
        'Total Incidents': 'sum',
    })
)

# Tag each frame with its aggregation level and expose its name column under
# the shared 'StateOrRegion' label so the frames can be stacked.
state_df = state_df.assign(Type="State").rename(columns={"State": "StateOrRegion"})
region_df = region_df.assign(Type="Region").rename(columns={"Region": "StateOrRegion"})

# Nationwide totals per year (sums over all state rows, not averages).
us_df = (
    state_df
    .groupby('Year', as_index=False)
    .agg({
        'Total Victims Killed': 'sum',
        'Total Victims Injured': 'sum',
        'Population': 'sum',
        'Total Incidents': 'sum',
    })
    .assign(StateOrRegion='United States', Type='Country')
)

# Stack state, region and country rows; columns missing from a frame
# (e.g. FIPS for regions) become NaN.
state_region_df = pd.concat([state_df, region_df, us_df])

# Incidents per 100,000 inhabitants.
state_region_df['Count per 100k'] = (
    state_region_df['Total Incidents'] / state_region_df['Population'] * 100_000
)


# Derive the state code by dropping the last three digits of the FIPS value.
# Integer division is used instead of slicing the string form: if FIPS is
# stored as float, its string form ends in ".0" (e.g. "1039.0") and slicing
# off three characters would yield 103 instead of 1.
state_statistics_df['StateFIPS'] = (
    pd.to_numeric(state_statistics_df['FIPS']) // 1000
).astype(int)

# Incidents per 100,000 inhabitants over the whole period.
state_statistics_df['Count per 100k'] = (
    state_statistics_df['Total Incidents'] / state_statistics_df['Population'] * 100_000
)

# TODO: drop useless columns
state_region_df

Unnamed: 0,StateOrRegion,Year,Region,Total Victims Killed,Total Victims Injured,Population,Total Incidents,FIPS,Type,Count per 100k
0,Alabama,2014,Southeast,1,9,5108468,2,1039.0,State,0.039151
1,Alabama,2015,Southeast,4,13,5108468,4,1097.0,State,0.078301
2,Alabama,2016,Southeast,15,53,5108468,15,1045.0,State,0.293630
3,Alabama,2017,Southeast,6,38,5108468,7,1109.0,State,0.137027
4,Alabama,2018,Southeast,11,53,5108468,14,1091.0,State,0.274055
...,...,...,...,...,...,...,...,...,...,...
5,United States,2019,,428,1690,334914895,414,,Country,0.123613
6,United States,2020,,495,2526,334914895,611,,Country,0.182434
7,United States,2021,,668,2784,334914895,689,,Country,0.205724
8,United States,2022,,642,2647,334914895,644,,Country,0.192288


In [27]:
# Incidents per 100,000 inhabitants for every region/year row.
region_df['Count per 100k'] = (
    region_df['Total Incidents']
    .div(region_df['Population'])
    .mul(100_000)
)

In [28]:
import altair as alt
import pandas as pd

# Chart sizes as (width, height) in pixels.
SLOPE_CHART_DIM = (400, 400)
BAR_CHART_DIM = (400, 400)

# Define categorical palette for the regions
# NOTE(review): the domain must match the region labels present in the data — confirm.
palette = alt.Scale(
    domain=['Southeast', 'Northeast', 'Midwest', 'Northwest', 'Southwest'],
    range=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
)

# Define the base year as the earliest year in the data. It is taken from the
# sorted years rather than from the first element of an unsorted `unique()`
# result, so it always matches the baseline used for the percentage change.
region_years = sorted(int(year) for year in region_df['Year'].unique())
base_year = region_years[0]

# Restore the 'Region' column name (a no-op if it is already called 'Region')
# and order the rows so that each region's first row is its earliest year.
region_df = (
    region_df
    .rename(columns={'StateOrRegion': 'Region'})
    .sort_values(by=['Region', 'Year'])
)

# Calculate percentage change with respect to the base year.
# `transform` returns a result aligned to the frame's own index on every
# pandas version, unlike `apply(...).reset_index(level=0, drop=True)`, whose
# index layout depends on how group keys are attached.
region_df['Percentage Change'] = (
    region_df
    .groupby('Region')['Count per 100k']
    .transform(lambda rates: (rates - rates.iloc[0]) / rates.iloc[0] * 100)
)

# Point selection on the year, initialised to the first year after the base
# year (falls back to the base year when only one year is available).
initial_year = region_years[1] if len(region_years) > 1 else base_year
year_select = alt.selection_point(fields=['Year'], value=[{'Year': initial_year}], name='YearSelect')


# Title text for the slope chart, one row per year. `region_df` holds one row
# per region and year, so without de-duplication the same title would be drawn
# once per region on top of itself.
title_df = region_df[["Year"]].drop_duplicates().copy()
title_df["title"] = (
    f"Comparison of Total Gun Incidents per 100k citizens: {base_year} vs "
    + title_df["Year"].astype(str)
)

# Text mark that shows only the title of the currently selected year; rows of
# all other years render as an empty string.
slope_chart_title = alt.Chart(title_df).mark_text(
    align='left',
    fontSize=13,
    fontWeight='bold',
).encode(
    text=alt.condition(
        "datum.Year == YearSelect.Year",
        'title:N',
        alt.value('')
    )
)

# Slope chart: one line per region connecting its rate in the base year to
# its rate in the selected year (all other years are filtered out).
slope_chart = (
    alt.Chart(region_df)
    .mark_line(point=True)
    .encode(
        x=alt.X('Year:O', title=''),
        y=alt.Y('Count per 100k:Q'),
        color=alt.Color('Region:N', scale=palette),
        detail=alt.Detail('Region:N'),
        tooltip=[alt.Tooltip('Region:N'), alt.Tooltip('Count per 100k:Q')],
    )
    .transform_filter(
        f"datum.Year == YearSelect.Year || datum.Year == {base_year}"
    )
    .properties(
        width=SLOPE_CHART_DIM[0],
        height=SLOPE_CHART_DIM[1] - 100,
    )
)

# Title text for the bar plot, one row per year. `region_df` holds one row per
# region and year, so without de-duplication the same title would be drawn
# once per region on top of itself.
title_df = region_df[["Year"]].drop_duplicates().copy()
title_df["title"] = (
    f"Percentage Change in Total Gun Incidents from {base_year} to "
    + title_df["Year"].astype(str)
)

# Text mark that shows only the title of the currently selected year; rows of
# all other years render as an empty string.
bar_plot_title = alt.Chart(title_df).mark_text(
    align='left',
    fontSize=13,
    fontWeight='bold',
).encode(
    text=alt.condition(
        "datum.Year == YearSelect.Year",
        'title:N',
        alt.value('')
    )
)

# Bar plot: percentage change per region for the selected year only.
bar_plot = (
    alt.Chart(region_df)
    .mark_bar()
    .encode(
        x=alt.X('Region:N', title=''),
        y=alt.Y('Percentage Change:Q'),
        color=alt.Color('Region:N', scale=palette),
        tooltip=[
            alt.Tooltip('Region:N'),
            alt.Tooltip('Percentage Change:Q', format='.2f'),
        ],
    )
    .transform_filter(year_select)
    .properties(
        width=BAR_CHART_DIM[0],
        height=BAR_CHART_DIM[1] - 100,
    )
)

# Selectable years: every year present in the data except the base year.
# Deriving the list from the data keeps the selector from offering a year
# that has no rows, which would leave the linked charts empty.
year_data = pd.DataFrame({
    'Year': sorted(int(year) for year in region_df['Year'].unique() if year != base_year),
})

# Row of clickable rectangles, one per year; the selected one is drawn opaque.
year_selector = alt.Chart(year_data).mark_rect().encode(
    x=alt.X('Year:O', title='', axis=None),
    opacity=alt.condition(year_select, alt.value(1.0), alt.value(0.6)),
    color=alt.value('grey')
).properties(
    width=SLOPE_CHART_DIM[0],
    height=15
).add_params(
    year_select
)

# Year labels layered on top of the rectangles.
year_text = alt.Chart(year_data).mark_text(
    align='center',
    baseline='middle',
    dy=1,
    fontWeight='bold',
).encode(
    x=alt.X('Year:O'),
    text=alt.Text('Year:O')
)

# Left column: slope chart with its title and the year selector underneath.
slope_column = alt.vconcat(slope_chart_title, slope_chart, year_selector + year_text)

# Right column: bar plot with its title.
bar_column = alt.vconcat(bar_plot_title, bar_plot)

# Place both columns side by side, keep x-axis labels horizontal and give
# each sub-chart its own x scale.
combined_chart = (
    alt.hconcat(slope_column, bar_column)
    .configure_axisX(labelAngle=0)
    .resolve_scale(x='independent')
)

combined_chart

Description:
The task we are trying to solve is to show the evolution of gun violence per citizen, by region, between the first recorded year and a chosen year, on both an absolute and a relative scale. Combining a slope chart with a bar chart of percentage differences gives complementary information. In the slope chart, the user has access to absolute values (incident counts per 100k citizens), which supports comparing the evolution of gun incidents per citizen between regions for a given year. In the bar chart, the user can see the percentage increase or decrease between the two years, which supports comparing relative variations (trends) in gun violence between regions for a given year. For example, in 2019 the largest absolute increase in incidents per citizen was in the Southeast or Midwest, whereas the largest relative variation was in the Southwest, with a 100% increase in incidents per citizen.
The y-scale in both plots is dynamic. This is acceptable because the slope chart has a fixed reference point (the value in the first recorded year, which lets users orient themselves when comparing) and because the task is to compare a chosen year against the first recorded year, not to compare the increases relative to the first recorded year across different years. We added tooltips to make exact values easy to look up, which is especially useful when the y-axis is not fixed (and there is therefore no reference axis that the user can keep in memory).
