In [1]:
import pandas as pd
import numpy as np
import os.path
import matplotlib.pyplot as plt
import seaborn as sns


# Method which aggregates all census tracts in a county
def mergeCountyData(df):
    """Add county-level aggregate columns to a per-row census/education frame.

    Rows are grouped by ``county_name_x``. Each row is weighted by its share
    of the county's summed "Total population", and those weights are used to
    build population-weighted county averages that are broadcast back onto
    every row of the county.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``county_name_x``, "Total population",
        "Mean household income (dollars)", "Median household income (dollars)",
        "Math Scores", "RW Scores", "18-24, bachelor's degree or higher",
        "25 and over, bachelor's degree (estimate)" and
        "25 and over, graduate or professional degree (estimate)".

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the intermediate ("Proportional ...") and
        aggregate columns appended. The input frame is left untouched.
    """
    # Work on a copy: plain assignment would alias `df`, so every column
    # added below would silently appear in the caller's frame as well.
    mergedDF = df.copy()

    def countySum(column):
        """Sum `column` within each county and broadcast it back to the rows."""
        return mergedDF.groupby("county_name_x")[column].transform("sum")

    # Total county population
    mergedDF["Population"] = countySum("Total population")

    # Share of the county's population represented by this row
    mergedDF["County proportion"] = (
        mergedDF["Total population"] / mergedDF["Population"]
    )

    # (source column, scaled intermediate column, aggregate column, round?)
    # Scores are rounded to whole points; incomes are left unrounded.
    weightedAverages = [
        (
            "Mean household income (dollars)",
            "Proportional county mean income",
            "Mean income",
            False,
        ),
        (
            "Median household income (dollars)",
            "Proportional county median income",
            "Median income",
            False,
        ),
        (
            "Math Scores",
            "Proportional county math score",
            "Math performance",
            True,
        ),
        (
            "RW Scores",
            "Proportional county RW score",
            "Reading and writing performance",
            True,
        ),
    ]
    for sourceColumn, scaledColumn, aggregateColumn, roundResult in weightedAverages:
        # Scale by proportion, then sum per county to get the weighted mean
        mergedDF[scaledColumn] = mergedDF[sourceColumn] * mergedDF["County proportion"]
        countyValue = countySum(scaledColumn)
        mergedDF[aggregateColumn] = np.round(countyValue) if roundResult else countyValue

    # Population with college degree
    mergedDF["Population with college degree"] = (
        mergedDF["18-24, bachelor's degree or higher"]
        + mergedDF["25 and over, bachelor's degree (estimate)"]
        + mergedDF["25 and over, graduate or professional degree (estimate)"]
    )

    # Fraction with a college degree, scaled by proportion. The misspelled
    # column name is kept as-is so existing consumers keep working.
    mergedDF["Proportional college pergentage"] = (
        mergedDF["Population with college degree"] / mergedDF["Total population"]
    ) * mergedDF["County proportion"]

    # Aggregate percentage with college degree, to one decimal place
    mergedDF["Percentage with college degree"] = np.round(
        100 * countySum("Proportional college pergentage"),
        decimals=1,
    )

    return mergedDF

# Per-state lookup table: postal abbreviation, two-digit state code (as used
# by getStateFromCode and the GeoJSON filter), and a rough map centre.
stateData = {
    stateName: {
        "StateAbbr": abbreviation,
        "StateCode": code,
        "StateCenter": {"lat": latitude, "lon": longitude},
    }
    for stateName, abbreviation, code, latitude, longitude in (
        ("Texas", "TX", "48", 31, -99),
        ("Massachusetts", "MA", "25", 42, -71),
        ("Arkansas", "AR", "05", 34, -92),
        ("Illinois", "IL", "17", 39, -89),
    )
}

def getStateFromCode(countyCode):
    """Return the stateData key whose "StateCode" matches the code's first two characters.

    Falls back to the placeholder "State" when no entry matches.
    """
    # Later entries overwrite earlier ones, so a duplicated code resolves to
    # the last matching state, exactly as a full scan without `break` would.
    nameByCode = {info["StateCode"]: name for name, info in stateData.items()}
    return nameByCode.get(countyCode[0:2], "State")
    
    

# Import mapping data
import json

# Load the nationwide county GeoJSON once; the state- and county-level
# collections built below are filtered views of this object. The context
# manager closes the handle even if parsing fails, and the explicit encoding
# keeps the read independent of the platform's default locale.
with open(
    "./geographic_data/geojson-counties-fips.json", "r", encoding="utf-8"
) as countiesFile:
    countiesJSON = json.load(countiesFile)


# Create mapping data localized to state
def createStateGeo(source, stateCode):
    """Build a FeatureCollection holding only the features of one state.

    `source` is a GeoJSON-like dict with a "features" list; a feature is kept
    when its properties["STATE"] equals `stateCode`. The kept features are the
    same objects as in `source` (no copies are made).
    """
    matching = [
        feature
        for feature in source["features"]
        if feature["properties"]["STATE"] == stateCode
    ]
    return {"type": "FeatureCollection", "features": matching}


# Attach each state's filtered county boundaries to its stateData entry
for state, stateInfo in stateData.items():
    stateInfo["GeoJSON"] = createStateGeo(countiesJSON, stateInfo["StateCode"])


# Create mapping data localized to county
def createCountyGeo(source, countyName):
    """Build a FeatureCollection containing the single feature for one county.

    `source` is a state entry whose "GeoJSON" holds a "features" list. A
    feature matches when its properties["NAME"] followed by " County" equals
    `countyName`. If several features match, the last one is used; if none
    match, the collection's feature list is empty.
    """
    matches = [
        feature
        for feature in source["GeoJSON"]["features"]
        if feature["properties"]["NAME"] + " County" == countyName
    ]
    # Slicing keeps at most the final match and yields [] when there is none
    return {"type": "FeatureCollection", "features": matches[-1:]}


# Load combined census+education dataframes into stateData. Each state gets a
# separate DF for every year whose file exists ("<yy>DF"), plus one frame
# aggregated over all loaded years ("AggregatedDF").

# A row is only usable when every one of these columns is populated
requiredColumns = [
    "Math Scores",
    "RW Scores",
    "Total population",
    "Mean household income (dollars)",
    "Median household income (dollars)",
    "18-24, bachelor's degree or higher",
    "25 and over, bachelor's degree (estimate)",
    "25 and over, graduate or professional degree (estimate)",
]

for state in stateData:
    # Collect the yearly frames and concatenate once after the loop instead
    # of repeatedly concatenating onto a growing (initially empty) frame.
    yearlyFrames = []
    for year in range(19, 24):
        stateYear = stateData[state]["StateAbbr"] + str(year)
        filepath = (
            "../../data/"
            + stateYear
            + "/"
            + stateYear
            + "_Combined_Census_Education.csv"
        )

        # Not every state/year combination has a file; missing ones are skipped
        if not os.path.exists(filepath):
            continue

        df = pd.read_csv(filepath).dropna(subset=requiredColumns)
        df = df.rename(columns={"year": "Year"})

        # Keep only rows with strictly positive income values. The explicit
        # copy makes the result an independent frame, so the column
        # assignments below do not write into a slice of the unfiltered data.
        hasValidIncome = (df["Mean household income (dollars)"] > 0) & (
            df["Median household income (dollars)"] > 0
        )
        df = df[hasValidIncome].copy()

        # Arkansas uses ACT, we rescale for consistency (raw values are kept)
        # NOTE(review): ACT section scores go up to 36, so a factor of 800/30
        # can produce values above 800 — confirm the intended scale.
        if state == "Arkansas":
            df["ACT Math Scores"] = df["Math Scores"]
            df["ACT RW Scores"] = df["RW Scores"]
            df["Math Scores"] = df["Math Scores"] * 800 / 30
            df["RW Scores"] = df["RW Scores"] * 800 / 30

        # Sort by school name ahead of concatenating. sort_values returns a
        # new frame, so the result has to be assigned back to take effect.
        df = df.sort_values(by="School Name")

        # Format strings
        df["School Name"] = df["School Name"].str.title()

        # Add aggregated county data to the dataframe
        df = mergeCountyData(df)

        # Format the county code as a zero-padded five-character string
        intcode = df["county_code"].astype(int)
        df["county_code"] = intcode.astype(str).str.zfill(5)

        # Add the DF to stateData, and remember it for the aggregate
        stateData[state][str(year) + "DF"] = df
        yearlyFrames.append(df)

    # pd.concat rejects an empty list, so fall back to an empty frame when no
    # file was found for this state.
    aggregated = pd.concat(yearlyFrames) if yearlyFrames else pd.DataFrame()
    aggregated["state_name"] = state
    stateData[state]["AggregatedDF"] = aggregated

# Create separate DFs of all states in a given year. The table lists, per
# year, which states contribute and in which order they are concatenated
# (Arkansas is absent from year "20").
_statesByYear = {
    "19": ("Texas", "Arkansas", "Massachusetts", "Illinois"),
    "20": ("Texas", "Massachusetts", "Illinois"),
    "21": ("Texas", "Arkansas", "Massachusetts", "Illinois"),
    "22": ("Texas", "Arkansas", "Massachusetts", "Illinois"),
    "23": ("Texas", "Arkansas", "Massachusetts", "Illinois"),
}
aggregatedByYearDFs = {
    yearKey: pd.concat([stateData[stateName][yearKey + "DF"] for stateName in stateNames])
    for yearKey, stateNames in _statesByYear.items()
}

# Create a DF of all states and years together
aggregatedDF = pd.concat(
    [
        stateData[stateName]["AggregatedDF"]
        for stateName in ("Arkansas", "Texas", "Illinois", "Massachusetts")
    ]
)

Next, we define the option lists that let the user choose which data is used for mapping and plotting.

In [2]:
# Display range for each selectable county-level metric. The insertion order
# here is also the order in which the metrics appear in the dropdowns.
_legendRanges = {
    "Population": [0, 1000000],
    "Mean income": [0, 250000],
    "Median income": [0, 250000],
    "Math performance": [0, 800],
    "Reading and writing performance": [0, 800],
    "Percentage with college degree": [0, 50],
}

# Metrics that can be coloured on the maps / plotted on the y axis
USMapOptions = list(_legendRanges)

# The x axis may additionally use the year
AxisOptions = ["Year", *USMapOptions]

# Flat lookup keyed "<metric> range" and "<metric> color", in that order per metric
dataLegends = {
    metric + " " + suffix: value
    for metric, bounds in _legendRanges.items()
    for suffix, value in (("range", bounds), ("color", "Spectral"))
}

From here we create the Dash app with dynamic data exploration. It has three maps - at the national level, state level, and county level. There are also plots for displaying localized data.

In [3]:
# Dash setup
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
import plotly.graph_objects as go

# Marker colours shared by the scatter plots and the map overlays
plotMarkerColor = "#BA2049"
mapMarkerColor = "#FFFFFF"

app = Dash()
app.title = "Census and education performance analysis"


def _blankGraph(componentId):
    """Create a graph component with empty figure/relayout/hover data.

    The figures themselves are supplied later by the callbacks.
    """
    return dcc.Graph(id=componentId, relayoutData={}, hoverData={}, figure={})


USMap = _blankGraph("USMap")
stateMap = _blankGraph("StateMap")
countyMap = _blankGraph("CountyMap")
countyPlot = _blankGraph("CountyPlot")
USClickPlot = _blankGraph("USClickPlot")

# Title + scatter plot shown next to the national map
USClickData = [
    html.Div(
        children=[html.H1(id="USClickDataTitle"), USClickPlot],
        id="USClickPlotContainer",
    )
]

# Title, county map and county plot shown below the state map
stateClickData = [html.H1(id="StateClickDataTitle"), countyMap, countyPlot]
# The page is assembled bottom-up from named sections so the nesting stays
# readable: header, a top row (national map | click plot), and a bottom
# section (state picker, state map, county detail).

_headerContainer = html.Div(id="HeaderContainer", children=[html.H1(app.title)])

# Top-left: metric and year pickers above the national map
_highLevelLeft = html.Div(
    id="HighLevelLeft",
    children=[
        dcc.Dropdown(
            id="USMapSelection", options=USMapOptions, value=USMapOptions[0]
        ),
        dcc.Dropdown(
            id="YearSelection", options=["19", "20", "21", "22", "23"], value="19"
        ),
        html.Div(id="USMapContainer", children=[USMap]),
    ],
)

# Top-right: axis pickers above the plot for the clicked county
_highLevelRight = html.Div(
    id="HighLevelRight",
    children=[
        dcc.Dropdown(
            id="XAxisSelection", options=AxisOptions, value=AxisOptions[0]
        ),
        dcc.Dropdown(
            id="YAxisSelection", options=USMapOptions, value=USMapOptions[0]
        ),
        html.Div(
            id="USClickContainer",
            children=[html.Div(id="USClickData", children=USClickData)],
        ),
    ],
)

_highLevelContainer = html.Div(
    id="HighLevelContainer", children=[_highLevelLeft, _highLevelRight]
)

# Bottom: state picker, state map, then the county-level detail
_stateMapContainer = html.Div(
    id="StateMapContainer", children=[html.H1(id="StateTitle"), stateMap]
)
_lowLevelContainer = html.Div(
    id="LowLevelContainer",
    children=[
        dcc.Dropdown(
            id="StateMapSelection",
            options=["Illinois", "Massachusetts", "Arkansas", "Texas"],
            value="Texas",
        ),
        html.Div(children=[_stateMapContainer]),
        html.Div(id="CountyMapContainer", children=stateClickData),
    ],
)

app.layout = html.Div(
    id="app-div",
    children=[_headerContainer, _highLevelContainer, _lowLevelContainer],
)

Below are all the callback functions.

In [4]:
# Updates the state map according to the selected state
@app.callback(
    [
        Output(component_id="StateMap", component_property="figure"),
        Output(component_id="StateMapSelection", component_property="value"),
        Output(component_id="StateTitle", component_property="children"),
    ],
    Input(component_id="USMapSelection", component_property="value"),
    Input(component_id="YearSelection", component_property="value"),
    Input(component_id="StateMapSelection", component_property="value"),
    Input(component_id="CountyPlot", component_property="figure"),
)
def drawStateMap(USSelection, yearSelection, stateSelection, countyPlot):
    """Draw the county choropleth for the selected state and year.

    Parameters
    ----------
    USSelection : str
        Column of the state frame used to colour the counties.
    yearSelection : str
        Two-digit year key ("19" .. "23") selecting the "<yy>DF" frame.
    stateSelection : str or None
        Key into stateData; None when the dropdown has been cleared.
    countyPlot : dict
        Current CountyPlot figure. Not read here; it is declared as an Input,
        so a change to it re-runs this callback.

    Returns
    -------
    list
        [figure, state actually drawn, title text] for the three outputs.
    """
    # Fall back to Texas when nothing is selected (a cleared dropdown reports
    # None in any year, not just "20") or when Arkansas is requested for the
    # year it has no data for.
    if stateSelection is None or (
        yearSelection == "20" and stateSelection == "Arkansas"
    ):
        stateSelection = "Texas"

    StateDF = stateData[stateSelection][yearSelection + "DF"]

    # Named stateFigure rather than `map` to avoid shadowing the builtin
    stateFigure = px.choropleth(
        StateDF,
        geojson=stateData[stateSelection]["GeoJSON"],
        color=USSelection,
        locations=StateDF["county_code"],
        color_continuous_scale="Spectral",
    )
    # Overlay one small marker per row at its latitude/longitude; hover is
    # disabled so the markers do not mask the county underneath.
    stateFigure.add_trace(
        go.Scattergeo(
            lat=StateDF["latitude"],
            lon=StateDF["longitude"],
            mode="markers",
            hoverinfo="skip",
            marker=dict(size=1, color=mapMarkerColor),
        )
    )
    stateFigure.update_geos(
        visible=False,
        resolution=50,
        fitbounds="locations",
    )
    stateFigure.update_layout(
        map_center=stateData[stateSelection]["StateCenter"],
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        width=600,
        dragmode=False,
        coloraxis_showscale=False,
    )
    return [stateFigure, stateSelection, stateSelection + " at a glance"]

In [5]:
# Updates the map of US according to selected data or clicking
@app.callback(
    [
        Output(component_id="USMap", component_property="figure"),
        Output(component_id="StateMapSelection", component_property="options"),
        Output(component_id="USClickDataTitle", component_property="children"),
        Output(component_id="USClickPlot", component_property="figure"),
    ],
    [
        Input(component_id="USMapSelection", component_property="value"),
        Input(component_id="StateMapSelection", component_property="value"),
        Input(component_id="YearSelection", component_property="value"),
        Input(component_id="USMap", component_property="relayoutData"),
        Input(component_id="USMap", component_property="clickData"),
        Input(component_id="XAxisSelection", component_property="value"),
        Input(component_id="YAxisSelection", component_property="value"),
    ],
)
def drawUSMap(USSelection, stateSelection, yearSelection, relayoutData, clickData, xAxis, yAxis):
    """Draw the national choropleth and the scatter plot for the clicked county.

    Parameters
    ----------
    USSelection : str
        Metric column used to colour the national map.
    stateSelection : str or None
        Current state dropdown value (only normalised here, not plotted).
    yearSelection : str
        Two-digit year key into aggregatedByYearDFs.
    relayoutData : dict or None
        Last pan/zoom state of the national map; used to keep the viewport.
    clickData : dict or None
        Last click on the national map; selects the county to plot.
    xAxis, yAxis : str
        Columns plotted on the scatter plot's axes.

    Returns
    -------
    list
        [map figure, state dropdown options, plot title, scatter figure].
    """
    defaultCountyCode = "48001"

    df = aggregatedByYearDFs[yearSelection]
    dataForDisplay = USSelection + " " + yearSelection
    zoom = 3
    center = {"lat": 35, "lon": -95}
    maxvalue = df[USSelection].max()

    # Error handling for missing AR data: Arkansas is not offered for "20"
    if yearSelection == "20":
        if stateSelection == "Arkansas" or stateSelection is None:
            stateSelection = "Texas"
        options = ["Illinois", "Massachusetts", "Texas"]
    else:
        options = ["Illinois", "Massachusetts", "Arkansas", "Texas"]

    # Use the clicked county if there is one, otherwise the default
    clickedCountyCode = defaultCountyCode
    if clickData and clickData.get("points"):
        clickedCountyCode = clickData["points"][0].get("location") or defaultCountyCode

    # Extract the state name from the clicked county code. A code outside the
    # known states would otherwise index stateData with the placeholder name.
    clickedStateName = getStateFromCode(clickedCountyCode)
    if clickedStateName not in stateData:
        clickedCountyCode = defaultCountyCode
        clickedStateName = getStateFromCode(clickedCountyCode)

    # Find the aggregated data from this state
    newCountyData = stateData[clickedStateName]["AggregatedDF"]

    # Restrict the data to the county which was clicked
    newCountyData = newCountyData[newCountyData["county_code"] == clickedCountyCode]

    # Keep the placeholder title when the county has no rows, instead of
    # failing on .iloc[0] of an empty selection.
    clickedCountyName = "County"
    if not newCountyData.empty:
        clickedCountyName = newCountyData["county_name_x"].iloc[0]

    # Trim data down to what is needed
    newCountyData = newCountyData.drop_duplicates()

    # The year axis gets a fixed window; metric axes use their legend range
    xAxisRange = [2018, 2024]
    yAxisRange = dataLegends[yAxis + " range"]
    if xAxis != "Year":
        xAxisRange = dataLegends[xAxis + " range"]
    newCountyDataPlot = px.scatter(
        newCountyData,
        x=xAxis,
        y=yAxis,
        width=450,
        height=380,
        range_x=xAxisRange,
        range_y=yAxisRange,
    )

    newCountyDataPlot.update_layout(plot_bgcolor="#FFFFFF")
    newCountyDataPlot.update_traces(marker=dict(color=plotMarkerColor))
    newCountyDataPlot.update_xaxes(gridcolor="#d4dadc")
    newCountyDataPlot.update_yaxes(gridcolor="#d4dadc")

    # Preserve the user's viewport across redraws; relayoutData may be None
    relayoutData = relayoutData or {}
    if "map.zoom" in relayoutData:
        zoom = relayoutData["map.zoom"]

    if "map.center" in relayoutData:
        center = relayoutData["map.center"]

    # Named usFigure rather than `map` to avoid shadowing the builtin
    usFigure = go.Figure(
        go.Choroplethmap(
            geojson=countiesJSON,
            locations=df["county_code"],
            z=df[USSelection],
            zmin=dataLegends[USSelection + " range"][0],
            zmax=dataLegends[USSelection + " range"][1],
            colorscale="Spectral",
            marker_opacity=1,
            marker_line_width=0,
            colorbar=dict(
                title=dict(
                    text=dataForDisplay,
                    side="bottom",
                ),
                tickmode="linear",
                ticks="inside",
                tick0=0,
                # Tick spacing follows the data maximum, not the legend range
                dtick=maxvalue / 6,
                thickness=10,
                ticklen=10,
                tickwidth=4,
                tickcolor="#FFFFFF",
                outlinewidth=2,
                outlinecolor="#FFFFFF",
                orientation="h",
                yanchor="middle",
                yref="container",
                y=0.1,
            ),
        )
    )

    usFigure.update_layout(
        map_style="carto-positron-nolabels",
        width=800,
        map_zoom=zoom,
        map_center=center,
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
    )
    return [usFigure, options, clickedCountyName + " yearly averages", newCountyDataPlot]

In [6]:
# Update the county map based on state click
@app.callback(
    [
        Output(component_id="StateClickDataTitle", component_property="children"),
        Output(component_id="CountyMap", component_property="figure"),
        Output(component_id="CountyPlot", component_property="figure"),
    ],
    [
        Input(component_id="StateMap", component_property="clickData"),
        Input(component_id="USMapSelection", component_property="value"),
        Input(component_id="YearSelection", component_property="value"),
        Input(component_id="StateMapSelection", component_property="value"),
    ],
)
def updateOnStateClick(clickData, USSelection, yearSelection, stateSelection):
    """Draw the county-level map and math-score plot for a clicked county.

    Parameters
    ----------
    clickData : dict or None
        Last click on the state map; None until a county has been clicked.
    USSelection : str
        Metric column used to colour the county map.
    yearSelection : str
        Two-digit year key ("19" .. "23") selecting the "<yy>DF" frame.
    stateSelection : str or None
        Key into stateData; None when the dropdown has been cleared.

    Returns
    -------
    list
        [county name, county map figure, county scatter figure].
    """
    # Resolve the fallback BEFORE touching stateData: a cleared dropdown
    # (None) or Arkansas in year "20" has no frame to look up. The year is a
    # string, so it must be compared with "20", not the integer 20.
    if stateSelection is None or (
        yearSelection == "20" and stateSelection == "Arkansas"
    ):
        stateSelection = "Texas"

    df = stateData[stateSelection][yearSelection + "DF"]

    # Default to the first county in the frame. This is positional: the row
    # labelled 0 may have been dropped while the data was cleaned.
    clickedCountyCode = df["county_code"].iloc[0]

    if clickData and clickData.get("points"):
        newCountyCode = clickData["points"][0].get("location")
        # Ignore stale clicks: the county must belong to the selected state
        # and have at least one row in the selected year.
        if (
            isinstance(newCountyCode, str)
            and getStateFromCode(newCountyCode) == stateSelection
            and (df["county_code"] == newCountyCode).any()
        ):
            clickedCountyCode = newCountyCode

    newCounty = df[
        (df["county_code"] == clickedCountyCode) & (df["Year"] == 2000 + int(yearSelection))
    ]
    clickedCounty = newCounty["county_name_x"].iloc[0]

    # Find the aggregated data from this state
    newCountyData = stateData[stateSelection]["AggregatedDF"]

    # Restrict the data to the county which was clicked
    newCountyData = newCountyData[newCountyData["county_code"] == clickedCountyCode]

    newCountyDataPlot = px.scatter(
        newCountyData, x="Year", y="Math Scores", width=400, height=300, range_x=[2018, 2024], range_y=[0, 800]
    )
    newCountyDataPlot.update_layout(
        plot_bgcolor="#FFFFFF"
    )
    newCountyDataPlot.update_xaxes(gridcolor="#d4dadc")
    newCountyDataPlot.update_yaxes(gridcolor="#d4dadc")
    newCountyDataPlot.update_traces(marker=dict(color=plotMarkerColor))

    # Named countyFigure rather than `map` to avoid shadowing the builtin.
    # The GeoJSON holds only the clicked county, so only it is drawn.
    countyFigure = go.Figure(
        px.choropleth(
            df,
            geojson=createCountyGeo(stateData[stateSelection], clickedCounty),
            color=USSelection,
            locations="county_code",
            color_continuous_scale="Spectral",
        )
    )
    # Mark the rows of the clicked county at their latitude/longitude
    countyFigure.add_trace(
        go.Scattergeo(
            lat=newCounty["latitude"],
            lon=newCounty["longitude"],
            mode="markers",
            marker=dict(color=mapMarkerColor),
            name=""
        )
    )

    countyFigure.update_layout(
        geo_scope="usa",
    )
    countyFigure.update_geos(
        visible=False,
        resolution=50,
        fitbounds="locations",
    )
    countyFigure.update_layout(
        map_center=stateData[stateSelection]["StateCenter"],
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        width=300,
        height=200,
        dragmode=False,
        coloraxis_showscale=False,
    )

    return [clickedCounty, countyFigure, newCountyDataPlot]

In [7]:
# Start the Dash app's server on port 8052
app.run(port=8052)