In [22]:
import altair as alt
from vega_datasets import data
import requests
import geopandas as gpd
import json
import re
'''
Code to load, clean, and combine all relevant data.
See README for references re: data.
'''
import glob
import pandas as pd

#############
# CONSTANTS #
#############


# useful constants for accessing files
AGE_PATH_REGEX = 'clean/*_age.csv'
SEX_PATH_REGEX = 'clean/*_sexrace.csv'
LAWS_DATA_PATH = 'clean/supression.csv'

# useful constants for renaming and removing columns
AGE_COLUMN_NAMES = [
    'state', 'age_bracket', 'total', 'total_reg', 'percent_reg',
    'ci_reg', 'total_voted', 'percent_voted', 'ci_voted', 'yr'
]

SEX_COLUMN_NAMES = [
    'state', 'group', 'pop', 'total_cit', 'percent_cit', 'ci_cit',
    'total_reg', 'percent_reg', 'ci_reg', 'total_voted', 'percent_voted',
    'ci_voted', 'yr'
]

KEEP_AGE_COLUMNS = [
    'state', 'age_bracket', 'total', 'total_reg', 'total_voted', 
    'percent_reg', 'percent_voted', 'yr'
]

KEEP_SEX_COLUMNS = ['state', 'group', 'total_cit', 'total_reg', 'total_voted', 'yr']

# useful constants for standardizing state labels 
STATE_NAMES = ['ALABAMA', 'ALASKA', 'ARIZONA', 'ARKANSAS', 'CALIFORNIA', 
    'COLORADO', 'CONNECTICUT', 'DELAWARE', 'DISTRICT OF COLUMBIA', 'FLORIDA', 
    'GEORGIA', 'HAWAII', 'IDAHO', 'ILLINOIS', 'INDIANA', 'IOWA', 'KANSAS', 
    'KENTUCKY', 'LOUISIANA', 'MAINE', 'MARYLAND', 'MASSACHUSETTS', 
    'MICHIGAN', 'MINNESOTA', 'MISSISSIPPI', 'MISSOURI', 'MONTANA', 'NEBRASKA', 
    'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY', 'NEW MEXICO', 'NEW YORK',
    'NORTH CAROLINA', 'NORTH DAKOTA', 'OHIO', 'OKLAHOMA', 'OREGON',
    'PENNSYLVANIA', 'RHODE ISLAND', 'SOUTH CAROLINA', 'SOUTH DAKOTA',
    'TENNESSEE', 'TEXAS', 'UTAH', 'VERMONT', 'VIRGINIA', 'WASHINGTON',
    'WEST VIRGINIA', 'WISCONSIN', 'WYOMING'
]

# note these integers are related to US Census Bureau ordering 
STATE_NUMS = [1,  2,  4,  5,  6,  8,  9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
    38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56,
]

STATES_TABLE = list(zip(STATE_NAMES, STATE_NUMS))


#############
# FUNCTIONS #
#############


def get_age_df(file_path):
    '''
    Load age data by file name, transform into pd.DataFrame and
    return DataFrame object with only relevant columns and cleaned
    values.

    Note: NaN values are kept for possible use in modeling/viz.

    Args:
        file_path: str, designating file to retrieve for a given year

    Returns:
        pd.DataFrame, processed age data for that year
    '''

    # access, label, and sanitize data
    df = pd.read_csv(file_path, header=0, names=AGE_COLUMN_NAMES)
    df = df[KEEP_AGE_COLUMNS]
    df.state = df.state.str.upper()
    df.yr = df.yr.astype(str)
    df.total = df.total.replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df.total_reg = df.total_reg.replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df.total_voted = df.total_voted.replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df.age_bracket = df.age_bracket.map(lambda x: x.lstrip('.'))
    return df


def get_sexrace_df(file_path):
    '''
    Load sexrace data by file name, transform into pd.DataFrame
    and return DataFrame object with only relevant columns and cleaned
    values.

    Note: NaN values are kept for possible use in modeling/viz.

    Args:
        file_path: str, file for which to retrieve sexrace data

    Returns:
        pd.DataFrame, processed sexrace data for that year
    '''

    # access, label, and sanitize data
    df = pd.read_csv(file_path, header=0, names=SEX_COLUMN_NAMES)
    df = df[KEEP_SEX_COLUMNS]
    df.state = df.state.str.upper()
    df.yr = df.yr.astype(str)
    df.total_cit = df.total_cit.replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df.total_reg = df.total_reg.replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df.total_voted = df.total_voted.replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df.group = df.group.map(lambda x: x.lstrip('.'))
    return df


def combine_age_data():
    '''
    Retrieve all age-related data files, combine into one pd.DataFrame
    object, and attach legislative data columns.

    Args:   None

    Returns:
        pd.DataFrame, combined age data for all years
    '''

    # retrieve all relevant file paths
    age_file_paths = glob.glob(AGE_PATH_REGEX)
    df_list = []

    # iterative store pd.DataFrame representation of each
    for age_file in age_file_paths:
        print('Reading %s...' % age_file)
        df = get_age_df(age_file)
        df_list.append(df)

    # concatenate and attach legislative data
    combined = pd.concat(df_list, axis=0, ignore_index=True)
    laws_df = pd.read_csv(LAWS_DATA_PATH)
    laws_df.STATE = laws_df.STATE.str.upper()

    result_df = combined.merge(laws_df,
                               how='outer',
                               left_on='state',
                               right_on='state')
    
    # make nationwide labels consistent and finish 
    result_df.state.loc[result_df.state == 'US'] = 'NATIONAL'
    result_df.state.loc[result_df.state == 'UNITED STATES'] = 'NATIONAL'
    return result_df


def combine_sexrace_data():
    '''
    Retrieve all sexrace-related data files, combine into one
    pd.DataFrame object, and attach legislative data columns.

    Args:   None

    Returns:
        pd.DataFrame, combined sexrace data for all years
    '''

    # retrieve all relevant file paths
    sex_file_paths = glob.glob(SEX_PATH_REGEX)
    df_list = []

    # iterative store pd.DataFrame representation of each
    for sex_file in sex_file_paths:
        print('Reading %s...' % sex_file)
        df = get_sexrace_df(sex_file)
        df_list.append(df)

    # concatenate and attach legislative data
    combined = pd.concat(df_list, axis=0, ignore_index=True)
    laws_df = pd.read_csv(LAWS_DATA_PATH)
    laws_df.STATE = laws_df.STATE.str.upper()

    result_df = combined.merge(laws_df,
                               how='outer',
                               left_on='state',
                               right_on='state')
    
    # make nationwide labels consistent and finish 
    result_df.state.loc[result_df.state == 'US'] = 'NATIONAL'
    result_df.state.loc[result_df.state == 'UNITED STATES'] = 'NATIONAL'
    return result_df


def homogenize_age_data(df_in):
    """
    Structures the age data by creating the desired age groups
    of 'Total','18 to 44', '45 to 65', '65+' into a DataFrame.

    Args:
        df_in: Dataframe created by function combine_age_data()

    Returns:
        pd.DataFrame, age bracket structured data for all years
    """
    df_states = pd.DataFrame(STATES_TABLE, columns=['state', 'id'])
    df_states.state = df_states.state.str.capitalize()

    # splitting inconsistent age brackets into many DFs
    df = df_in.copy()
    df_65plus = df.loc[(df.age_bracket == '65 to 74')
        | (df.age_bracket == '75+')
        | (df.age_bracket == '65 to 75')
        | (df.age_bracket == '65+'),
    ]

    df_45_64 = df.loc[(df.age_bracket == '45 to 64')
        | (df.age_bracket == '45 to 55')
        | (df.age_bracket == '55 to 65')
        | (df.age_bracket == '45 to 65'),
    ]

    df_18_44 = df.loc[(df.age_bracket == '18 to 24')
        | (df.age_bracket == '18 to 25')
        | (df.age_bracket == '25 to 44')
        | (df.age_bracket == '25 to 35')
        | (df.age_bracket == '35 to 45')
        | (df.age_bracket == '25 to 45')
        | (df.age_bracket == '25 to 34')
        | (df.age_bracket == '35 to 44'),
    ]

    df_total = df.loc[df.age_bracket == 'Total',]

    # iteratively group these DFs by state and year
    df_list = [df_total, df_18_44, df_45_64, df_65plus]
    age_brackets = ['Total','18 to 44', '45 to 65', '65+']
    result = []

    for i, df in enumerate(df_list):
        df = df.groupby(['state', 'yr'], sort=False).sum().reset_index()
        df['age_bracket'] = age_brackets[i]
        result.append(df)

    # recombine all age bracket DFs
    result = pd.concat(result, axis=0, ignore_index=True)
    result.sort_values(['yr', 'state'], inplace=True)

    # compute voter turnout metric
    result['percent_reg'] = result.total_reg / result.total
    result['percent_voted'] = result.total_voted / result.total

    # refomatting
    result.yr = result.yr.astype(int)
    result.state = result.state.str.capitalize()
    result = result.rename(columns={'age_bracket':'group'})

    # attach our state labels/IDs and finish
    result = pd.merge(result, df_states, left_on='state', right_on='state')
    return result


def homogenize_sexrace_data(df_in):
    """
    Structures the age data by creating the desired age groups
    of 'Total','18 to 44', '45 to 65', '65+' into a DataFrame.

    Args:
        df_in: Dataframe created by function combine_age_data()

    Returns:
        pd.DataFrame, age bracket structured data for all years
    """
    df_states = pd.DataFrame(STATES_TABLE, columns=['state', 'id'])
    df_states.state = df_states.state.str.capitalize()
    
    # useful constants for renaming relevant demographic groups
    ORIGINAL_GROUPS = ['Total', 'Male', 'Female', 'N-H White','N-H Black',
                       'API', 'Hispanic', 'Non-Hispanic White', 'Non-Hispanic Black',
                       'Asian and Pacific Islander','White non-Hispanic alone',
                       'Black alone', 'Asian alone','Hispanic (of any race)']
    RENAME_GROUPS = ['Total', 'Male', 'Female', 'White', 'Black',
                     'Asian & Pacific Islander','Hispanic', 'White',
                     'Black', 'Asian & Pacific Islander','White','Black',
                     'Asian & Pacific Islander','Hispanic']
    
    # useful constants for the 'total's columns to work with
    TOTALS_COLUMNS = ['total_cit', 'total_reg', 'total_voted']

    # interatively renaming demographic groups
    df = df_in.copy()
    for (og, rm) in zip(ORIGINAL_GROUPS, RENAME_GROUPS):
        df.group.loc[df.group == og] = rm

    # keeping relevant groups and setting years as str
    df_groups_kept = df.loc[df.group.isin(RENAME_GROUPS)]
    df_groups_kept.yr = df_groups_kept.yr.astype(float).astype(int).astype(str)

    # interatively validating the 'total's values 
    for col in TOTALS_COLUMNS:
        df_temp = df_groups_kept.pivot_table(index=['state','yr'], columns='group', values=col)
        df_temp = df_temp.reset_index()
        totals = df_temp[['Male', 'Female']].sum(axis=1)
        df_temp.Total = totals
        df_temp_unpivot = df_temp.melt(id_vars=['state','yr'], value_name=col)
        if col == 'total_cit':
            df_merge = df_temp_unpivot.copy()
        else:
            next
        df_merge = pd.merge(df_merge, df_temp_unpivot, how='left',
                           left_on=['state', 'yr', 'group'], right_on=['state', 'yr', 'group'])

    laws_df = pd.read_csv(LAWS_DATA_PATH)
    laws_df.state = laws_df.state.str.upper()

    results = df_merge.merge(laws_df,
                               how='outer',
                               left_on='state',
                               right_on='state')
    # reformating values and column names
    df_merge_kept = results.drop('total_cit_y', axis=1)
    df_merge_kept.state = df_merge_kept.state.str.capitalize()
    df_demo_out = df_merge_kept.rename(columns={"total_cit_x":'total'})
    df_demo_out = df_demo_out.sort_values(by=['yr', 'state']).round()
    df_demo_out.yr = df_demo_out.yr.astype(int)
    
    # calculating the percentage of voter turnout totals
    df_demo_out['percent_reg'] = df_demo_out.total_reg / df_demo_out.total
    df_demo_out['percent_voted'] = df_demo_out.total_voted / df_demo_out.total
        
    # attach our state labels/IDs and finish
    df_demo_out = pd.merge(df_demo_out, df_states, left_on='state', right_on='state')
    
    return df_demo_out




def get_json_shape(url='https://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_040_00_5m.json'):
    """
    Download the json data from the given url link. Specific to US continental geoshapes.
    
    Args:
        url (optional): url link to json file
    Returns:
        json structure
    """
    req = requests.get(url)
    return req.json()

def create_geodataframe(df_in):
    """
    Creates a GeoPandas dataframe with the geo location and shapes
    of US States merged with the povided datafram of state data.
    
    Args:
        df_in: pandas dataframe, must have 'state' column for US States
    Returns:
        Geopandas dataframe
    """
    # get the shape of the states
    states_json = get_json_shape()

    # set Geopandas dataframe
    gpd_states = gpd.GeoDataFrame.from_features(states_json)

    # make names of states all uppper case
    gpd_states['NAME'] = gpd_states['NAME'].str.upper()

    gpd_states['centroid_lon'] = gpd_states['geometry'].centroid.x
    gpd_states['centroid_lat'] = gpd_states['geometry'].centroid.y
    
    # merge states data and states geopandas dfs
    gpd_states.NAME = gpd_states.NAME.str.capitalize()
    states_merged = pd.merge(df_in, gpd_states, left_on='state', right_on='NAME')
    
    # convert to GeoPandas Dataframe
    gpd_merged = gpd.GeoDataFrame(states_merged)
    return gpd_merged

def to_geojson(gpd_df_in):
    """
    Converts GeoPandas dataframe to GeoJson to be used in Altair Viz
    
    Args:
        gpd_df_in: GeoPandas dataframe
    Returns:
        Altair json data structure
    """
    #convert back to GeoJson to plot in altair
    choro_json = json.loads(gpd_df_in.to_json())
    choro_data = alt.Data(values=choro_json['features'])
    return choro_data

In [7]:
import altair as alt
from vega_datasets import data
import requests
import geopandas as gpd
import json
import re

# A dropdown filter
categories_age = ['Total', '18 to 44', '45 to 65', '65+']
catage_dropdown = alt.binding_select(options=categories_age)
cat_select_age = alt.selection_single(fields=['group'],
                                  bind=catage_dropdown,
                                  name="Demographic",
                                  init={'group':'Total'})
categories_demo = ['Total', 'Male', 'Female', 'White', 'Black',
                     'Asian & Pacific Islander','Hispanic']
catdemo_dropdown = alt.binding_select(options=categories_demo)
cat_select_demo = alt.selection_single(fields=['group'],
                                  bind=catdemo_dropdown,
                                  name="Demographic",
                                  init={'group':'Total'})


# A slider filter
slider = alt.binding_range(min=2000, max=2018, step=2, name='Election Year')
select_yr = alt.selection_single(name='SelectorName', fields=['yr'],
                                   bind=slider, init={'yr': 2000})



def us_map_chart(df_in, map_value, map_title,selection_link=select_yr):
    df = df_in.copy()
    PIVOT_COLUMNS = ['state','id','group','yr']
    columns_keep = PIVOT_COLUMNS + [map_value]
    year_columns = [str(year) for year in range(2000, 2019, 2)]
    
    df_pivot = df[columns_keep].pivot_table(index=['id','state','group'], 
                                                                             columns='yr', values=map_value)
    mapdf = df_pivot.reset_index()
    mapdf.columns = mapdf.columns.astype(str)
    
    states = alt.topo_feature(data.us_10m.url, 'states')
    states['url'] = 'https://raw.githubusercontent.com/vega/vega/master/docs/data/us-10m.json'

    map_chart=alt.Chart(states).mark_geoshape(
    stroke='black',
    strokeWidth=0.05
    ).project(
        type='albersUsa'
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(mapdf.loc[mapdf.group=='Total'], 'id', ['state']+year_columns)
    ).transform_fold(
        year_columns, as_=['yr', 'Percent']
    ).transform_calculate(
        yr='parseInt(datum.yr)',
        Percent='isValid(datum.Percent) ? datum.Percent : -1'  
    ).encode(
        tooltip=['state:N','Percent:Q'],
        color = alt.condition(
            'datum.Percent > 0',
            alt.Color('Percent:Q', scale=alt.Scale(domain=[0.2,.9],scheme='yellowgreenblue', type='linear')),
            alt.value('#dbe9f6')
        )).add_selection(
        selection_link
    ).properties(
        title=map_title,
        width=300,
        height=200
    ).transform_filter(
        selection_link
    )
    
    return map_chart

def scatter_turnout(df_in, x_value, y_value, color_variable, title, x_title, y_title,
                    select_slider, select_dropdown):
    
    # A selection for interval highlighing on charts
    highlight = alt.selection_interval(encodings=['x'])
    color = alt.Color(color_variable)
    click = alt.selection_multi(encodings=['color'])
    
    df = df_in.copy()
    
    scatter = alt.Chart().mark_point().encode(
        x=alt.X(x_value, title=x_title,
               scale=alt.Scale(domain=[.15, .96])),
        y=alt.Y(y_value, title=y_title,
               scale=alt.Scale(domain=[.15, .93])),
        size=alt.Size('total:Q', title='Total Eligible Voters'),
        color=alt.condition(highlight, color_variable, alt.value('lightgray'), legend=None),
        tooltip=[alt.Tooltip('state:N', title='State'),
                 alt.Tooltip('total:Q', title='Total Eligible Voters'),
                 alt.Tooltip('total_reg:Q', title='Percent Registered Voters'),
                 alt.Tooltip('total_voted:Q', title='Percent Voted')]
    ).add_selection(
        select_slider
    ).transform_filter(
        select_slider
    ).add_selection(
        select_dropdown
    ).transform_filter(
        select_dropdown
    ).add_selection(
        highlight
    ).transform_filter(
        click
    ).properties(
        width=500,
        height=275
    )
    
    
    bars = alt.Chart().mark_bar().encode(
        x='count()',
        y=alt.Y(color_variable, title='Restrictive Laws'),
        color=alt.condition(click, color, alt.value('lightgray'))
    ).transform_filter(
        highlight
    ).transform_filter(
        select_yr
    ).transform_filter(
        select_dropdown
    ).properties(
        width=500,
        height=100
    ).add_selection(
        click
    )

    return alt.vconcat(scatter, bars, data=df, title=title)

In [11]:
import pandas as pd
import glob

#############
# CONSTANTS #
#############

# filepath expressions for data
AGE_FILES = 'clean/*_age.csv'
SEXRACE_FILES = 'clean/*_sexrace.csv'
LAWS_DATA_PATH = 'clean/supression.csv'

'''# useful constants for renaming and removing columns
AGE_COLUMN_NAMES = [
    'state', 'age_bracket', 'total', 'total_reg', 'percent_reg',
    'ci_reg', 'total_voted', 'percent_voted', 'ci_voted', 'yr'
]

SEX_COLUMN_NAMES = [
    'STATE', 'Group', 'Population (18+)', 'Total Citizen', 'Percent Citizen', 'CI Citizen',
    'Total Registered', 'Percent Registered (18+)', 'CI Registered', 'Total Voted', 'Percent Voted (18+)',
    'CI Voted', 'Year'
]'''

# desired columns to subset from data
KEEP_AGE_COLUMNS = [
    'STATE', 'Age', 'Total', 'Total Registered', 'Percent registered (18+)', 
    'CI Registered', 'Total Voted', 'Percent voted (18+)', 'CI Voted', 'Year'
]

KEEP_SEX_COLUMNS = [
    'STATE', 'Group', 'Population (18+)', 'Total Citizen', 'Percent Citizen', 'CI Citizen',
    'Total Registered', 'Percent Registered (18+)', 'CI Registered', 'Total Voted', 'Percent Voted (18+)',
    'CI Voted', 'Year'
]

# desired state labels
STATE_NAMES = ['ALABAMA', 'ALASKA', 'ARIZONA', 'ARKANSAS', 'CALIFORNIA', 
    'COLORADO', 'CONNECTICUT', 'DELAWARE', 'DISTRICT OF COLUMBIA', 'FLORIDA', 
    'GEORGIA', 'HAWAII', 'IDAHO', 'ILLINOIS', 'INDIANA', 'IOWA', 'KANSAS', 
    'KENTUCKY', 'LOUISIANA', 'MAINE', 'MARYLAND', 'MASSACHUSETTS', 
    'MICHIGAN', 'MINNESOTA', 'MISSISSIPPI', 'MISSOURI', 'MONTANA', 'NEBRASKA', 
    'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY', 'NEW MEXICO', 'NEW YORK',
    'NORTH CAROLINA', 'NORTH DAKOTA', 'OHIO', 'OKLAHOMA', 'OREGON',
    'PENNSYLVANIA', 'RHODE ISLAND', 'SOUTH CAROLINA', 'SOUTH DAKOTA',
    'TENNESSEE', 'TEXAS', 'UTAH', 'VERMONT', 'VIRGINIA', 'WASHINGTON',
    'WEST VIRGINIA', 'WISCONSIN', 'WYOMING', 'NATIONAL'
]

# note these integers are related to US Census Bureau ordering 
STATE_NUMS = [1,  2,  4,  5,  6,  8,  9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
    38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 0
]

STATES_TABLE = list(zip(STATE_NAMES, STATE_NUMS))


#############
# FUNCTIONS #
#############


def get_age_df(file_path):
    '''
    Load age data by file name into pd.DataFrame and
    return DataFrame object with select columns and cleaned
    values.

    Note: NaN values are kept for possible use in modeling/viz.

    Args:
        file_path: str, designating file to retrieve for a given year

    Returns:
        pd.DataFrame, processed age data for that year
    '''

    # load and subset data
    df = pd.read_csv(file_path, header=0)
    df = df[KEEP_AGE_COLUMNS]

    # clean up format and unwanted punctuation
    df["STATE"] = df["STATE"].str.upper()
    df["Year"] = df["Year"].astype(str)
    df['Total'] = df['Total'].replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df['Total Registered'] = df['Total Registered'].replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df['Total Voted'] = df['Total Voted'].replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df['Age'] = df['Age'].map(lambda x: x.lstrip('.'))

    return df


def get_sexrace_df(file_path):
    '''
    Load sexrace data by file name into pd.DataFrame
    and return DataFrame object with select columns and cleaned
    values.

    Note: NaN values are kept for possible use in modeling/viz.

    Args:
        file_path: str, file for which to retrieve sexrace data

    Returns:
        pd.DataFrame, processed sexrace data for that year
    '''

    # load and subset data
    df = pd.read_csv(file_path, header=0)
    df = df[KEEP_SEX_COLUMNS]

    # clean up format and unwanted punctuation
    df["STATE"] = df["STATE"].str.upper()
    df["Year"] = df["Year"].astype(str)
    df["Total Citizen"] = df["Total Citizen"].replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df["Total Registered"] = df["Total Registered"].replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df["Total Voted"] = df["Total Voted"].replace(',','', regex=True).apply(pd.to_numeric, errors='coerce')
    df["Group"] = df["Group"].map(lambda x: x.lstrip('.'))

    return df


def combine_age_data(file_expression=AGE_FILES, law_filepath=LAWS_DATA_PATH):
    '''
    Generate all age-related dataframes, combine into one, 
    and attach legislative data columns.

    Args:
        file_expression: str, regex to capture desired age files
        law_filepath: str, filepath to legislation data

    Returns:
        pd.DataFrame, combined age data for all years
    '''

    # retrieve all relevant file paths
    age_file_paths = glob.glob(file_expression)
    df_list = []

    # generate a dataframe from each file and combine
    for age_file in age_file_paths:
        print('Reading %s...' % age_file)
        df = get_age_df(age_file)
        df_list.append(df)
    combined = pd.concat(df_list, axis=0, ignore_index=True)

    # load legislative data
    laws_df = pd.read_csv(law_filepath)
    laws_df["STATE"] = laws_df["STATE"].str.upper()

    # make nationwide labels consistent
    combined["STATE"].loc[combined["STATE"] == 'US'] = 'NATIONAL'
    combined["STATE"].loc[combined["STATE"] == 'UNITED STATES'] = 'NATIONAL'
    laws_df["STATE"].loc[laws_df["STATE"] == 'US'] = 'NATIONAL'

    # attach legislative rating to age data
    result_df = combined.merge(laws_df,
                               how='outer',
                               left_on='STATE',
                               right_on='STATE')

    return result_df


def combine_sexrace_data(file_expression=SEXRACE_FILES, law_filepath=LAWS_DATA_PATH):
    '''
    Retrieve all sexrace-related data files, combine into one
    pd.DataFrame object, and attach legislative data columns.

    Args:
        file_expression: str, regex to capture desired age files
        law_filepath: str, filepath to legislation data

    Returns:
        pd.DataFrame, combined sexrace data for all years
    '''

    # retrieve all relevant file paths
    sex_file_paths = glob.glob(file_expression)
    df_list = []

    # generate a dataframe from each file and combine
    for sex_file in sex_file_paths:
        print('Reading %s...' % sex_file)
        df = get_sexrace_df(sex_file)
        df_list.append(df)
    combined = pd.concat(df_list, axis=0, ignore_index=True)

    # load legislative data
    laws_df = pd.read_csv(law_filepath)
    laws_df["STATE"] = laws_df["STATE"].str.upper()

    # make nationwide labels consistent
    combined["STATE"].loc[combined["STATE"] == 'US'] = 'NATIONAL'
    combined["STATE"].loc[combined["STATE"] == 'UNITED STATES'] = 'NATIONAL'
    laws_df["STATE"].loc[laws_df["STATE"] == 'US'] = 'NATIONAL'

    # attach legislative rating to sex/race data
    result_df = combined.merge(laws_df,
                               how='outer',
                               left_on='STATE',
                               right_on='STATE')
    
    return result_df


def homogenize_age_data(df):
    """
    Structures the age data by creating the desired age groups
    of 'Total','18 to 44', '45 to 65', '65+' into a DataFrame.

    Args:
        df: Dataframe created by function combine_age_data()

    Returns:
        pd.DataFrame, age bracket structured data for all years
    """
    # set up a table for the states
    df_states = pd.DataFrame(STATES_TABLE, columns=['STATE', 'id'])
    df_states["STATE"] = df_states["STATE"].str.upper()

    # combine existing age brackets for uniformity
    df_65plus = df.loc[(df["Age"] == '65 to 74')
        | (df["Age"] == '75+')
        | (df["Age"] == '65 to 75')
        | (df["Age"] == '65+'),
    ]
    df_45_64 = df.loc[(df["Age"] == '45 to 64')
        | (df["Age"] == '45 to 55')
        | (df["Age"] == '55 to 65')
        | (df["Age"] == '45 to 65'),
    ]
    df_18_44 = df.loc[(df["Age"] == '18 to 24')
        | (df["Age"] == '18 to 25')
        | (df["Age"] == '25 to 44')
        | (df["Age"] == '25 to 35')
        | (df["Age"] == '35 to 45')
        | (df["Age"] == '25 to 45')
        | (df["Age"] == '25 to 34')
        | (df["Age"] == '35 to 44'),
    ]
    df_total = df.loc[df["Age"] == 'Total',]

    # iteratively group these DFs by state and year
    df_list = [df_total, df_18_44, df_45_64, df_65plus]
    age_brackets = ['Total','18 to 44', '45 to 65', '65+']
    combined = []

    for i, df in enumerate(df_list):
        df = df.groupby(['STATE', 'Year'], sort=False).sum().reset_index()
        df['Age'] = age_brackets[i]
        combined.append(df)

    # recombine all age bracket DFs
    result = pd.concat(combined, axis=0, ignore_index=True)
    result.sort_values(['Year', 'STATE'], inplace=True)

    # compute voter turnout metric
    result['percent_reg'] = result['Total Registered'] / result['Total']
    result['percent_voted'] = result['Total Voted'] / result['Total']

    # refomatting
    result['Year'] = result['Year'].astype(int)
    result['STATE'] = result['STATE'].str.upper()

    # attach our state labels/IDs
    result = df_states.merge(result, how='outer', left_on='STATE', right_on='STATE')
    return result

In [23]:
sexracedf = combine_sexrace_data()
sexrace_dataframe = homogenize_sexrace_data(sexracedf)

Reading clean/2000_sexrace.csv...
Reading clean/2002_sexrace.csv...
Reading clean/2004_sexrace.csv...
Reading clean/2006_sexrace.csv...
Reading clean/2008_sexrace.csv...
Reading clean/2010_sexrace.csv...
Reading clean/2012_sexrace.csv...
Reading clean/2014_sexrace.csv...
Reading clean/2016_sexrace.csv...
Reading clean/2018_sexrace.csv...


KeyError: 'state'

In [19]:
agedf = combine_age_data()
age_dataframe = homogenize_age_data(agedf)

Reading clean/2000_age.csv...
Reading clean/2002_age.csv...
Reading clean/2004_age.csv...
Reading clean/2008_age.csv...
Reading clean/2010_age.csv...
Reading clean/2012_age.csv...
Reading clean/2014_age.csv...
Reading clean/2016_age.csv...
Reading clean/2018_age.csv...


FileNotFoundError: [Errno 2] File clean/suppression.csv does not exist: 'clean/suppression.csv'

In [None]:
sexrace_dataframe

In [13]:
age_dataframe

Unnamed: 0,STATE,id,Year,Total,Total Registered,Total Voted,restrictive_id_laws,felony_disenfranchisement,Age,percent_reg,percent_voted
0,ALABAMA,1,2000,3233.0,2411,1953,1.0,4.0,Total,0.745747,0.604083
1,ALABAMA,1,2000,1676.0,1165,927,2.0,8.0,18 to 44,0.695107,0.553103
2,ALABAMA,1,2000,1056.0,834,701,1.0,4.0,45 to 65,0.789773,0.663826
3,ALABAMA,1,2000,501.0,413,325,2.0,8.0,65+,0.824351,0.648703
4,ALABAMA,1,2002,3215.0,2347,1585,1.0,4.0,Total,0.730016,0.493002
...,...,...,...,...,...,...,...,...,...,...,...
1846,NATIONAL,0,2016,48684.0,36667,33314,0.0,0.0,65+,0.753163,0.684291
1847,NATIONAL,0,2018,249748.0,153066,122281,0.0,0.0,Total,0.612882,0.489618
1848,NATIONAL,0,2018,114546.0,59966,43312,0.0,0.0,18 to 44,0.523510,0.378119
1849,NATIONAL,0,2018,83277.0,55032,45829,0.0,0.0,45 to 65,0.660831,0.550320


In [None]:
us_map_chart(sexrace_dataframe, map_value='percent_voted',
             map_title='Percent Voted')

In [None]:
scatter_turnout(df_in=sexrace_dataframe,x_value='percent_reg:Q',y_value='percent_voted:Q',
               color_variable='restrictive_id_laws:N',title='',
               y_title='Percent Voted', x_title='Percent Registered', select_slider=select_yr,
                select_dropdown=cat_select_demo)

In [14]:
map1 = us_map_chart(age_dataframe, map_value='percent_voted',
             map_title='Percent Voted')

KeyError: "['state', 'group', 'yr'] not in index"

In [None]:
map2 = us_map_chart(age_dataframe, map_value='percent_reg',
             map_title='Percent Registered')

In [None]:
map1 & map2

In [None]:
turnout = scatter_turnout(df_in=age_dataframe,x_value='percent_reg:Q',y_value='percent_voted:Q',
               color_variable='restrictive_id_laws:N',title='',
               y_title='Percent Voted', x_title='Percent Registered', select_slider=select_yr,
                         select_dropdown=cat_select_age)

In [None]:
turnout

In [None]:
alt.hconcat(turnout, (map1 & map2), title='Voter Turnout by Age Group').save('Age_Turnout_Prototype.html')

In [None]:
color_variable='restrictive_id_laws:N'

highlight2 = alt.selection_interval(encodings=['x'])
color = alt.Color(color_variable)
click = alt.selection_multi(encodings=['color'])
legend_select = alt.selection_multi(fields=['total'], bind='legend')


alt.Chart(sexrace_dataframe).mark_point().encode(
        x=alt.X('percent_reg:Q', title='',
               scale=alt.Scale(domain=[.15, .96])),
        y=alt.Y('percent_voted:Q', title='',
               scale=alt.Scale(domain=[.15, .93])),
        size=alt.Size('total:Q', title='Total Eligible Voters'),
        color='restrictive_id_laws:N',
        opacity=alt.condition(legend_select, alt.value(2), alt.value(0.1)),
        tooltip=[alt.Tooltip('state:N', title='State'),
                 alt.Tooltip('total:Q', title='Total Eligible Voters'),
                 alt.Tooltip('total_reg:Q', title='Percent Registered Voters'),
                 alt.Tooltip('total_voted:Q', title='Percent Voted')]
    ).add_selection(
        select_yr
    ).transform_filter(
        select_yr
    ).properties(
        width=500,
        height=275
    ).add_selection(
        legend_select
    )

In [None]:
age_geo = create_geodataframe(age_dataframe)

In [None]:
age_geo

In [None]:
states = alt.topo_feature(data.us_10m.url, feature='states')
background = alt.Chart(states).mark_geoshape(
    fill='lightgray',
    stroke='white'
).properties(
    width=500,
    height=300
).project('albersUsa')

points = alt.Chart(age_geo).mark_circle(
    size=10,
).encode(
    longitude='centroid_lon:Q',
    latitude='centroid_lat:Q',
    size=alt.Size('total_voted:Q', scale=alt.Scale(type='linear')),
    tooltip=['state:N','percent_voted:Q']
).project('albersUsa').add_selection(
        select_yr
    ).transform_filter(
        select_yr
    )

background + points


In [None]:
alt.Chart(states).mark_geoshape().encode(
    color='restrictive_id_laws:N'
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(age_geo, 'id', ['restrictive_id_laws'])
).project(
    type='albersUsa'
).properties(
    width=500,
    height=300
)