In [38]:
import polars as pl
import plotly.express as px
import plotly.graph_objects as go 
from dash import Dash, dcc
import dash_ag_grid as dag

ne_states =[
    'CT', 'MA', 'ME', 'NH', 'NJ', 'NY', 'PA', 'RI', 'VT']
midwest_states = [
    'IA', 'IL', 'IN', 'KS', 'MI', 'MN', 'MS', 'ND', 'NE', 
    'OH', 'SD', 'WI']
southern_states = [
    'AL', 'AR', 'DC', 'DE', 'FL', 'GA', 'KY', 'LA', 'MD', 'MS', 'NC', 'OK', 'SC',
    'TN', 'TX', 'VA', 'WV']
western_states = [
    'AK', 'AZ', 'CA', 'CO', 'HI', 'ID', 'MT', 'NM', 'NV',
    'OR', 'UT', 'WA', 'WY']
all_states = sorted(
    ne_states + midwest_states + southern_states + western_states)

print(f'{len(all_states) = }')
print(f'{all_states = }')

#-------------------------------------------------------------------------------
#  91.4% of data set entries are from the United States; 7.5% are from "No state
#  or country". England has the second highest number or entries for any country
#  0.26% of entries. This visualization will use USA data only. A dashboard 
#  similar to the sample code have dash_ag table, an overlay histogram with data
#  grouped by US regions east, central, mountain and pacific, and a choropleth 
#  choropleth map to showing the number of entries by US state.
#-------------------------------------------------------------------------------

# split long path name for PEP-8 compliance
source_file = 'https://raw.githubusercontent.com/plotly/Figure-Friday/refs/'
source_file += 'heads/main/2025/week-4/Post45_NEAData_Final.csv'

df = (
    pl.read_csv(source_file)
        .rename(
        {
            'family_name': 'LAST_NAME',
            'given_name_middle': 'FIRST_NAME',
            'us_state': 'STATE'
            }
        )
    .filter(pl.col('birth_year').is_not_null())
    .filter(pl.col('country') == 'USA')
    .filter(pl.col('STATE').is_in(all_states))
    .with_columns(
        WRITERS_AGE = (pl.col('nea_grant_year') - pl.col('birth_year'))
    )
    .with_columns(
        REGION = pl.when(pl.col('STATE').is_in(ne_states))
                    .then(pl.lit('NORTH_EAST'))
                    .when(pl.col('STATE').is_in(midwest_states))
                    .then(pl.lit('MID_WEST'))
                    .when(pl.col('STATE').is_in(southern_states))
                    .then(pl.lit('SOUTH'))
                    .when(pl.col('STATE').is_in(western_states))
                    .then(pl.lit('WEST'))
    )
    .with_columns(
        STATE_COUNT = pl.col('STATE').count().over('STATE'),
        REGION_COUNT = pl.col('REGION').count().over('REGION'),
        STATE_MEDIAN_AGE = pl.col('WRITERS_AGE').median().over('STATE'),
        REGION_MEDIAN_AGE = pl.col('WRITERS_AGE').median().over('REGION'),
    )
    # .with_columns(pl.col(['STATE_COUNT', 'REGION_COUNT']).drop_nulls())
    .select(
        pl.col(
            [
                'FIRST_NAME', 'LAST_NAME', 'WRITERS_AGE', 
                'STATE', 
                'STATE_COUNT', 
                'REGION', 
                'REGION_COUNT',
                'STATE_MEDIAN_AGE',
                'REGION_MEDIAN_AGE'
            ]
        )
    )
)

fig = px.scatter(
    df.sort('STATE_MEDIAN_AGE', descending=True),
    x='STATE',
    y='STATE_MEDIAN_AGE', 
    color='STATE',
    template='simple_white',
    height=500, width=800
   ) 
fig.update_traces(
    mode='lines+markers',
    line=dict(color='blue', width=5)
    # line_color='blue', 
    # width=5
    )
fig.update_traces(line_color='blue', selector=dict(type='scatter'))
fig.show()

fig = px.scatter(
    df.sort('REGION_MEDIAN_AGE', descending=True),
    x='REGION',
    y='REGION_MEDIAN_AGE', 
    color='REGION',
    template='simple_white',
    height=500, width=800
   ) 
fig.update_traces(
    mode='lines+markers+text',
    text='REGION'
    # line_color='blue', 
    # width=5
    )
fig.update_traces(line_color='blue', selector=dict(type='scatter'))
fig.show()

len(all_states) = 51
all_states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MS', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']


In [2]:
df['STATE'].value_counts().glimpse()

Rows: 50
Columns: 2
$ STATE <str> 'FL', 'KY', 'WI', 'OH', 'RI', 'AK', 'NE', 'SC', 'TX', 'PA'
$ count <u32> 51, 21, 37, 64, 21, 10, 12, 7, 89, 96



In [3]:
ne_states =[
    'CT', 'MA', 'ME', 'NH', 'NJ', 'NY', 'PA', 'RI', 'VT']
midwest_states = [
    'IA', 'IL', 'IN', 'KS', 'MI', 'MN', 'MS', 'ND', 'NE', 
    'OH', 'SD', 'WI']
southern_states = [
    'AL', 'AR', 'DE', 'FL', 'GA', 'KY', 'LA', 'MD', 'MS', 'NC', 'OK', 'SC',
    'TN', 'TX', 'VA', 'WV']
western_states = [
    'AK', 'AZ', 'CA', 'CO', 'HI', 'ID', 'MT', 'NM', 'NV',
    'OR', 'UT', 'WA', 'WY']

print(f'{len(ne_states) = }')
print(f'{len(midwest_states) = }')
print(f'{len(southern_states) = }')
print(f'{len(western_states) = }')
print(f'{sorted(ne_states) = }')
print(f'{sorted(midwest_states) = }')
print(f'{sorted(southern_states) = }')
print(f'{sorted(western_states) = }')

len(ne_states) = 9
len(midwest_states) = 12
len(southern_states) = 16
len(western_states) = 13
sorted(ne_states) = ['CT', 'MA', 'ME', 'NH', 'NJ', 'NY', 'PA', 'RI', 'VT']
sorted(midwest_states) = ['IA', 'IL', 'IN', 'KS', 'MI', 'MN', 'MS', 'ND', 'NE', 'OH', 'SD', 'WI']
sorted(southern_states) = ['AL', 'AR', 'DE', 'FL', 'GA', 'KY', 'LA', 'MD', 'MS', 'NC', 'OK', 'SC', 'TN', 'TX', 'VA', 'WV']
sorted(western_states) = ['AK', 'AZ', 'CA', 'CO', 'HI', 'ID', 'MT', 'NM', 'NV', 'OR', 'UT', 'WA', 'WY']
