In [2]:
import polars as pl
import plotly.express as px

source_data = 'nation.csv'
df = (
    pl.scan_csv(
        'nation.csv',
        ignore_errors=True, 
        skip_rows=1
    )
    .select(
        DAM = pl.col('Dam Name'),
        LAT = pl.col('Latitude'),
        LONG = pl.col('Longitude'),
        STATE = pl.col('State'),
        COUNTY = pl.col('County'),
        CITY = pl.col('City'),
        WATERWAY = pl.col('River or Stream Name'),
        YEAR_COMP = pl.col('Year Completed'),
        DECADE_COMP = pl.col('Year Completed Category'),

        # storage statisics for group_by aggregations
        VOL_CUB_YDS = pl.col('Volume (Cubic Yards)'),
        NID_CAP_ACR_FT = pl.col('NID Storage (Acre-Ft)'),
        MAX_STG_ACR_FT = pl.col('Max Storage (Acre-Ft)'),
        NORM_STG_ACR_FT = pl.col('Normal Storage (Acre-Ft)'),
        DRAINAGE_SQ_MILES = pl.col('Drainage Area (Sq Miles)'),
        SURF_AREA_SQM = pl.col('Surface Area (Acres)'),
        MAX_DISCHRG_CUB_FT_SEC = pl.col('Max Discharge (Cubic Ft/Second)'),

    )
    .filter(pl.col('MAX_STG_ACR_FT').is_not_null())
    .filter(pl.col('MAX_STG_ACR_FT') > 0.0)
    # .filter(pl.col('CITY').str.to_uppercase().str.contains('SAN J'))
    .collect()
)


state_list = sorted(df['STATE'].unique().to_list())
print(f'{len(state_list) = }')
print(f'{state_list = }')
df.glimpse()

len(state_list) = 52
state_list = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
Rows: 88924
Columns: 16
$ DAM                    <str> 'Aichi Forge Usa Dam', 'Oxy Usa Retention Pond', 'Dod Usa 4', 'Dod Usa 2', 'Girl Scouts of Usa Dam', 'Rlf/Pawnee Mine/Slurry Impoundment 2 Dam', 'Exxonmobil Coal Usa/Mine 2/Recirculation Lake Dam', 'Exxonmobil Coal Usa/Mine 2/Freshwater Lake Dam', 'Rlf/Pawnee Mine/

In [3]:
fig=px.scatter(
    df['STATE'].value_counts().sort('count'),
    'STATE',
    'count'
)
fig.show()

In [4]:
state ='California'
dam_info = [
    'DAM','LAT','LONG','STATE','COUNTY','CITY','WATERWAY','YEAR_COMP',
]
dam_stats = [
  'VOL_CUB_YDS', 'NID_CAP_ACR_FT', 'MAX_STG_ACR_FT', 'NORM_STG_ACR_FT',
 'DRAINAGE_SQ_MILES', 'SURF_AREA_SQM', 'MAX_DISCHRG_CUB_FT_SEC'
 ]
df_state = (
    df
    .filter(pl.col('STATE') == state)
    .select(['STATE'] + dam_stats)
)
state_dam_count = df_state.height
df_state_stats = (
    df_state.group_by('STATE').agg(pl.col(dam_stats).sum())
    .transpose(
        include_header=True,
        header_name='STATISTIC',
    )
    .rename({'column_0': 'TOTAL'})
    .filter(pl.col('STATISTIC') != 'STATE')
    .with_columns(pl.col('TOTAL').cast(pl.Float64))
    .with_columns(AVERAGE = pl.col('TOTAL')/state_dam_count)
)
df_state_stats

STATISTIC,TOTAL,AVERAGE
str,f64,f64
"""VOL_CUB_YDS""",844385988.0,577160.620643
"""NID_CAP_ACR_FT""",83616000.0,57153.823308
"""MAX_STG_ACR_FT""",83609000.0,57148.936774
"""NORM_STG_ACR_FT""",55405000.0,37870.993917
"""DRAINAGE_SQ_MILES""",177064.68,121.028489
"""SURF_AREA_SQM""",1053100.0,719.807546
"""MAX_DISCHRG_CUB_FT_SEC""",13724434.0,9381.021189


In [12]:
(
    df
    .with_columns(
        DAM = pl.col('DAM').str.split(' ').list.slice(0, 5).list.join(' ')
    )
)

DAM,LAT,LONG,STATE,COUNTY,CITY,WATERWAY,YEAR_COMP,DECADE_COMP,VOL_CUB_YDS,NID_CAP_ACR_FT,MAX_STG_ACR_FT,NORM_STG_ACR_FT,DRAINAGE_SQ_MILES,SURF_AREA_SQM,MAX_DISCHRG_CUB_FT_SEC
str,f64,f64,str,str,str,str,i64,str,i64,f64,f64,f64,f64,f64,i64
"""Aichi Forge Usa Dam""",38.28828,-84.55619,"""Kentucky""","""Scott""","""DELAPLAIN-AREA""","""TR-DRY RUN""",1974,"""1970-1979""",,86.8,86.8,26.6,1.35,8.0,
"""Oxy Usa Retention Pond""",47.156385,-102.770612,"""North Dakota""","""Dunn""",,,,"""Undetermined""",,682.6,682.6,,,,
"""Dod Usa 4""",35.671912,-95.187291,"""Oklahoma""","""Muskogee""","""BRAGGS""","""TR-SAND CREEK""",1940,"""1940-1949""",0,50.0,50.0,25.0,,,20
"""Dod Usa 2""",35.703409,-95.136375,"""Oklahoma""","""Muskogee""","""ARROWHEAD POINT""","""TR-WEST SPANIARD CREEK""",1939,"""1930-1939""",0,50.0,50.0,25.0,,5.0,200
"""Girl Scouts of Usa Dam""",35.2356,-81.265,"""North Carolina""","""Gaston""",,"""Crowders Creek""",1982,"""1980-1989""",,18.0,18.0,12.0,0.12,2.0,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Foster Dam""",44.415614,-122.670161,"""Oregon""","""Linn""","""SWEETHOME""","""SOUTH SANTIAM RIVER""",1967,"""1960-1969""",1806000,61000.0,61000.0,56000.0,494.0,1220.0,195000
"""Lockport Lock and Controlling …",41.568934,-88.078054,"""Illinois""","""Will""","""JOLIET""","""CHICAGO SANITARY & SHIP CANAL""",1933,"""1930-1939""",0,25000.0,25000.0,0.0,740.0,2112.0,36000
"""Arkabutla Dam""",34.759214,-90.123745,"""Mississippi""","""DeSoto""","""TUNICA""","""COLDWATER RIVER""",1943,"""1940-1949""",4700000,1.3838e6,1.3838e6,31500.0,1000.0,5100.0,111000
"""Grapevine Dam""",32.969859,-97.056211,"""Texas""","""Tarrant""","""CARROLLTON""","""DENTON CREEK""",1952,"""1950-1959""",7070000,788000.0,788000.0,181000.0,688.0,7380.0,191310
