### Friday week 30 - Rural Investments, USA July 31, 2024 - EXPLORATORY DATA ANALYSIS

In [1]:
'''
Create horizontal bar charts of rural investments by program and by program area
'''
import polars as pl   # dataframe library
import plotly.express as px

#------------------------------------------------------------------------------#
#     Functions                                                                #
#------------------------------------------------------------------------------#
def add_annotation(fig, annotation, x, y, align, xanchor, yanchor):
    '''
    Add a text-only note (no arrow) to a figure and return the figure.

    fig              -- object exposing add_annotation(**kwargs)
    annotation       -- text to display
    x, y             -- position; both are sent with xref/yref set to 'paper'
    align            -- forwarded unchanged as the annotation's align
    xanchor, yanchor -- forwarded unchanged as the annotation's anchors

    The font is fixed at size 14, color 'darkslategray'.
    '''
    note_settings = {
        'text': annotation,
        'showarrow': False,
        'xref': 'paper',
        'x': x,
        'yref': 'paper',
        'y': y,
        'align': align,
        'xanchor': xanchor,
        'yanchor': yanchor,
        'font': {'size': 14, 'color': 'darkslategray'},
    }
    fig.add_annotation(**note_settings)
    return fig

def update_layout(
        fig, my_title, my_height, my_width,              # mandatory parameters
        my_xtitle='', my_ytitle='', my_legend_title='',  # optional parameters
        my_showlegend=False
    ):
    '''
    Apply title, axis/legend titles, size and margins to a figure.

    All settings are forwarded in one fig.update_layout(...) call. The margin
    is fixed at 50 on every side and autosize is switched off so the given
    height and width are used. Returns the fig that was passed in.
    '''
    layout_settings = {
        'title': my_title,
        'xaxis_title': my_xtitle,
        'yaxis_title': my_ytitle,
        'legend_title': my_legend_title,
        'height': my_height,
        'width': my_width,
        'margin': {"r":50, "t":50, "l":50, "b":50},
        'autosize': False,
        'showlegend': my_showlegend,
    }
    fig.update_layout(**layout_settings)
    return fig

def make_color_dict(my_categories, my_colors=None):
    '''
    Map each distinct category to a color.

    my_categories -- iterable of hashable, mutually sortable values;
                     duplicates are collapsed
    my_colors     -- optional sequence of colors; when omitted the
                     px.colors.qualitative.Dark24 palette is used

    Categories are assigned colors in sorted order. When there are more
    categories than colors the palette is reused from the start, so every
    category gets an entry instead of the surplus being silently dropped.
    Returns a dict {category: color}; it is empty if either input is empty.
    '''
    if my_colors is None:
        palette = px.colors.qualitative.Dark24
    else:
        palette = list(my_colors)
    if not palette:
        return {}
    palette_size = len(palette)
    return {
        category: palette[position % palette_size]
        for position, category in enumerate(sorted(set(my_categories)))
    }
    
#------------------------------------------------------------------------------#
#     Create dataframes                                                        #
#------------------------------------------------------------------------------#
# df_csv is the shared source for both grouped (pareto) dataframes.
# The CSV is read straight from the Figure-Friday GitHub repository with
# ignore_errors switched on.
df_csv = pl.read_csv(
    'https://raw.githubusercontent.com/plotly/Figure-Friday/' +
    'main/2024/week-30/rural-investments.csv',
    ignore_errors = True
).with_columns(
    # remove the commas from the dollar strings, then cast them to floats
    pl.col('Investment Dollars').str.replace_all(',', '').cast(pl.Float64)
)

# One row per Program Area: number of investments, total and average dollars.
investment_by_program_area = (
    df_csv
    .group_by('Program Area')
    .agg(
        Count = pl.col('Program Area').count(),
        Total_Invest = pl.col('Investment Dollars').sum(),
        Avg_Invest = pl.col('Investment Dollars').mean(),
    )
    .select(
        'Count',
        'Total_Invest',
        'Avg_Invest',
        # two trailing spaces keep each tick label from touching its bar;
        # the padded label stays under the original column name, placed last
        (pl.col('Program Area') + '  ').alias('Program Area'),
    )
    .sort('Count', descending = True)
)
# color lookup keyed by the padded Program Area labels
program_area_cm = make_color_dict(investment_by_program_area['Program Area'])


# One row per Program (top 12 by count): number of investments, total and
# average dollars, with long program names replaced by short chart labels.
investment_by_program = (
    df_csv
    .group_by('Program')
    .agg(
        Count = pl.col('Program Area').count(),
        Total_Invest = pl.col('Investment Dollars').sum(),
        Avg_Invest = pl.col('Investment Dollars').mean(),
    )
    .sort('Count', descending = True)
    .head(12)   # keep the 12 Programs with the highest counts
    # Clean up/shorten program names in a single pass. The prefixes do not
    # overlap, so one when/then chain gives the same labels as applying the
    # rules one after another. The fallback is an explicit column reference
    # so it cannot be mistaken for the literal text 'Program'.
    .with_columns(
        Program = pl.when(pl.col('Program').str.starts_with('Rural Energy'))
                    .then(pl.lit('Rural Energy'))
                    .when(pl.col('Program').str.starts_with('Business and Industry'))
                    .then(pl.lit('B & I Loan Guar. (BP)'))
                    .when(pl.col('Program').str.starts_with('Higher Blends'))
                    .then(pl.lit('Higher Blends Infra. Incent.'))
                    .when(pl.col('Program').str.starts_with('Rural Disaster'))
                    .then(pl.lit('Disaster Grants (SFH)'))
                    .when(pl.col('Program').str.starts_with('Advanced Biofuel Payment'))
                    .then(pl.lit('Advanced Biofuel'))
                    .otherwise(pl.col('Program'))
    )
    .with_columns(Program = (pl.col('Program') + '  '))  # spaces added to labels
)
# color lookup keyed by the padded Program labels
program_cm = make_color_dict(investment_by_program['Program'])


#------------------------------------------------------------------------------#
#     Pareto: data grouped by Program Area                                     #
#------------------------------------------------------------------------------#
print('\n\n\n\n\n')   # blank lines to separate the chart from earlier output

# Horizontal bar chart: one bar per Program Area, bar length = Count.
fig = px.bar(
    investment_by_program_area.sort('Count', descending=True),
    x = 'Count', y = 'Program Area',
    template='plotly_white',
    color='Program Area',
    color_discrete_map=program_area_cm,
    custom_data=['Count','Total_Invest','Avg_Invest'],
    orientation='h'
)

# add custom hover information (count, total and average from custom_data)
fig.update_traces(
    hovertemplate="<br>".join([
        'Investment Count %{customdata[0]:,d}',
        'Total $%{customdata[1]:,d}',
        'Average $%{customdata[2]:,d}',
        '<extra></extra>'
    ])
)
fig = update_layout(
    fig, 'USA Rural Investment by Program Area - 2024', 250, 1000,
    my_ytitle= 'Program Area',
    my_xtitle= 'Count',
    # legend is hidden, but its title now names the column used for color
    my_legend_title='Program Area',
    my_showlegend=False
)
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

annotation = '<b>Data Source:</b> US Department of Agriculture<br>'
add_annotation(fig, annotation, 0.9, 0.6, 'left', 'right', 'top')

# The bars are horizontal, so the categories sit on the y axis and the
# category order has to be set there; setting it on the numeric x axis
# has no effect.
# NOTE(review): on a y axis 'total descending' presumably lists the largest
# total from the bottom up - use 'total ascending' if the largest bar should
# be at the top; confirm visually.
fig.update_yaxes(categoryorder='total descending')

fig.show()
print('\n\n\n\n\n')   # blank lines before the next chart

#------------------------------------------------------------------------------#
#     Pareto: data grouped by Program                                          #
#------------------------------------------------------------------------------#
# Horizontal bar chart: one bar per Program (top 12), bar length = Count.
fig = px.bar(
    investment_by_program.sort('Count', descending=True),
    x = 'Count', y = 'Program',
    template='plotly_white',
    color='Program',
    color_discrete_map=program_cm,
    custom_data=['Count','Total_Invest','Avg_Invest'],
    orientation='h',
)

# add custom hover information (count, total and average from custom_data)
fig.update_traces(
    hovertemplate="<br>".join([
        'Investment Count %{customdata[0]:,d}',
        'Total $%{customdata[1]:,d}',
        'Average $%{customdata[2]:,d}',
        '<extra></extra>'
    ])
)
fig = update_layout(
    fig, 'USA Rural Investment by Program - 2024 (Top 12)', 400, 1000,
    my_ytitle= 'Program',
    my_xtitle= 'Count',
    # legend is hidden, but its title now names the column used for color
    my_legend_title='Program',
    my_showlegend=False
)
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

annotation = '<b>Data Source:</b> US Department of Agriculture<br>'
add_annotation(fig, annotation, 0.9, 0.6, 'left', 'right', 'top')

fig.show()


















In [2]:
#  Print a 3-line comment banner: the title is centered in a 78-character
#  field between two '#'-capped rules of 78 dashes.
my_banner = 'Create choropleth map'
print(
    f'{"#"}{"-"*78}{"#"}',
    f'{"#"}{my_banner:^78}{"#"}',
    f'{"#"}{"-"*78}{"#"}',
    sep='\n',
)

#------------------------------------------------------------------------------#
#                            Create choropleth map                             #
#------------------------------------------------------------------------------#


In [3]:
#  Print a 3-line comment banner: the title is left-aligned after a 5-space
#  indent and padded to 73 characters, between two '#'-capped rules.
my_banner = 'Clean up (shorten) long names'
print(
    f'{"#"}{"-"*78}{"#"}',
    f'{"#"}{" "*5}{my_banner:<73}{"#"}',
    f'{"#"}{"-"*78}{"#"}',
    sep='\n',
)

#------------------------------------------------------------------------------#
#     Clean up (shorten) long names                                            #
#------------------------------------------------------------------------------#
