In [2]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px



### EDA

In [3]:
data = pd.read_csv('./data/demographic_data.csv')
data

Unnamed: 0,County,State,State FIPS Code,County FIPS Code,FIPS,Total Population,Male Population,Female Population,Total Race Responses,White Alone,Black or African American Alone,Hispanic or Latino
0,Autauga County,Alabama,1,1,1001,59285,28669,30616,59285,43616,11829,2188
1,Baldwin County,Alabama,1,3,1003,239945,117316,122629,239945,198721,19144,13393
2,Barbour County,Alabama,1,5,1005,24757,12906,11851,24757,10891,11616,1490
3,Bibb County,Alabama,1,7,1007,22152,11824,10328,22152,16634,4587,744
4,Blount County,Alabama,1,9,1009,59292,29934,29358,59292,53062,747,5962
...,...,...,...,...,...,...,...,...,...,...,...,...
3217,Vega Baja Municipio,Puerto Rico,72,145,72145,54058,25765,28293,54058,13681,2249,53036
3218,Vieques Municipio,Puerto Rico,72,147,72147,8147,4178,3969,8147,1028,222,7803
3219,Villalba Municipio,Puerto Rico,72,149,72149,21778,10510,11268,21778,7552,2219,21700
3220,Yabucoa Municipio,Puerto Rico,72,151,72151,29868,14381,15487,29868,2001,5900,29732


In [4]:
state_data = data.groupby('State').agg({
  'State FIPS Code': 'first',
  'Total Population': 'sum',
  'Male Population': 'sum',
  'Female Population': 'sum',
  'White Alone': 'sum',
  'Black or African American Alone': 'sum',
  'Hispanic or Latino': 'sum'
}).reset_index()
state_data



Unnamed: 0,State,State FIPS Code,Total Population,Male Population,Female Population,White Alone,Black or African American Alone,Hispanic or Latino
0,Alabama,1,5054253,2453419,2600834,3303370,1318507,271640
1,Alaska,2,733971,385319,348652,445545,22774,52473
2,Arizona,4,7268175,3628694,3639481,4593653,336931,2255770
3,Arkansas,5,3032651,1495958,1536693,2148886,452127,265833
4,California,6,39242785,19605882,19636903,17248779,2173343,15630830
5,Colorado,8,5810774,2942568,2868206,4268784,232985,1291078
6,Connecticut,9,3598348,1765117,1833231,2431342,384753,640668
7,Delaware,10,1005872,487585,518287,621799,220645,107829
8,District of Columbia,11,672079,320001,352078,262549,290772,77760
9,Florida,12,21928881,10773620,11155261,13136701,3363769,5865737


In [5]:
state_data.dtypes

State                              object
State FIPS Code                     int64
Total Population                    int64
Male Population                     int64
Female Population                   int64
White Alone                         int64
Black or African American Alone     int64
Hispanic or Latino                  int64
dtype: object

In [6]:
state_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 8 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   State                            52 non-null     object
 1   State FIPS Code                  52 non-null     int64 
 2   Total Population                 52 non-null     int64 
 3   Male Population                  52 non-null     int64 
 4   Female Population                52 non-null     int64 
 5   White Alone                      52 non-null     int64 
 6   Black or African American Alone  52 non-null     int64 
 7   Hispanic or Latino               52 non-null     int64 
dtypes: int64(7), object(1)
memory usage: 3.4+ KB


In [7]:
state_data.describe

<bound method NDFrame.describe of                    State  State FIPS Code  Total Population  Male Population  \
0                Alabama                1           5054253          2453419   
1                 Alaska                2            733971           385319   
2                Arizona                4           7268175          3628694   
3               Arkansas                5           3032651          1495958   
4             California                6          39242785         19605882   
5               Colorado                8           5810774          2942568   
6            Connecticut                9           3598348          1765117   
7               Delaware               10           1005872           487585   
8   District of Columbia               11            672079           320001   
9                Florida               12          21928881         10773620   
10               Georgia               13          10822590          5281762   
11    

### CREATE HEATMAP

In [8]:
import pandas as pd
import plotly.express as px

data = pd.read_csv('./data/demographic_data.csv')

# Ensure formatted columns are numeric for the heatmap color mapping
formatted_columns = ['Total Population', 'Male Population', 'Female Population', 
                     'White Alone', 'Black or African American Alone', 'Hispanic or Latino']

for col in formatted_columns:
    state_data[col] = pd.to_numeric(state_data[col], errors='coerce').fillna(0).astype(int)

# Calculate Male and Female Population as a percentage of the Total Population
state_data['Male Population (%)'] = (state_data['Male Population'] / state_data['Total Population'] * 100).round(2)
state_data['Female Population (%)'] = (state_data['Female Population'] / state_data['Total Population'] * 100).round(2)

# Create a separate set of formatted hover data for better readability
for col in formatted_columns:
    state_data[f"{col} (Formatted)"] = state_data[col].apply(lambda x: f"{x:,}")

# Add formatted percentage columns for hover data
state_data['Male Population (%) (Formatted)'] = state_data['Male Population (%)'].apply(lambda x: f"{x}%")
state_data['Female Population (%) (Formatted)'] = state_data['Female Population (%)'].apply(lambda x: f"{x}%")

# Map state names to abbreviations
state_to_abbr = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL",
    "Indiana": "IN", "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA",
    "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI",
    "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT",
    "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ",
    "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND",
    "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA",
    "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD",
    "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT",
    "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI",
    "Wyoming": "WY"
}
state_data['State Abbreviation'] = state_data['State'].map(state_to_abbr)

# Create the heatmap
fig = px.choropleth(
    state_data,
    locations='State Abbreviation',  # Using abbreviations for location mapping
    locationmode='USA-states',
    color='Total Population',  # Numeric data for heatmap colors
    scope='usa',
    title='Heat Map of Total Population by State',
    color_continuous_scale='Viridis',
    hover_data={
        'State': True,
        'Total Population (Formatted)': True,
        'Male Population (%) (Formatted)': True,
        'Female Population (%) (Formatted)': True,
        'White Alone (Formatted)': True,
        'Black or African American Alone (Formatted)': True,
        'Hispanic or Latino (Formatted)': True,
    },
    labels={
        'State': 'State',
        'Total Population (Formatted)': 'Population',
        'Male Population (%) (Formatted)': 'Male Population (%)',
        'Female Population (%) (Formatted)': 'Female Population (%)',
        'White Alone (Formatted)': 'White Alone',
        'Black or African American Alone (Formatted)': 'Black or African American Alone',
        'Hispanic or Latino (Formatted)': 'Hispanic or Latino'
    }
)

fig.show(renderer='iframe')

In [9]:
total_male_population = state_data['Male Population'].sum()
total_female_population = state_data['Female Population'].sum()

# Prepare data for the pie chart
pie_data = {
    'Gender': ['Male', 'Female'],
    'Population': [total_male_population, total_female_population]
}

# Create the pie chart
fig = px.pie(
    pie_data,
    names='Gender',
    values='Population',
    title='Total Male Population vs Female Population in the USA',
    color_discrete_sequence=px.colors.sequential.Viridis
)

fig.update_traces(textinfo='percent+label')

fig.show(renderer='iframe')

In [10]:
# Helper function to get race data for a state or county
def get_race_data(state=None, county=None):
    if county:
        filtered_df = data[(data['State'] == state) & (data['County'] == county)]
    elif state:
        filtered_df = data[data['State'] == state]
    else:
        filtered_df = data

    race_totals = {
        'White Alone': filtered_df['White Alone'].sum(),
        'Black or African American Alone': filtered_df['Black or African American Alone'].sum(),
        'Hispanic or Latino': filtered_df['Hispanic or Latino'].sum(),
    }
    return list(race_totals.keys()), list(race_totals.values())

# Initialize the figure with data for the entire United States
initial_labels, initial_values = get_race_data()
fig = go.Figure(data=[go.Pie(labels=initial_labels, values=initial_values, name="All States")])

# Create a mapping of states to their county dropdowns
state_to_county_buttons = {}

# Generate dropdown options for counties based on the selected state
for state in data['State'].unique():
    state_labels, state_values = get_race_data(state=state)
    county_buttons = [
        dict(
            label="All Counties",
            method="update",
            args=[
                {"labels": [state_labels], "values": [state_values]},  # Update data
                {"title": f"Race per Population in {state} (All Counties)"}  # Update title
            ],
        )
    ]
    counties = data[data['State'] == state]['County'].unique()
    for county in counties:
        county_labels, county_values = get_race_data(state=state, county=county)
        county_buttons.append(
            dict(
                label=county,
                method="update",
                args=[
                    {"labels": [county_labels], "values": [county_values]},  # Update data
                    {"title": f"Race per Population in {county}, {state}"}  # Update title
                ],
            )
        )
    state_to_county_buttons[state] = county_buttons

# State dropdown
state_buttons = [
    dict(
        label="United States",
        method="update",
        args=[
            {"labels": [initial_labels], "values": [initial_values]},  # Update data
            {"title": "Race per Population in the United States"}  # Update title
        ],
    )
]

# Add state-level buttons
for state in data['State'].unique():
    state_buttons.append(
        dict(
            label=state,
            method="update",
            args=[
                {"labels": [state_to_county_buttons[state][0]['args'][0]['labels']],
                 "values": [state_to_county_buttons[state][0]['args'][0]['values']]},
                {"title": f"Race per Population in {state}"}
            ]
        )
    )

# Update layout with separate dropdowns
fig.update_layout(
    updatemenus=[
        # State dropdown
        dict(
            buttons=state_buttons,
            direction="down",
            showactive=True,
            x=0.2,  # Adjust position near the title
            xanchor="left",
            y=1.1,
            yanchor="top",
            pad={"r": 10, "t": 10},
            name="State",
        ),
        # County dropdown
        dict(
            buttons=state_to_county_buttons["California"],  # Default to California's counties
            direction="down",
            showactive=True,
            x=0.6,  # Adjust position near the state dropdown
            xanchor="left",
            y=1.1,
            yanchor="top",
            pad={"r": 10, "t": 10},
            name="County",
        ),
    ],
    title="Race per Population in the United States",
    title_x=0.27,  # Align title near dropdowns
)

fig.show(renderer='iframe')

### Conclusion

In [11]:
females_per_male = round(total_female_population / total_male_population, 2)
males_per_female = round(total_male_population / total_female_population, 2)
print(f"There is 1 male for every {females_per_male} females, and there is 1 female for every {males_per_female} males.")

There is 1 male for every 1.02 females, and there is 1 female for every 0.98 males.
