In [39]:

import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.io as pio
import sqlite3
import numpy as np
import json
import os

In [6]:
conn = sqlite3.connect("/Users/joeportnoy/Desktop/repos/Money-Talks-A-Demographic-Story/outputs/bls_wage_data.db")


In [7]:
# Raw SQL to list tables
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_names = cursor.fetchall()

# Print the list of table names
print("Available tables in the database:")
for name in table_names:
    print(name[0])


Available tables in the database:
earnings


In [8]:
df = pd.read_sql_query("SELECT * FROM earnings", conn)
df.head()



Unnamed: 0,year,month,Sex,Marital_Status,Race,Native_Country,Industry,Occupation,Education_Enrollment,FT/PT_Enrollment,Education_Level_Attained,Household_Member_Status,Weekly_Earnings,Weekly_Earnings_Categories,PWSSWGT
0,2014,jan,Male,Divorced,White only,United States,State government,Transportation and material moving occupations,,,Bachelor's degree,Adult civilian household member,1620.0,1500-1999,3561.0809
1,2014,jan,Female,Widowed,White only,United States,State government,Office and administrative support occupations,,,High school graduate,Adult civilian household member,162.0,<250,3322.0487
2,2014,jan,Female,Never married,White only,United States,State government,Community and social service occupations,Enrolled,Full time,Master's degree,Adult civilian household member,384.0,250-499,2671.0752
3,2014,jan,Female,Never married,Asian only,Thailand,State government,Community and social service occupations,Enrolled,Full time,Master's degree,Adult civilian household member,320.0,250-499,3012.71
4,2014,jan,Male,Married - spouse present,White only,El Salvador,State government,Transportation and material moving occupations,Not enrolled,,High school graduate,Adult civilian household member,650.0,500-749,3598.1304


In [None]:
# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Replace empty strings with NaNs uniformly
df.replace('', pd.NA, inplace=True)

# Check for missing data
df.isnull().sum()

# Check for duplicates rows
df.duplicated().sum()


df.head()

Unnamed: 0,year,month,Sex,Marital_Status,Race,Native_Country,Industry,Occupation,Education_Enrollment,FT/PT_Enrollment,Education_Level_Attained,Household_Member_Status,Weekly_Earnings,Weekly_Earnings_Categories,PWSSWGT
0,2014,jan,Male,Divorced,White only,United States,State government,Transportation and material moving occupations,,,Bachelor's degree,Adult civilian household member,1620.0,1500-1999,3561.0809
1,2014,jan,Female,Widowed,White only,United States,State government,Office and administrative support occupations,,,High school graduate,Adult civilian household member,162.0,<250,3322.0487
2,2014,jan,Female,Never married,White only,United States,State government,Community and social service occupations,Enrolled,Full time,Master's degree,Adult civilian household member,384.0,250-499,2671.0752
3,2014,jan,Female,Never married,Asian only,Thailand,State government,Community and social service occupations,Enrolled,Full time,Master's degree,Adult civilian household member,320.0,250-499,3012.71
4,2014,jan,Male,Married - spouse present,White only,El Salvador,State government,Transportation and material moving occupations,Not enrolled,,High school graduate,Adult civilian household member,650.0,500-749,3598.1304


In [10]:
print(df.columns.tolist())


['year', 'month', 'Sex', 'Marital_Status', 'Race', 'Native_Country', 'Industry', 'Occupation', 'Education_Enrollment', 'FT/PT_Enrollment', 'Education_Level_Attained', 'Household_Member_Status', 'Weekly_Earnings', 'Weekly_Earnings_Categories', 'PWSSWGT']


In [11]:

# Convert earnings to numeric
df['Weekly_Earnings'] = pd.to_numeric(df['Weekly_Earnings'], errors='coerce')


# Display the first few rows
df.head()


Unnamed: 0,year,month,Sex,Marital_Status,Race,Native_Country,Industry,Occupation,Education_Enrollment,FT/PT_Enrollment,Education_Level_Attained,Household_Member_Status,Weekly_Earnings,Weekly_Earnings_Categories,PWSSWGT
0,2014,jan,Male,Divorced,White only,United States,State government,Transportation and material moving occupations,,,Bachelor's degree,Adult civilian household member,1620.0,1500-1999,3561.0809
1,2014,jan,Female,Widowed,White only,United States,State government,Office and administrative support occupations,,,High school graduate,Adult civilian household member,162.0,<250,3322.0487
2,2014,jan,Female,Never married,White only,United States,State government,Community and social service occupations,Enrolled,Full time,Master's degree,Adult civilian household member,384.0,250-499,2671.0752
3,2014,jan,Female,Never married,Asian only,Thailand,State government,Community and social service occupations,Enrolled,Full time,Master's degree,Adult civilian household member,320.0,250-499,3012.71
4,2014,jan,Male,Married - spouse present,White only,El Salvador,State government,Transportation and material moving occupations,Not enrolled,,High school graduate,Adult civilian household member,650.0,500-749,3598.1304


In [12]:


# Filter for women
women_df = df[df['Sex'].str.lower() == 'female']

# Weighted average for women
weighted_salary_women = (women_df['Weekly_Earnings'] * women_df['PWSSWGT']).sum() / women_df['PWSSWGT'].sum()

# Filter for men
men_df = df[df['Sex'].str.lower() == 'male']

# Weighted average for men
weighted_salary_men = (men_df['Weekly_Earnings'] * men_df['PWSSWGT']).sum() / men_df['PWSSWGT'].sum()

# Overall weighted salary
weighted_salary_all = (df['Weekly_Earnings'] * df['PWSSWGT']).sum() / df['PWSSWGT'].sum()

# Display
print(f"Weighted Avg Weekly Salary (Women): ${weighted_salary_women:.2f}")
print(f"Weighted Avg Weekly Salary (Men): ${weighted_salary_men:.2f}")
print(f"Weighted Avg Weekly Salary (All): ${weighted_salary_all:.2f}")



Weighted Avg Weekly Salary (Women): $919.77
Weighted Avg Weekly Salary (Men): $1198.62
Weighted Avg Weekly Salary (All): $1064.30


In [13]:
# Difference: men - women
gender_pay_gap = weighted_salary_men - weighted_salary_women

# Display it nicely
print(f"Men earn ${gender_pay_gap:.2f} more than women per week on average (weighted).")


Men earn $278.85 more than women per week on average (weighted).


In [14]:
percent_gap = (gender_pay_gap / weighted_salary_women) * 100

print(f"That's a {percent_gap:.2f}% pay gap in favor of men.")


That's a 30.32% pay gap in favor of men.


In [15]:
# Group by gender and calculate statistics on Weekly_Earnings
gender_stats = df.groupby('Sex')['Weekly_Earnings'].agg([
    'mean', 
    'median', 
    lambda x: x.mode().iloc[0] if not x.mode().empty else pd.NA
])

gender_stats.columns = ['Mean', 'Median', 'Mode']

df

Unnamed: 0,year,month,Sex,Marital_Status,Race,Native_Country,Industry,Occupation,Education_Enrollment,FT/PT_Enrollment,Education_Level_Attained,Household_Member_Status,Weekly_Earnings,Weekly_Earnings_Categories,PWSSWGT
0,2014,jan,Male,Divorced,White only,United States,State government,Transportation and material moving occupations,,,Bachelor's degree,Adult civilian household member,1620.0,1500-1999,3561.0809
1,2014,jan,Female,Widowed,White only,United States,State government,Office and administrative support occupations,,,High school graduate,Adult civilian household member,162.0,<250,3322.0487
2,2014,jan,Female,Never married,White only,United States,State government,Community and social service occupations,Enrolled,Full time,Master's degree,Adult civilian household member,384.0,250-499,2671.0752
3,2014,jan,Female,Never married,Asian only,Thailand,State government,Community and social service occupations,Enrolled,Full time,Master's degree,Adult civilian household member,320.0,250-499,3012.7100
4,2014,jan,Male,Married - spouse present,White only,El Salvador,State government,Transportation and material moving occupations,Not enrolled,,High school graduate,Adult civilian household member,650.0,500-749,3598.1304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659201,2025,mar,Male,Married - spouse present,White only,United States,State government,Building and grounds cleaning and maintenance ...,Not enrolled,,Associate degree - occupational,Adult civilian household member,2880.0,2000+,602.3902
1659202,2025,mar,Female,Married - spouse present,White only,United States,Federal government,Healthcare practitioner and technical occupations,,,Master's degree,Adult civilian household member,3080.0,2000+,4064.6185
1659203,2025,mar,Male,Never married,White only,Guatemala,State government,Production occupations,Enrolled,Full time,11th grade,Adult civilian household member,72.0,<250,5009.6817
1659204,2025,mar,Male,Married - spouse present,Black only,United States,State government,Transportation and material moving occupations,Not enrolled,,High school graduate,Adult civilian household member,674.0,500-749,3058.1813


In [16]:
# Average weekly earnings by education level (sorted high to low)
df.groupby('Education_Enrollment')['Weekly_Earnings'].mean().sort_values(ascending=False)

df

Unnamed: 0,year,month,Sex,Marital_Status,Race,Native_Country,Industry,Occupation,Education_Enrollment,FT/PT_Enrollment,Education_Level_Attained,Household_Member_Status,Weekly_Earnings,Weekly_Earnings_Categories,PWSSWGT
0,2014,jan,Male,Divorced,White only,United States,State government,Transportation and material moving occupations,,,Bachelor's degree,Adult civilian household member,1620.0,1500-1999,3561.0809
1,2014,jan,Female,Widowed,White only,United States,State government,Office and administrative support occupations,,,High school graduate,Adult civilian household member,162.0,<250,3322.0487
2,2014,jan,Female,Never married,White only,United States,State government,Community and social service occupations,Enrolled,Full time,Master's degree,Adult civilian household member,384.0,250-499,2671.0752
3,2014,jan,Female,Never married,Asian only,Thailand,State government,Community and social service occupations,Enrolled,Full time,Master's degree,Adult civilian household member,320.0,250-499,3012.7100
4,2014,jan,Male,Married - spouse present,White only,El Salvador,State government,Transportation and material moving occupations,Not enrolled,,High school graduate,Adult civilian household member,650.0,500-749,3598.1304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659201,2025,mar,Male,Married - spouse present,White only,United States,State government,Building and grounds cleaning and maintenance ...,Not enrolled,,Associate degree - occupational,Adult civilian household member,2880.0,2000+,602.3902
1659202,2025,mar,Female,Married - spouse present,White only,United States,Federal government,Healthcare practitioner and technical occupations,,,Master's degree,Adult civilian household member,3080.0,2000+,4064.6185
1659203,2025,mar,Male,Never married,White only,Guatemala,State government,Production occupations,Enrolled,Full time,11th grade,Adult civilian household member,72.0,<250,5009.6817
1659204,2025,mar,Male,Married - spouse present,Black only,United States,State government,Transportation and material moving occupations,Not enrolled,,High school graduate,Adult civilian household member,674.0,500-749,3058.1813


In [17]:
# Group by both Sex and Education_Enrollment
education_enroll_stats = df.groupby(['Sex', 'Education_Level_Attained'])['Weekly_Earnings'].agg([
    'mean',
    'median',
    lambda x: x.mode().iloc[0] if not x.mode().empty else pd.NA
])
education_enroll_stats.columns = ['Mean', 'Median', 'Mode']
education_enroll_stats = education_enroll_stats.reset_index()
education_enroll_stats

Unnamed: 0,Sex,Education_Level_Attained,Mean,Median,Mode
0,Female,10th grade,328.181809,265.0,400.0
1,Female,11th grade,331.908053,270.0,300.0
2,Female,"12th grade, no diploma",447.437153,400.0,400.0
3,Female,1st-4th grade,450.504639,404.76,400.0
4,Female,5th-6th grade,445.850174,400.0,400.0
5,Female,7th-8th grade,421.401012,392.3,400.0
6,Female,9th grade,375.382009,336.0,400.0
7,Female,Associate degree - academic,804.375413,700.0,600.0
8,Female,Associate degree - occupational,766.590717,674.0,600.0
9,Female,Bachelor's degree,1142.449892,961.53,2884.61


In [26]:
education_enroll_stats.to_json('static/json/education_enrollment_wage.json')

In [18]:
# Average weekly earnings by occupation (top 10 highest paid)
df.groupby('Occupation')['Weekly_Earnings'].mean().sort_values(ascending=False).head(10)


Occupation
Legal occupations                                             1776.543337
Computer and mathematical occupations                         1707.519826
Architecture and engineering occupations                      1658.799285
Management occupations                                        1639.079652
Life, physical, and social science occupations                1476.870608
Business and financial operations occupations                 1460.925368
Healthcare practitioner and technical occupations             1318.087251
Arts, design, entertainment, sports, and media occupations    1147.051435
Education instruction and library occupations                 1055.164980
Installation, maintenance, and repair occupations             1050.005148
Name: Weekly_Earnings, dtype: float64

In [None]:
# Group by gender and occupation, and calculate mean, median, and mode for weighted weekly earnings

# Step 1: Drop rows with missing Weekly_Earnings or PWSSWGT
df_clean = df.dropna(subset=['Weekly_Earnings', 'PWSSWGT'])

# Step 2: Define weighted functions

def weighted_mean(group):
    return np.average(group['Weekly_Earnings'], weights=group['PWSSWGT'])

def weighted_median(group):
    sorted_group = group.sort_values('Weekly_Earnings')
    cumsum = sorted_group['PWSSWGT'].cumsum()
    cutoff = sorted_group['PWSSWGT'].sum() / 2
    return sorted_group.loc[cumsum >= cutoff, 'Weekly_Earnings'].iloc[0]

def weighted_mode(group):
    try:
        weights = group['PWSSWGT'].round().astype(int)
        repeated = group.loc[group.index.repeat(weights)]
        mode_series = repeated['Weekly_Earnings'].mode()
        return mode_series.iloc[0] if not mode_series.empty else pd.NA
    except:
        return pd.NA

# Step 3: Apply to groupby

occupation_stats = df_clean.groupby(['Sex', 'Occupation']).apply(
    lambda g: pd.Series({
        'Weighted_Mean': weighted_mean(g),
        'Weighted_Median': weighted_median(g),
        'Weighted_Mode': weighted_mode(g)
    })
).reset_index()

# Step 4: Preview the result

occupation_stats.head(10)




  occupation_stats = df_clean.groupby(['Sex', 'Occupation']).apply(


Unnamed: 0,Sex,Occupation,Weighted_Mean,Weighted_Median,Weighted_Mode
0,Female,Architecture and engineering occupations,1481.23981,1350.0,2884.61
1,Female,"Arts, design, entertainment, sports, and media...",1037.999851,870.0,2884.61
2,Female,Building and grounds cleaning and maintenance ...,463.595999,412.4,400.0
3,Female,Business and financial operations occupations,1341.284385,1153.84,2884.61
4,Female,Community and social service occupations,1029.497984,923.0,1000.0
5,Female,Computer and mathematical occupations,1543.0776,1384.0,2884.61
6,Female,Construction and extraction occupations,819.719865,700.0,600.0
7,Female,Education instruction and library occupations,1002.850587,884.61,1000.0
8,Female,"Farming, fishing, and forestry occupations",513.767046,464.0,400.0
9,Female,Food preparation and serving related occupations,419.812132,363.9,300.0


In [19]:
# Clean column names
df.columns = df.columns.str.strip()

# Helper functions
def weighted_mean(x):
    return (x['Weekly_Earnings'] * x['PWSSWGT']).sum() / x['PWSSWGT'].sum()

def weighted_median(data, weights):
    sorted_data, sorted_weights = zip(*sorted(zip(data, weights)))
    cum_weights = np.cumsum(sorted_weights)
    cutoff = sum(sorted_weights) / 2
    return sorted_data[np.searchsorted(cum_weights, cutoff)]

def weighted_mode(x, weights):
    df_temp = pd.DataFrame({'value': x, 'weight': weights})
    return df_temp.groupby('value')['weight'].sum().idxmax()

# ✅ Fixed group list
group = ['Sex', 'Occupation', 'Education_Enrollment']

# Group and apply stats
weighted_stats = df.groupby(group).apply(
    lambda g: pd.Series({
        'Weighted Mean': weighted_mean(g),
        'Weighted Median': weighted_median(g['Weekly_Earnings'], g['PWSSWGT']),
        'Weighted Mode': weighted_mode(g['Weekly_Earnings'], g['PWSSWGT'])
    })
).reset_index()

# Display results
weighted_stats.head()



  weighted_stats = df.groupby(group).apply(


Unnamed: 0,Sex,Occupation,Education_Enrollment,Weighted Mean,Weighted Median,Weighted Mode
0,Female,Architecture and engineering occupations,Enrolled,900.978352,720.0,600.0
1,Female,Architecture and engineering occupations,Not enrolled,1505.42615,1384.61,2884.61
2,Female,"Arts, design, entertainment, sports, and media...",Enrolled,408.018355,220.0,150.0
3,Female,"Arts, design, entertainment, sports, and media...",Not enrolled,1126.608647,961.53,2884.61
4,Female,Building and grounds cleaning and maintenance ...,Enrolled,275.443337,200.0,200.0


In [20]:
# Group by Occupation only
group = ['Occupation']

occupation_stats = df.groupby(group).apply(
    lambda g: pd.Series({
        'Weighted Mean': weighted_mean(g),
        'Weighted Median': weighted_median(g['Weekly_Earnings'], g['PWSSWGT']),
        'Weighted Mode': weighted_mode(g['Weekly_Earnings'], g['PWSSWGT'])
    })
).reset_index()

# Show result
occupation_stats.head()



  occupation_stats = df.groupby(group).apply(


Unnamed: 0,Occupation,Weighted Mean,Weighted Median,Weighted Mode
0,Architecture and engineering occupations,1699.997891,1538.46,2884.61
1,"Arts, design, entertainment, sports, and media...",1163.740594,980.0,2884.61
2,Building and grounds cleaning and maintenance ...,577.037888,500.0,600.0
3,Business and financial operations occupations,1506.398424,1270.0,2884.61
4,Community and social service occupations,1074.794449,952.75,1000.0


In [21]:
print(df['Sex'].unique())


['Male' 'Female']


In [22]:
# Step 1: Helper for weighted mean
def weighted_mean(x):
    return (x['Weekly_Earnings'] * x['PWSSWGT']).sum() / x['PWSSWGT'].sum()

# Step 2: Group by Occupation and Sex, compute weighted means
weighted_means = df.groupby(['Occupation', 'Sex']).apply(weighted_mean).unstack()

# Step 3: Capitalize column names (optional, for 'Male'/'Female')
weighted_means.columns.name = None
weighted_means = weighted_means.rename(columns=lambda x: x.capitalize())

# Step 4: Calculate the gender pay gap (Female - Male)
weighted_means['Gap'] = weighted_means['Female'] - weighted_means['Male']

# Step 5: Sort by the gap
gender_pay_gap = weighted_means.sort_values('Gap')

# Step 6: Show the result
gender_pay_gap


  weighted_means = df.groupby(['Occupation', 'Sex']).apply(weighted_mean).unstack()


Unnamed: 0_level_0,Female,Male,Gap
Occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Legal occupations,1498.487647,2156.503314,-658.015667
Healthcare practitioner and technical occupations,1239.922547,1695.505566,-455.583019
Sales and related occupations,689.555254,1131.923704,-442.36845
Management occupations,1466.456096,1863.439404,-396.983308
Business and financial operations occupations,1341.284385,1716.284824,-375.000438
Computer and mathematical occupations,1543.0776,1851.355906,-308.278306
Protective service occupations,805.198574,1108.155184,-302.95661
Education instruction and library occupations,1002.850587,1294.042587,-291.192
Production occupations,664.815188,933.564872,-268.749684
Architecture and engineering occupations,1481.23981,1742.10029,-260.86048


In [20]:
csv_path = "outputs/equity_gap.csv"
df.to_csv(csv_path)

In [24]:
# Load CSV
df = pd.read_csv("/Users/joeportnoy/Desktop/repos/Money-Talks-A-Demographic-Story/outputs/bls_data.csv")

# Ensure numeric types
df["Weekly Earnings"] = pd.to_numeric(df["Weekly Earnings"], errors="coerce")
df["PWSSWGT"] = pd.to_numeric(df["PWSSWGT"], errors="coerce")

# Drop rows with missing critical values
df.dropna(subset=["Weekly Earnings", "Sex", "year", "PWSSWGT"], inplace=True)

# Function to compute weighted stats
def weighted_stats(data):
    weights = data["PWSSWGT"]
    values = data["Weekly Earnings"]

    # Weighted mean
    mean = np.average(values, weights=weights)

    # Weighted median
    sorted_idx = np.argsort(values)
    sorted_vals = values.iloc[sorted_idx]
    sorted_weights = weights.iloc[sorted_idx]
    cumsum = np.cumsum(sorted_weights)
    cutoff = sorted_weights.sum() / 2.0
    median = sorted_vals[cumsum >= cutoff].iloc[0]

    return {
        "weighted_mean": mean
    }

# Loop over years and sex
results = []
for year in sorted(df["year"].unique()):
    for sex in ["Male", "Female", "All"]:
        if sex == "All":
            group = df[df["year"] == year]
        else:
            group = df[(df["year"] == year) & (df["Sex"] == sex)]

        if not group.empty:
            stats = weighted_stats(group)
            stats["year"] = year
            stats["sex"] = sex
            results.append(stats)

# Create and show DataFrame
summary_df = pd.DataFrame(results)

summary_df

Unnamed: 0,weighted_mean,year,sex
0,974.979698,2014,Male
1,736.564141,2014,Female
2,860.063889,2014,All
3,1004.266437,2015,Male
4,757.607397,2015,Female
5,885.345482,2015,All
6,1027.808794,2016,Male
7,778.277915,2016,Female
8,907.690165,2016,All
9,1057.689582,2017,Male


In [26]:
# Filter to years 2014–2024
filtered_df = summary_df[summary_df['year'].between(2014, 2024)]

# Pivot to wide format
pivot_df = filtered_df.pivot(index='year', columns='sex', values='weighted_mean').reset_index()

# Calculate overall average (simple mean of male and female)
pivot_df['Overall'] = pivot_df[['Male', 'Female']].mean(axis=1)

# Create Plotly figure
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=pivot_df['year'],
    y=pivot_df['Male'],
    mode='lines+markers',
    name='Male',
    line=dict(color='green')
))

fig.add_trace(go.Scatter(
    x=pivot_df['year'],
    y=pivot_df['Female'],
    mode='lines+markers',
    name='Female',
    line=dict(color='yellow')
))

fig.add_trace(go.Scatter(
    x=pivot_df['year'],
    y=pivot_df['Overall'],
    mode='lines+markers',
    name='Overall',
    line=dict(color='blue')
))

# Layout
fig.update_layout(
    title='Weighted Weekly Earnings Over Time (2014–2024)',
    xaxis_title='Year',
    yaxis_title='Weekly Earnings ($)',
    template='plotly_white'
)

fig.show()

In [32]:
# Pivot and calculate averages as before
filtered_df = summary_df[summary_df['year'].between(2014, 2024)]
pivot_df = filtered_df.pivot(index='year', columns='sex', values='weighted_mean').reset_index()
pivot_df['Overall'] = pivot_df[['Male', 'Female']].mean(axis=1)

# Create the figure
fig = go.Figure()
fig.add_trace(go.Scatter(x=pivot_df['year'], y=pivot_df['Male'], mode='lines+markers', name='Male', line=dict(color='green')))
fig.add_trace(go.Scatter(x=pivot_df['year'], y=pivot_df['Female'], mode='lines+markers', name='Female', line=dict(color='yellow')))
fig.add_trace(go.Scatter(x=pivot_df['year'], y=pivot_df['Overall'], mode='lines+markers', name='Overall', line=dict(color='blue')))

# Export to JSON for use in JS
pio.write_json(fig, "/Users/joeportnoy/Desktop/repos/Money-Talks-A-Demographic-Story/static/json/earnings_chart.json")

In [42]:
# Load and clean the data
df = pd.read_csv("/Users/joeportnoy/Desktop/repos/Money-Talks-A-Demographic-Story/outputs/bls_data.csv")
df = df.dropna(subset=['Weekly Earnings', 'Occupation', 'Sex', 'PWSSWGT'])

results = []
occupations = df['Occupation'].unique()

# Calculate weighted averages
for occupation in occupations:
    filtered = df[df['Occupation'] == occupation]

    def weighted_mean(group):
        return round((group['Weekly Earnings'] * group['PWSSWGT']).sum() / group['PWSSWGT'].sum(), 2)

    male_avg = weighted_mean(filtered[filtered['Sex'] == 'Male'])
    female_avg = weighted_mean(filtered[filtered['Sex'] == 'Female'])
    overall_avg = round((male_avg + female_avg) / 2, 2)

    results.append({
        "occupation": occupation,
        "male_avg": male_avg,
        "female_avg": female_avg,
        "overall_avg": overall_avg
    })

# Ensure output directory exists
os.makedirs("../static/json", exist_ok=True)

# Write JSON file
with open("../static/json/compare_wage.json", "w") as f:
    json.dump(results, f, indent=2)

In [33]:

fig1 = px.bar(
    gender_pay_gap.reset_index(),
    x='Gap',
    y='Occupation',
    orientation='h',
    color='Gap',
    color_continuous_scale='RdBu',
    title='Gender Pay Gap by Occupation (Male - Female)',
    labels={'Gap': 'Pay Gap ($)'}
)

fig1.update_layout(yaxis={'categoryorder': 'total ascending'})
fig1.show()


In [34]:
df_compare = weighted_means[['Male', 'Female']].reset_index()

fig2 = px.bar(
    df_compare.melt(id_vars='Occupation', value_vars=['Male', 'Female'], var_name='Gender', value_name='Earnings'),
    x='Earnings',
    y='Occupation',
    color='Gender',
    barmode='group',
    title='Male vs. Female Weekly Earnings by Occupation (Weighted)'
)

fig2.update_layout(yaxis={'categoryorder': 'total ascending'})
fig2.show()

In [None]:
summary_df.to_csv("outputs/avg_wage_data_over_time.csv")

In [None]:
conn.close()