In [1]:
import os
import sys
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.font_manager
from mizani.transforms import log_trans
from mizani.formatters import percent_format
from mizani.formatters import log_format
from plotnine import *
from scipy.stats import norm
import statsmodels.api as sm
import statsmodels.formula.api as smf

from stargazer.stargazer import Stargazer

warnings.filterwarnings("ignore")


In [3]:
# Load the raw survey extract from the working directory.
# NOTE(review): the file name suggests a 2014 MORG employment extract -- confirm provenance.
xc = pd.read_csv(filepath_or_buffer="morg-2014-emp.csv")

In [6]:
# Restrict the sample to a single occupation (occ2012 code 10).
# NOTE(review): presumably code 10 is "chief executives" in the 2012 Census
# occupation scheme -- confirm against the codebook.
# .copy() makes the subset an independent frame, so adding columns to it later
# is a plain assignment rather than a chained assignment on a slice (which
# would raise SettingWithCopyWarning if warnings were not silenced above).
xc = xc.loc[lambda x: x["occ2012"] == 10].copy()

In [8]:
# Mean weekly earnings per gender; sex code 1 is treated as men and 2 as women
average_earnings_men = xc.loc[xc['sex'].eq(1), 'earnwke'].mean()
average_earnings_women = xc.loc[xc['sex'].eq(2), 'earnwke'].mean()

# Raw (unconditional) gap in dollars, and the same gap relative to men's mean
wage_gap = average_earnings_men - average_earnings_women
wage_gap_percentage = wage_gap / average_earnings_men * 100

# Report all four figures, one per line
print(
    f"Average Earnings for Men: ${average_earnings_men:.2f}",
    f"Average Earnings for Women: ${average_earnings_women:.2f}",
    f"Unconditional Gender Wage Gap: ${wage_gap:.2f}",
    f"Gender Wage Gap as a Percentage of Men's Earnings: {wage_gap_percentage:.2f}%",
    sep="\n",
)

Average Earnings for Men: $2109.86
Average Earnings for Women: $1765.52
Unconditional Gender Wage Gap: $344.34
Gender Wage Gap as a Percentage of Men's Earnings: 16.32%


In [10]:
# Long-format table: mean weekly earnings for every (sex, education level) pair
grouped_data = xc.groupby(['sex', 'grade92'], as_index=False)['earnwke'].mean()

# Wide-format table: one row per education level, one column per sex code
pivot_data = (
    grouped_data
    .set_index(['grade92', 'sex'])['earnwke']
    .unstack('sex')
)

# Gap within each education level (column 1 is treated as men, 2 as women),
# in dollars and as a percentage of men's mean earnings at that level
pivot_data = pivot_data.assign(
    wage_gap=lambda wide: wide[1] - wide[2],
    wage_gap_percentage=lambda wide: (wide['wage_gap'] / wide[1]) * 100,
)

# Show the full comparison table
print(pivot_data)


sex                1            2    wage_gap  wage_gap_percentage
grade92                                                           
39       1476.703333  1175.127813  301.575521            20.422214
40       1723.307600  1310.155102  413.152498            23.974391
41       1512.011765   768.986667  743.025098            49.141489
42       1651.268400  1334.531875  316.736525            19.181408
43       2181.461398  1833.220274  348.241124            15.963662
44       2326.182793  2180.694881  145.487912             6.254363
45       2187.759412  2125.914615   61.844796             2.826855
46       2171.434138  2228.996000  -57.561862            -2.650868


In [11]:
# Conditional wage gap: OLS of weekly earnings on a female dummy and education.
#
# Rows are dropped on the raw `sex` column *before* the dummy is derived.
# Building the dummy first would turn a missing `sex` into female == 0, i.e.
# silently classify it as male, and a later dropna on `female` could never
# remove such rows.
# The result is built with dropna(...).assign(...), so `xc` is rebound to a
# fresh frame instead of having a column written into a filtered slice.
xc = (
    xc
    .dropna(subset=['earnwke', 'sex', 'grade92'])
    .assign(female=lambda d: (d['sex'] == 2).astype('int64'))  # 1 = female (sex code 2), 0 otherwise
)

# Design matrix: intercept + female dummy + education code.
# NOTE(review): grade92 enters linearly as its numeric code; the grouped table
# above shows codes 39-46, which look like categories -- consider dummies.
X = sm.add_constant(xc[['female', 'grade92']])
y = xc['earnwke']  # dependent variable: weekly earnings

# Fit with the default (non-robust) covariance and print the regression table
model = sm.OLS(y, X).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                earnwke   R-squared:                       0.146
Model:                            OLS   Adj. R-squared:                  0.145
Method:                 Least Squares   F-statistic:                     108.8
Date:                Mon, 13 Nov 2023   Prob (F-statistic):           2.46e-44
Time:                        16:38:16   Log-Likelihood:                -10247.
No. Observations:                1274   AIC:                         2.050e+04
Df Residuals:                    1271   BIC:                         2.051e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4705.7274    532.728     -8.833      0.0