In [None]:
#Setup and helper tools
!pip -q install wooldridge statsmodels linearmodels pandas numpy

import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import t
from wooldridge import data as wdata

def ols_hc3(df, y, Xlist):
    """Fit OLS of ``df[y]`` on ``df[Xlist]`` plus a constant, using HC3 covariance.

    Parameters
    ----------
    df : data frame holding the dependent variable and the regressors.
    y : name of the dependent-variable column.
    Xlist : list of regressor column names.

    Returns
    -------
    The fitted statsmodels results object (``cov_type='HC3'``).
    """
    design = sm.add_constant(df[Xlist])
    # missing='drop' asks statsmodels to discard observations containing NaN.
    return sm.OLS(df[y], design, missing='drop').fit(cov_type='HC3')

def print_line():
    """Print a 78-character rule of dashes to separate sections of output."""
    print("".ljust(78, "-"))

def ci_from_est_se(est, se, df_resid, alpha=0.05):
    """Two-sided (1 - alpha) confidence interval based on the Student t distribution.

    Parameters
    ----------
    est : point estimate.
    se : standard error of the estimate.
    df_resid : degrees of freedom used for the t critical value.
    alpha : significance level (default 0.05, i.e. a 95% interval).

    Returns
    -------
    (lower bound, upper bound, t critical value)
    """
    from scipy.stats import t as student_t

    crit = student_t.ppf(1 - alpha/2, df_resid)
    margin = crit*se  # half-width of the interval
    return est - margin, est + margin, crit

def var_lin_combo(vcov, a):
    """Variance of the linear combination a'b, given the covariance matrix of b.

    Parameters
    ----------
    vcov : (k, k) array-like covariance matrix of the coefficient vector.
    a : length-k weights; a 1-D array, a column/row vector, or a plain
        sequence are all accepted.

    Returns
    -------
    float : a' vcov a.

    Raises
    ------
    ValueError
        If ``vcov`` is not a k-by-k matrix matching the length of ``a``.
    """
    # Flatten the weights so the quadratic form is a true scalar. Calling
    # float() on the 1x1 array produced by a column vector is deprecated in
    # recent NumPy, and a plain list has no .T attribute at all.
    weights = np.asarray(a, dtype=float).reshape(-1)
    cov = np.asarray(vcov, dtype=float)
    if cov.shape != (weights.size, weights.size):
        raise ValueError(
            f"vcov has shape {cov.shape}, expected "
            f"({weights.size}, {weights.size}) to match the weights"
        )
    return float(weights @ cov @ weights)

def pretty_pct(x, digits=3):
    """Render a proportion as a percentage string with ``digits`` decimals.

    Example: ``pretty_pct(0.25)`` gives ``'25.000%'``.
    """
    scaled = 100*x
    return format(scaled, f".{digits}f") + "%"

In [None]:
#C3)
# Load HPRICE1, keep rows where the three needed columns are present,
# and build the log-price dependent variable.
hprice1 = (
    wdata('hprice1')
    .dropna(subset=['price', 'sqrft', 'bdrms'])
    .assign(lprice=lambda frame: np.log(frame['price']))
)

m_c3 = ols_hc3(hprice1, 'lprice', ['sqrft', 'bdrms'])
print_line()
print("C3) HPRICE1: log(price) ~ sqrft + bdrms  (HC3 robust SE)", m_c3.summary(), sep="\n")

#(i) theta1 = 150*beta_sqrft + beta_bdrms as a linear combination a'b of the coefficients
b = m_c3.params
V = m_c3.cov_params()
a = np.array([0, 150, 1])  # weights on [const, sqrft, bdrms]
theta1_hat = float(a @ b.to_numpy())
se_theta1 = np.sqrt(var_lin_combo(V.to_numpy(), a))  # sqrt(a' V a)
ci_lo, ci_hi, tcrit = ci_from_est_se(theta1_hat, se_theta1, m_c3.df_resid, alpha=0.05)

print_line()
print("\n".join([
    "C3 (i): θ1 = 150*β_sqrft + β_bdrms  (percent change, because dep var is log-price)",
    f"Estimate θ1̂ = {theta1_hat:.4f}   (SE = {se_theta1:.4f})",
    f"95% CI for θ1: [{ci_lo:.4f}, {ci_hi:.4f}]",
    "Interpretation: Adding a 150-sqft bedroom is associated with approximately "
    + f"{pretty_pct(theta1_hat)} change in price. (Because dependent variable is log(price).)",
]))

#(ii) Show the algebra that makes theta1 the coefficient on bdrms
print_line()
print("\n".join([
    "C3 (ii) Reparameterization:",
    "log(price) = β0 + β_sqrft*sqrft + β_bdrms*bdrms + u",
    "          = β0 + β_sqrft*sqrft + (θ1 - 150*β_sqrft)*bdrms + u",
    "          = β0 + θ1*bdrms + β_sqrft*(sqrft - 150*bdrms) + u",
    "=> Estimating this form gives the same θ1̂ and SE as the delta-method above.",
]))

#(iii) SE(θ1̂) and the 95% CI were already obtained in part (i) from a'Va.


------------------------------------------------------------------------------
C3) HPRICE1: log(price) ~ sqrft + bdrms  (HC3 robust SE)
                            OLS Regression Results                            
Dep. Variable:                 lprice   R-squared:                       0.588
Model:                            OLS   Adj. R-squared:                  0.579
Method:                 Least Squares   F-statistic:                     45.93
Date:                Thu, 23 Oct 2025   Prob (F-statistic):           2.99e-14
Time:                        18:11:10   Log-Likelihood:                 19.592
No. Observations:                  88   AIC:                            -33.18
Df Residuals:                      85   BIC:                            -25.75
Df Model:                           2                                         
Covariance Type:                  HC3                                         
                 coef    std err          z      P>|z|      [0.025      0.

In [None]:
#C3 Written Answers
#(i) Holding the number of bedrooms constant, each additional square foot is associated
#...with an approximate 0.04% increase
#...in house price, and the p-value (0.000) indicates this effect is highly statistically
#... significant.
#bdrms (0.0289): holding house size constant, each additional bedroom is associated
#... with about a 2.89% higher price,
#...but the p-value (0.418) shows this effect is not statistically significant at the 5% level.
#(ii) Adding a 150-sqft bedroom (one more bedroom AND 150 more square feet of total size)
#... increases predicted price by about 8.58%, since θ1 = 150*β_sqrft + β_bdrms.
#(iii) Since the confidence interval does not include 0, θ₁ is statistically significant at the 5% level
#...and adding
#... a typical 150-sqft bedroom is associated with an 8.6% increase in price, with a 95% confidence
#... interval from 2.4% to 14.7%.

In [None]:
#C8)
# scipy.stats.t is imported once in the setup cell, so it is not re-imported here.
d401 = wdata('401ksubs').copy()

# Keep single-person households only, with the three variables used below.
# nettfa is renamed to netfa for the regression output.
singles = (
    d401.query('fsize == 1')[['nettfa', 'inc', 'age']]
        .dropna()
        .rename(columns={'nettfa': 'netfa'})
        .copy()
)

#(i) Sample size
n_singles = singles.shape[0]

#(ii) Multiple regression of net financial assets on income and age
m_c8 = ols_hc3(singles, 'netfa', ['inc', 'age'])

#(iii) Intercept meaning: will discuss below (print estimate)
#(iv) Test H0: B2 = 1 vs H1: B2 < 1 (so, one-sided)
b2 = m_c8.params['age']; se_b2 = m_c8.bse['age']
t_stat = (b2 - 1)/se_b2
pval_one_sided = t.cdf(t_stat, df=m_c8.df_resid)  # left tail, because H1 is beta_age < 1

#(v) Simple regression on income only, to compare the income coefficient
m_c8_simple = ols_hc3(singles, 'netfa', ['inc'])

# Section separators use the shared print_line() helper, like the other cells.
print_line()
print("C8) 401KSUBS – single-person households (fsize==1)")
print(f"(i) Number of single-person households: {n_singles}")
print_line()
print("C8 (ii) OLS: netfa = b0 + b1*inc + b2*age  (HC3 robust SE)")
print(m_c8.summary())
print_line()
print("C8 (iii) Intercept note:")
print("At inc=0 and age=0, predicted net financial assets equal the intercept;")
print("this point is outside the support of the data, so it's not economically meaningful.")
print_line()
print("C8 (iv) Test H0: beta_age = 1  vs  H1: beta_age < 1 (one-sided)")
print(f"t = {t_stat:.3f},  p(one-sided) = {pval_one_sided:.4f}")
print("Decision at 1% level:", "Reject H0" if pval_one_sided < 0.01 else "Fail to reject H0")
print_line()
print("C8 (v) Simple regression: netfa ~ inc (HC3 SE)")
print(m_c8_simple.summary())
print("Compare b1 here to the multiple model to discuss the role of age.")


------------------------------------------------------------------------------
C8) 401KSUBS – single-person households (fsize==1)
(i) Number of single-person households: 2017
------------------------------------------------------------------------------
C8 (ii) OLS: netfa = b0 + b1*inc + b2*age  (HC3 robust SE)
                            OLS Regression Results                            
Dep. Variable:                  netfa   R-squared:                       0.119
Model:                            OLS   Adj. R-squared:                  0.118
Method:                 Least Squares   F-statistic:                     43.83
Date:                Thu, 23 Oct 2025   Prob (F-statistic):           2.33e-19
Time:                        18:24:03   Log-Likelihood:                -10524.
No. Observations:                2017   AIC:                         2.105e+04
Df Residuals:                    2014   BIC:                         2.107e+04
Df Model:                           2                  

In [None]:
#C8 Written Answers
#(i) This yields 2,017 observations.
#(ii)
#Income (β₁ = 0.7993):
#For every $1,000 increase in annual income, predicted net financial assets rise
#... by $799 on average, holding age constant.
#The effect is statistically significant (p < 0.001).
#It is slightly surprising that financial assets rise by only $799 for a $1,000 increase
#... in annual income, but this may
#... make sense because of expenses.
#Age (β₂ = 0.8427):
#Each additional year of age is associated with about $843 more in net financial assets, holding
#... income constant.
#This effect is also highly significant (p < 0.001).
#(iii)
#The intercept (-43.04) represents predicted assets when inc = 0 and age = 0.
#This is not economically meaningful since such values fall outside the observed data
#... range; it only anchors the regression line.
#(iv) One-sided p = .0944, so we fail to reject the null at the 1% significance level: there is
#... no strong evidence that the age effect is less than 1.
#... The estimate (0.8427) is close to 1, suggesting net financial assets rise by roughly
#... $1,000 per additional year of age.
#(v) The income coefficient changes only slightly, from 0.8207 (simple) to 0.7993 (multiple).
#... Age and income are positively
#... correlated, but not enough to cause large bias when age is omitted. Adding age still improves
#... explanatory power (R² rises from 0.083 → 0.119).
#To conclude, both income and age are important, but together they still explain only a modest
#... share of the variation in assets.

In [None]:
#C10)
from scipy.stats import t as tdist  # hoisted to the top of the cell; alias kept for the tests below

elem = wdata('elem94_95').copy().dropna(subset=['lavgsal','bs'])

#(i) Simple regression of log average salary on the benefits/salary ratio
m_c10a = ols_hc3(elem, 'lavgsal', ['bs'])

b_bs = m_c10a.params['bs']; se_bs = m_c10a.bse['bs']
t0 = b_bs/se_bs                 # H0: beta_bs = 0
t_vs_neg1 = (b_bs + 1)/se_bs    # H0: beta_bs = -1
# Two-sided p-values from the survival function: sf(|t|) is the upper-tail
# probability and avoids the rounding error of 1 - cdf(|t|) for large |t|.
p_two_sided = 2*tdist.sf(abs(t0), df=m_c10a.df_resid)
p_two_sided_neg1 = 2*tdist.sf(abs(t_vs_neg1), df=m_c10a.df_resid)

#(ii) Add log enrollment and log staff
elem2 = elem.dropna(subset=['lenrol', 'lstaff'])
m_c10b = ols_hc3(elem2, 'lavgsal', ['bs', 'lenrol', 'lstaff'])

#(v) Add lunch
elem3 = elem2.dropna(subset=['lunch'])
m_c10c = ols_hc3(elem3, 'lavgsal', ['bs', 'lenrol', 'lstaff', 'lunch'])

print_line()
print("C10) ELEM94_95 – Teacher pay regressions (HC3 SE)")
print("Part (i): lavgsal ~ bs")
print(m_c10a.summary())
print(f"Test β_bs ≠ 0: t = {t0:.3f}, p = {p_two_sided:.4f}")
print(f"Test β_bs ≠ -1: t = {t_vs_neg1:.3f}, p = {p_two_sided_neg1:.4f}")
print_line()
print("Part (ii): Add lenrol and lstaff")
print(m_c10b.summary())
print("Compare β_bs and its SE with part (i); note if coefficient shrinks, grows, and if SE increases.")
print_line()
print("Part (v): Add lunch (poverty proxy)")
print(m_c10c.summary())
print("Check how adding lunch changes the coefficients and R^2; poverty often explains salary differences.")
print_line()
print("C10 (iii) Why is SE on bs larger after adding controls?")
print("Because adding regressors can increase multicollinearity and reduce residual degrees of freedom;")
print("if bs is correlated with the added controls, Var(β̂_bs) rises (X'X becomes less informative).")
print_line()
print("C10 (iv) Why might β̂_lstaff be negative?")
print("Holding enrollment fixed, districts with more staff per student could be those with tighter budgets")
print("or lower salaries; the sign captures partial correlations, not necessarily causal effects.")
print_line()

------------------------------------------------------------------------------
C10) ELEM94_95 – Teacher pay regressions (HC3 SE)
Part (i): lavgsal ~ bs
                            OLS Regression Results                            
Dep. Variable:                lavgsal   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     6.357
Date:                Thu, 23 Oct 2025   Prob (F-statistic):             0.0118
Time:                        18:34:07   Log-Likelihood:                 85.171
No. Observations:                1848   AIC:                            -166.3
Df Residuals:                    1846   BIC:                            -155.3
Df Model:                           1                                         
Covariance Type:                  HC3                                         
                 coef    std err          z      P>|z|    

In [None]:
#C10 Written Answers
#(i) The coefficient on bs is -0.7951 with a p-value of 0.012, which is
#...statistically significant at the 5% level (p < 0.05) but not at the 1% level (0.012 > 0.01).
#...This means we reject the null hypothesis that the slope equals zero at the 5% level.
#... However, the coefficient is NOT statistically different from
#..-1, because the 95% confidence interval [-1.413, -0.177] includes -1.
#Because the dependent variable is log(avgsal), a one-unit increase in the benefits-to-salary ratio
#...is associated with approximately a 79.5% decrease in average teacher salary
#...(equivalently, about 0.8% lower salary for each 0.01 increase in bs).
#...This negative relationship is statistically significant at the 5% level.
#(ii) When you add lenrol and lstaff to the regression:
#The coefficient on bs becomes -0.6091 (less negative than before).
#The standard error increases from 0.315 to 0.319.
#The p-value increases to 0.058 (now only marginally significant at the 10% level,
#... not significant at the 5% level).
#(iii) In this output the (HC3 robust) standard error is larger in part
#...(ii) (0.319) than in part (i) (0.315), not smaller.
#...This is what we'd typically expect when adding variables - the standard error
#...can increase due to multicollinearity between
#... bs and the enrollment/staff variables. The error variance might decrease,
#... but the standard error can still rise due to correlation
#..among the predictors.
#(iv) The coefficient on lstaff is -0.7137 with a p-value of 0.000. Yes,
#... this is large in magnitude and highly statistically significant.
#.. The negative sign indicates that schools with more staff per student
#... (higher staff-to-student ratio) tend to have lower average teacher
#...salaries, holding other factors constant. This could reflect budget constraints
#...- schools spending more on staff numbers may have less
#... to spend per teacher.
#(v) The coefficient on lunch is -0.0008 with a p-value of 0.000 (highly significant)
#This negative coefficient suggests that teachers in schools with more students
#... receiving free/reduced lunch (a proxy for disadvantage)
#... earn lower salaries, holding other factors constant
#This means teachers are not being compensated for teaching students from
#...disadvantaged backgrounds - in fact, the opposite appears true
#This is concerning from an equity perspective, as it suggests schools serving
#...disadvantaged populations have lower teacher pay
#(vi) Yes, the pattern is consistent with Table 4.1. When additional control
#... variables (enrollment, staff, and lunch) are added, the
#... coefficient on bs becomes less negative and less statistically significant.
#... This suggests that some of the relationship between
#... benefits ratio and salary in the simple regression was capturing omitted
#...variable effects. The more complete model in part (v)
#... with all controls is likely the most reliable specification for understanding
#... these relationships.