<a href="https://colab.research.google.com/github/JiHoonPark96/practice/blob/main/econometrics_1208.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ============================================
# 1. Import required libraries
# ============================================

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [5]:
# ============================================
# 2. Upload ecls_k5.csv and load the data
# ============================================

from google.colab import files

print("Please upload ecls_k5.csv")
uploaded = files.upload()  # choose ecls_k5.csv

file_name = list(uploaded.keys())[0]
print("Loaded file:", file_name)

df = pd.read_csv(file_name)

print("Shape:", df.shape)
print("Columns:")
print(df.columns.tolist())

df.head()


Please upload ecls_k5.csv


Saving ecls_k5.csv to ecls_k5 (1).csv
Loaded file: ecls_k5 (1).csv
Shape: (8105, 40)
Columns:
['region', 'gender', 'race', 'bmi', 'mom_educ', 'mom_married_at_birth', 'family_income', 'mom_work_status', 'siblings', 'hhsize', 'pct_minority', 'part_dance', 'part_athletics', 'part_club', 'part_music', 'part_art', 'tv_afternoon_mf', 'tv_afterdinner_mf', 'tv_saturday', 'tv_sunday', 'dinner_as_family', 'home_language_nonenglish', 'both_parents', 'school_type', 'problem_crowding', 'problem_turnover', 'problem_parents', 'problem_drugs', 'problem_gangs', 'problem_crime', 'problem_weapons', 'problem_attacks', 'has_library_card', 'has_home_computer', 'school_has_security', 'reading_test', 'math_test', 'science_test', 'mom_curr_married', 'family_type']


Unnamed: 0,region,gender,race,bmi,mom_educ,mom_married_at_birth,family_income,mom_work_status,siblings,hhsize,...,problem_weapons,problem_attacks,has_library_card,has_home_computer,school_has_security,reading_test,math_test,science_test,mom_curr_married,family_type
0,1,2,1,18.18,3,1,120000.0,2,0,3,...,0,0,0,1,0,115.62794,110.00463,112.37262,1,2
1,1,1,1,15.88,5,1,55000.0,1,1,4,...,0,0,0,1,0,103.76482,91.927467,95.199417,1,1
2,1,2,1,16.8,5,1,70000.0,1,1,4,...,0,0,0,1,0,104.08086,105.23853,104.47813,1,1
3,1,2,1,21.47,5,1,50000.0,1,1,4,...,0,0,0,1,0,103.66688,106.17715,105.42258,1,1
4,1,2,1,24.76,3,1,50000.0,2,2,5,...,0,0,0,1,0,96.081596,79.131935,96.237595,1,1


In [6]:
# ============================================
# 3. Variable construction (Key Determinants)
# ============================================

# 3.1 Maternal education
# mom_educ: 1=1–8 years, 2=9–11, 3=HS grad, 4=some college, 5=college grad

# (a) Ordinal version
df['mom_educ_ord'] = df['mom_educ']

# (b) Dummy version: low vs medium vs high
#   low     : mom_educ in {1, 2}  (reference group)
#   medium  : mom_educ in {3, 4}
#   high    : mom_educ == 5
df['mom_high'] = (df['mom_educ'] == 5).astype(int)
df['mom_medium'] = df['mom_educ'].isin([3, 4]).astype(int)

# 3.2 Demographic controls
# gender: 1 = male, 2 = female
df['female'] = (df['gender'] == 2).astype(int)

# race: 1=white, 2=black, 3=hispanic, 4=other
# We will treat race as categorical (C(race)) in formula,
# but we can also define dummy variables if needed.
df['race_black'] = (df['race'] == 2).astype(int)
df['race_hispanic'] = (df['race'] == 3).astype(int)
df['race_other'] = (df['race'] == 4).astype(int)

# 3.3 SES: income and region
df['log_family_income'] = np.where(df['family_income'] > 0,
                                   np.log(df['family_income']),
                                   np.nan)

# region: 1=NE, 2=Midwest, 3=South, 4=West
# We will use C(region) in the regression model.

# 3.4 Home environment
# has_home_computer: already 0/1
# dinner_as_family: number of days eating together per week
# extracurricular_invest: sum of after-school activities
extra_cols = ['part_dance', 'part_art', 'part_music', 'part_club', 'part_athletics']

for col in extra_cols:
    if col not in df.columns:
        print(f"Warning: {col} is not found in the dataset")

df['extracurricular_invest'] = df[extra_cols].sum(axis=1)

# 3.5 Family structure / resource availability
df['siblings'] = df['siblings']
df['hhsize'] = df['hhsize']
df['both_parents'] = df['both_parents']
df['mom_married_at_birth'] = df['mom_married_at_birth']

# 3.6 Language and cultural background
df['home_language_nonenglish'] = df['home_language_nonenglish']

# 3.7 TV and library/computer (optional, for robustness or extensions)
df['tv_weekday'] = df['tv_afternoon_mf'] + df['tv_afterdinner_mf']
df['tv_saturday'] = df['tv_saturday']
df['tv_sunday'] = df['tv_sunday']

df['has_library_card'] = df['has_library_card']
df['has_home_computer'] = df['has_home_computer']


In [7]:
# ============================================
# 4. Create total score and log-transformed outcomes
# ============================================

# Total score as the sum of all three tests
df['total_score'] = df['math_test'] + df['reading_test'] + df['science_test']

# Log-transformed scores
df['log_math'] = np.where(df['math_test'] > 0, np.log(df['math_test']), np.nan)
df['log_reading'] = np.where(df['reading_test'] > 0, np.log(df['reading_test']), np.nan)
df['log_science'] = np.where(df['science_test'] > 0, np.log(df['science_test']), np.nan)
df['log_total'] = np.where(df['total_score'] > 0, np.log(df['total_score']), np.nan)

df[['math_test', 'reading_test', 'science_test', 'total_score',
    'log_math', 'log_reading', 'log_science', 'log_total']].describe()


Unnamed: 0,math_test,reading_test,science_test,total_score,log_math,log_reading,log_science,log_total
count,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0
mean,99.999999,99.999997,99.999999,299.999995,4.59981,4.59985,4.599901,5.699398
std,10.0,10.0,10.0,27.338451,0.105481,0.10492,0.104118,0.095019
min,66.521202,63.087399,69.526131,203.520425,4.197521,4.144521,4.241703,5.315766
25%,94.286201,93.668907,93.411438,283.178281,4.546335,4.539766,4.537014,5.646077
50%,101.84389,101.53018,101.60873,304.724082,4.623441,4.620356,4.621129,5.719407
75%,107.52917,107.74442,107.57825,320.9081,4.677762,4.679762,4.678218,5.771155
max,117.0857,117.65336,120.41129,352.32562,4.762906,4.767743,4.790913,5.864556


In [8]:
# ============================================
# 5. Descriptive statistics for key determinants
# ============================================

desc_cols = [
    'log_math', 'log_reading', 'log_science', 'log_total',
    'mom_educ_ord', 'mom_high', 'mom_medium',
    'log_family_income', 'female', 'race',
    'siblings', 'hhsize', 'both_parents', 'mom_married_at_birth',
    'extracurricular_invest', 'dinner_as_family',
    'has_home_computer', 'has_library_card',
    'home_language_nonenglish'
]

df[desc_cols].describe()


Unnamed: 0,log_math,log_reading,log_science,log_total,mom_educ_ord,mom_high,mom_medium,log_family_income,female,race,siblings,hhsize,both_parents,mom_married_at_birth,extracurricular_invest,dinner_as_family,has_home_computer,has_library_card,home_language_nonenglish
count,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0
mean,4.59981,4.59985,4.599901,5.699398,3.666749,0.275139,0.612461,10.628958,0.492165,1.764343,1.571622,4.589389,0.810611,0.784454,1.540777,5.427144,0.873411,0.117705,0.143615
std,0.105481,0.10492,0.104118,0.095019,1.082419,0.446612,0.487218,0.952524,0.499969,1.080079,1.114202,1.329551,0.391842,0.411226,1.08787,1.768752,0.332532,0.322279,0.350721
min,4.197521,4.144521,4.241703,5.315766,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.546335,4.539766,4.537014,5.646077,3.0,0.0,0.0,10.126631,0.0,1.0,1.0,4.0,1.0,1.0,1.0,4.0,1.0,0.0,0.0
50%,4.623441,4.620356,4.621129,5.719407,4.0,0.0,1.0,10.778956,0.0,1.0,1.0,4.0,1.0,1.0,1.0,6.0,1.0,0.0,0.0
75%,4.677762,4.679762,4.678218,5.771155,5.0,1.0,1.0,11.225243,1.0,3.0,2.0,5.0,1.0,1.0,2.0,7.0,1.0,0.0,0.0
max,4.762906,4.767743,4.790913,5.864556,5.0,1.0,1.0,13.815511,1.0,4.0,10.0,15.0,1.0,1.0,5.0,7.0,1.0,1.0,1.0


In [9]:
# ============================================
# 6. Block-wise regression function (Models 1–6)
# ============================================

def run_block_models(data, dep_var, use_dummy_spec=False):
    """
    Runs the 6 block-wise regression models for a given dependent variable.

    Parameters
    ----------
    data : pandas DataFrame
    dep_var : str
        Dependent variable name (e.g., 'log_math', 'log_reading', 'log_total').
    use_dummy_spec : bool
        If True, use mom_high and mom_medium (reference: low education).
        If False, use mom_educ_ord as a single ordinal predictor.
    """
    df_model = data.copy()

    # Variables that must be non-missing
    needed_cols = [
        dep_var,
        'mom_educ_ord', 'mom_high', 'mom_medium',
        'female', 'race',
        'log_family_income', 'region',
        'has_home_computer', 'dinner_as_family', 'extracurricular_invest',
        'siblings', 'hhsize', 'both_parents', 'mom_married_at_birth',
        'home_language_nonenglish'
    ]

    df_model = df_model.dropna(subset=needed_cols)

    print("Dependent variable:", dep_var)
    print("Number of observations used:", df_model.shape[0])
    print()

    # Maternal education specification
    if use_dummy_spec:
        mom_term = "mom_high + mom_medium"
    else:
        mom_term = "mom_educ_ord"

    # Model 1: Baseline (maternal education only)
    formula1 = f"{dep_var} ~ {mom_term}"

    # Model 2: + Demographics
    formula2 = formula1 + " + female + C(race)"

    # Model 3: + SES (income, region)
    formula3 = formula2 + " + log_family_income + C(region)"

    # Model 4: + Home environment
    formula4 = formula3 + " + has_home_computer + dinner_as_family + extracurricular_invest"

    # Model 5: + Family structure / resource dilution
    formula5 = formula4 + " + siblings + hhsize + both_parents + mom_married_at_birth"

    # Model 6: + Language & interaction (Non-English home)
    if use_dummy_spec:
        interaction_part = "mom_high:home_language_nonenglish + mom_medium:home_language_nonenglish"
    else:
        interaction_part = "mom_educ_ord:home_language_nonenglish"

    formula6 = formula5 + f" + home_language_nonenglish + {interaction_part}"

    formulas = [
        ("Model 1: Baseline (Maternal education only)", formula1),
        ("Model 2: + Demographics", formula2),
        ("Model 3: + SES", formula3),
        ("Model 4: + Home environment", formula4),
        ("Model 5: + Family structure", formula5),
        ("Model 6: + Language & interaction", formula6)
    ]

    results = {}

    for label, f in formulas:
        print("=" * 80)
        print(label)
        print("Formula:", f)
        model = smf.ols(f, data=df_model).fit(cov_type='HC3')  # robust SEs
        print(model.summary())
        print()
        results[label] = model

    return results


In [10]:
# ============================================
# 7. Run block-wise models for each outcome
# ============================================

# Example 1: log of math, using maternal education dummies
math_results = run_block_models(df, dep_var='log_math', use_dummy_spec=True)

# Example 2: log of reading
reading_results = run_block_models(df, dep_var='log_reading', use_dummy_spec=True)

# Example 3: log of science
science_results = run_block_models(df, dep_var='log_science', use_dummy_spec=True)

# Example 4: log of total score
total_results = run_block_models(df, dep_var='log_total', use_dummy_spec=True)


Dependent variable: log_math
Number of observations used: 8105

Model 1: Baseline (Maternal education only)
Formula: log_math ~ mom_high + mom_medium
                            OLS Regression Results                            
Dep. Variable:               log_math   R-squared:                       0.142
Model:                            OLS   Adj. R-squared:                  0.141
Method:                 Least Squares   F-statistic:                     698.6
Date:                Mon, 08 Dec 2025   Prob (F-statistic):          1.29e-280
Time:                        06:09:21   Log-Likelihood:                 7348.4
No. Observations:                8105   AIC:                        -1.469e+04
Df Residuals:                    8102   BIC:                        -1.467e+04
Df Model:                           2                                         
Covariance Type:                  HC3                                         
                 coef    std err          z      P>|z|      

In [11]:
# ============================================
# 8. Collect results into a summary table
# ============================================

def significance_stars(p):
    """Return significance stars for a given p-value."""
    if p < 0.01:
        return "***"
    elif p < 0.05:
        return "**"
    elif p < 0.10:
        return "*"
    else:
        return ""

def summarize_models(results_dict):
    """
    Convert a dictionary of statsmodels results (from run_block_models)
    into a long-format DataFrame with coefficients, standard errors,
    p-values, and significance stars.
    """
    rows = []
    for model_name, model in results_dict.items():
        for var in model.params.index:
            coef = model.params[var]
            se = model.bse[var]
            pval = model.pvalues[var]
            rows.append({
                "Model": model_name,
                "Variable": var,
                "Coefficient": coef,
                "Std_Error": se,
                "p_value": pval,
                "Significance": significance_stars(pval)
            })
    return pd.DataFrame(rows)

math_report = summarize_models(math_results)
reading_report = summarize_models(reading_results)
science_report = summarize_models(science_results)
total_report = summarize_models(total_results)

# For example, preview the math report
math_report.head(20)


Unnamed: 0,Model,Variable,Coefficient,Std_Error,p_value,Significance
0,Model 1: Baseline (Maternal education only),Intercept,4.517789,0.004027,0.0,***
1,Model 1: Baseline (Maternal education only),mom_high,0.134977,0.004338,1.5593949999999998e-212,***
2,Model 1: Baseline (Maternal education only),mom_medium,0.073284,0.004277,7.990117e-66,***
3,Model 2: + Demographics,Intercept,4.555833,0.004648,0.0,***
4,Model 2: + Demographics,C(race)[T.2],-0.080865,0.004243,5.634035000000001e-81,***
5,Model 2: + Demographics,C(race)[T.3],-0.030777,0.003317,1.719226e-20,***
6,Model 2: + Demographics,C(race)[T.4],-0.006909,0.00363,0.05698734,*
7,Model 2: + Demographics,mom_high,0.112965,0.004643,9.240457e-131,***
8,Model 2: + Demographics,mom_medium,0.060018,0.004444,1.438579e-41,***
9,Model 2: + Demographics,female,-0.02086,0.002099,2.788734e-23,***


In [12]:
# ============================================
# 9. Narrative-style summary for maternal education
# ============================================

def narrative_summary(results_dict, mom_var_high='mom_high', mom_var_medium='mom_medium'):
    """
    Print how the coefficients on maternal education dummies change
    across Models 1–6.
    """
    print("==============================================")
    print("Narrative Summary of Maternal Education Effects")
    print("==============================================\n")

    for model_name, model in results_dict.items():
        print(model_name)
        for mom_var in [mom_var_high, mom_var_medium]:
            if mom_var in model.params.index:
                coef = model.params[mom_var]
                p = model.pvalues[mom_var]
                stars = significance_stars(p)
                print(f"  {mom_var}: {coef:.4f} (p = {p:.4f}) {stars}")
        print()

# Example: log_math models
narrative_summary(math_results)


Narrative Summary of Maternal Education Effects

Model 1: Baseline (Maternal education only)
  mom_high: 0.1350 (p = 0.0000) ***
  mom_medium: 0.0733 (p = 0.0000) ***

Model 2: + Demographics
  mom_high: 0.1130 (p = 0.0000) ***
  mom_medium: 0.0600 (p = 0.0000) ***

Model 3: + SES
  mom_high: 0.0906 (p = 0.0000) ***
  mom_medium: 0.0485 (p = 0.0000) ***

Model 4: + Home environment
  mom_high: 0.0727 (p = 0.0000) ***
  mom_medium: 0.0378 (p = 0.0000) ***

Model 5: + Family structure
  mom_high: 0.0678 (p = 0.0000) ***
  mom_medium: 0.0350 (p = 0.0000) ***

Model 6: + Language & interaction
  mom_high: 0.0802 (p = 0.0000) ***
  mom_medium: 0.0484 (p = 0.0000) ***

