<a href="https://colab.research.google.com/github/JiHoonPark96/practice/blob/main/econometrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================
# 0. Import required libraries
# ============================================

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [2]:
# ============================================
# 1. Upload ecls_k5.csv and load it
# ============================================

from google.colab import files

print("Please upload ecls_k5.csv")
uploaded = files.upload()  # Choose ecls_k5.csv from your computer

# Get the uploaded file name (in case it is slightly different)
file_name = list(uploaded.keys())[0]
print("Loaded file:", file_name)

# Read the CSV into a pandas DataFrame
df = pd.read_csv(file_name)

# Quick check of the data
print(df.shape)
df.head()


Please upload ecls_k5.csv


Saving ecls_k5.csv to ecls_k5.csv
Loaded file: ecls_k5.csv
(8105, 40)


Unnamed: 0,region,gender,race,bmi,mom_educ,mom_married_at_birth,family_income,mom_work_status,siblings,hhsize,...,problem_weapons,problem_attacks,has_library_card,has_home_computer,school_has_security,reading_test,math_test,science_test,mom_curr_married,family_type
0,1,2,1,18.18,3,1,120000.0,2,0,3,...,0,0,0,1,0,115.62794,110.00463,112.37262,1,2
1,1,1,1,15.88,5,1,55000.0,1,1,4,...,0,0,0,1,0,103.76482,91.927467,95.199417,1,1
2,1,2,1,16.8,5,1,70000.0,1,1,4,...,0,0,0,1,0,104.08086,105.23853,104.47813,1,1
3,1,2,1,21.47,5,1,50000.0,1,1,4,...,0,0,0,1,0,103.66688,106.17715,105.42258,1,1
4,1,2,1,24.76,3,1,50000.0,2,2,5,...,0,0,0,1,0,96.081596,79.131935,96.237595,1,1


In [3]:
# ============================================
# 2. Inspect basic information
# ============================================

# Column names
print("Columns:\n", df.columns.tolist())

# Summary statistics for key variables
df[['reading_test', 'math_test', 'science_test',
    'mom_educ', 'family_income']].describe()


Columns:
 ['region', 'gender', 'race', 'bmi', 'mom_educ', 'mom_married_at_birth', 'family_income', 'mom_work_status', 'siblings', 'hhsize', 'pct_minority', 'part_dance', 'part_athletics', 'part_club', 'part_music', 'part_art', 'tv_afternoon_mf', 'tv_afterdinner_mf', 'tv_saturday', 'tv_sunday', 'dinner_as_family', 'home_language_nonenglish', 'both_parents', 'school_type', 'problem_crowding', 'problem_turnover', 'problem_parents', 'problem_drugs', 'problem_gangs', 'problem_crime', 'problem_weapons', 'problem_attacks', 'has_library_card', 'has_home_computer', 'school_has_security', 'reading_test', 'math_test', 'science_test', 'mom_curr_married', 'family_type']


Unnamed: 0,reading_test,math_test,science_test,mom_educ,family_income
count,8105.0,8105.0,8105.0,8105.0,8105.0
mean,99.999997,99.999999,99.999999,3.666749,58425.908895
std,10.0,10.0,10.0,1.082419,58918.038891
min,63.087399,66.521202,69.526131,1.0,1.0
25%,93.668907,94.286201,93.411438,3.0,25000.0
50%,101.53018,101.84389,101.60873,4.0,48000.0
75%,107.74442,107.52917,107.57825,5.0,75000.0
max,117.65336,117.0857,120.41129,5.0,999999.99


In [4]:
# ============================================
# 3. Variable construction
# ============================================

# 3.1 Mother’s education dummies
# mom_educ coding from the project description:
# 1 = 1–8 years, 2 = 9–11 years, 3 = HS grad, 4 = some college, 5 = college grad
# We use (1 & 2) as the reference group ("Less than HS / some HS")

df['MomHS']        = (df['mom_educ'] == 3).astype(int)
df['MomSomeCol']   = (df['mom_educ'] == 4).astype(int)
df['MomCollege']   = (df['mom_educ'] == 5).astype(int)
# If you later decide to separate "college" vs "grad school", you can modify here.

# 3.2 Gender: Female dummy (1 = female, 0 = male)
df['Female'] = (df['gender'] == 2).astype(int)

# 3.3 Race dummies (1 = white, 2 = black, 3 = hispanic, 4 = other)
df['RaceBlack']    = (df['race'] == 2).astype(int)
df['RaceHispanic'] = (df['race'] == 3).astype(int)
df['RaceOther']    = (df['race'] == 4).astype(int)
# White (race == 1) is the omitted reference category.

# 3.4 Region dummies (1 = NE, 2 = Midwest, 3 = South, 4 = West)
df['RegionMidwest'] = (df['region'] == 2).astype(int)
df['RegionSouth']   = (df['region'] == 3).astype(int)
df['RegionWest']    = (df['region'] == 4).astype(int)
# Northeast (region == 1) is the reference group.

# 3.5 Non-English home language (already coded 0/1 in the CSV)
df['NonEnglishHome'] = df['home_language_nonenglish']

# 3.6 Log family income
# Safely compute log income only for positive values
df['log_family_income'] = np.where(df['family_income'] > 0,
                                   np.log(df['family_income']),
                                   np.nan)

# 3.7 Both parents present (reconstructed from family_type)
# From the project description:
# 1 = two parents + siblings
# 2 = two parents + no siblings
# 3 = one parent + siblings
# 4 = one parent + no siblings
# 5 = other family type
df['BothParents'] = df['family_type'].isin([1, 2]).astype(int)

# 3.8 Number of siblings & family dinner frequency
df['NumSiblings']    = df['siblings']
df['FamilyDinnerFreq'] = df['dinner_as_family']

# 3.9 Home resources and activities
df['ComputerAtHome'] = df['has_home_computer']
df['LibraryCard']    = df['has_library_card']

# TV watching: create a weekday total (after school + after dinner)
df['TVWeekday']  = df['tv_afternoon_mf'] + df['tv_afterdinner_mf']
df['TVSaturday'] = df['tv_saturday']
df['TVSunday']   = df['tv_sunday']

# After-school activities
df['MusicLessons'] = df['part_music']
df['Clubs']        = df['part_club']
df['Athletics']    = df['part_athletics']

# 3.10 School environment variables (from principal questionnaire)
df['ProblemCrowding']  = df['problem_crowding']
df['ProblemTurnover']  = df['problem_turnover']
df['ProblemParents']   = df['problem_parents']
df['ProblemDrugs']     = df['problem_drugs']
df['ProblemGangs']     = df['problem_gangs']
df['ProblemCrime']     = df['problem_crime']
df['ProblemWeapons']   = df['problem_weapons']
df['ProblemAttacks']   = df['problem_attacks']

df['SchoolSecurity']   = df['school_has_security']


In [5]:
# ============================================
# 4. Descriptive statistics and simple checks
# ============================================

desc_cols = [
    'reading_test', 'math_test', 'science_test',
    'mom_educ', 'MomHS', 'MomSomeCol', 'MomCollege',
    'Female', 'RaceBlack', 'RaceHispanic', 'RaceOther',
    'log_family_income', 'BothParents', 'NumSiblings',
    'FamilyDinnerFreq', 'ComputerAtHome', 'LibraryCard',
    'TVWeekday', 'TVSaturday', 'TVSunday',
    'MusicLessons', 'Clubs', 'Athletics'
]

df[desc_cols].describe()


Unnamed: 0,reading_test,math_test,science_test,mom_educ,MomHS,MomSomeCol,MomCollege,Female,RaceBlack,RaceHispanic,...,NumSiblings,FamilyDinnerFreq,ComputerAtHome,LibraryCard,TVWeekday,TVSaturday,TVSunday,MusicLessons,Clubs,Athletics
count,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,...,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0,8105.0
mean,99.999997,99.999999,99.999999,3.666749,0.340037,0.272424,0.275139,0.492165,0.090438,0.180753,...,1.571622,5.427144,0.873411,0.117705,1.694756,2.808143,2.495373,0.314004,0.302776,0.674522
std,10.0,10.0,10.0,1.082419,0.47375,0.445235,0.446612,0.499969,0.286826,0.384837,...,1.114202,1.768752,0.332532,0.322279,1.229111,1.711001,1.700418,0.464146,0.459488,0.468582
min,63.087399,66.521202,69.526131,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,93.668907,94.286201,93.411438,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0
50%,101.53018,101.84389,101.60873,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,6.0,1.0,0.0,2.0,3.0,2.0,0.0,0.0,1.0
75%,107.74442,107.52917,107.57825,5.0,1.0,1.0,1.0,1.0,0.0,0.0,...,2.0,7.0,1.0,0.0,2.0,4.0,3.0,1.0,1.0,1.0
max,117.65336,117.0857,120.41129,5.0,1.0,1.0,1.0,1.0,1.0,1.0,...,10.0,7.0,1.0,1.0,10.0,23.0,20.0,1.0,1.0,1.0


In [6]:
# ============================================
# 5.1 Baseline model: mother’s education only
# ============================================

baseline_formula = """
reading_test ~ MomHS + MomSomeCol + MomCollege
"""

baseline_model = smf.ols(baseline_formula, data=df).fit(cov_type='HC3')  # robust SE
print(baseline_model.summary())


                            OLS Regression Results                            
Dep. Variable:           reading_test   R-squared:                       0.196
Model:                            OLS   Adj. R-squared:                  0.196
Method:                 Least Squares   F-statistic:                     665.3
Date:                Fri, 05 Dec 2025   Prob (F-statistic):               0.00
Time:                        04:03:52   Log-Likelihood:                -29279.
No. Observations:                8105   AIC:                         5.857e+04
Df Residuals:                    8101   BIC:                         5.859e+04
Df Model:                           3                                         
Covariance Type:                  HC3                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     91.0388      0.344    264.728      0.0

In [7]:
# ============================================
# 5.2 Add child demographics: gender, race
# ============================================

demo_formula = """
reading_test ~ MomHS + MomSomeCol + MomCollege
               + Female
               + RaceBlack + RaceHispanic + RaceOther
"""

demo_model = smf.ols(demo_formula, data=df).fit(cov_type='HC3')
print(demo_model.summary())


                            OLS Regression Results                            
Dep. Variable:           reading_test   R-squared:                       0.237
Model:                            OLS   Adj. R-squared:                  0.236
Method:                 Least Squares   F-statistic:                     355.3
Date:                Fri, 05 Dec 2025   Prob (F-statistic):               0.00
Time:                        04:04:03   Log-Likelihood:                -29066.
No. Observations:                8105   AIC:                         5.815e+04
Df Residuals:                    8097   BIC:                         5.820e+04
Df Model:                           7                                         
Covariance Type:                  HC3                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       93.2704      0.410    227.422   

In [8]:
# ============================================
# 5.3 Add family background and home environment
# ============================================

family_formula = """
reading_test ~ MomHS + MomSomeCol + MomCollege
               + Female
               + RaceBlack + RaceHispanic + RaceOther
               + log_family_income
               + BothParents
               + NumSiblings
               + FamilyDinnerFreq
               + NonEnglishHome
               + ComputerAtHome
               + LibraryCard
"""

family_model = smf.ols(family_formula, data=df).fit(cov_type='HC3')
print(family_model.summary())


                            OLS Regression Results                            
Dep. Variable:           reading_test   R-squared:                       0.274
Model:                            OLS   Adj. R-squared:                  0.273
Method:                 Least Squares   F-statistic:                     205.2
Date:                Fri, 05 Dec 2025   Prob (F-statistic):               0.00
Time:                        04:04:14   Log-Likelihood:                -28863.
No. Observations:                8105   AIC:                         5.776e+04
Df Residuals:                    8090   BIC:                         5.786e+04
Df Model:                          14                                         
Covariance Type:                  HC3                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            79.9207      1.64

In [9]:
# ============================================
# 5.4 Add region and school environment
# ============================================

school_formula = """
reading_test ~ MomHS + MomSomeCol + MomCollege
               + Female
               + RaceBlack + RaceHispanic + RaceOther
               + log_family_income
               + BothParents
               + NumSiblings
               + FamilyDinnerFreq
               + NonEnglishHome
               + ComputerAtHome
               + LibraryCard
               + RegionMidwest + RegionSouth + RegionWest
               + SchoolSecurity
               + ProblemCrowding + ProblemTurnover
               + ProblemParents + ProblemDrugs
               + ProblemGangs + ProblemCrime
               + ProblemWeapons + ProblemAttacks
"""

school_model = smf.ols(school_formula, data=df).fit(cov_type='HC3')
print(school_model.summary())


                            OLS Regression Results                            
Dep. Variable:           reading_test   R-squared:                       0.281
Model:                            OLS   Adj. R-squared:                  0.278
Method:                 Least Squares   F-statistic:                     114.8
Date:                Fri, 05 Dec 2025   Prob (F-statistic):               0.00
Time:                        04:04:23   Log-Likelihood:                -28827.
No. Observations:                8105   AIC:                         5.771e+04
Df Residuals:                    8078   BIC:                         5.790e+04
Df Model:                          26                                         
Covariance Type:                  HC3                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            81.5039      1.65

In [10]:
# ============================================
# 5.5 Full Extended Model (mother’s education + all controls)
# ============================================

full_formula = """
reading_test ~ MomHS + MomSomeCol + MomCollege
               + Female
               + RaceBlack + RaceHispanic + RaceOther
               + NonEnglishHome
               + log_family_income
               + BothParents
               + NumSiblings
               + FamilyDinnerFreq
               + RegionMidwest + RegionSouth + RegionWest
               + ComputerAtHome
               + TVWeekday + TVSaturday + TVSunday
               + MusicLessons + Clubs + Athletics
               + LibraryCard
               + SchoolSecurity
               + ProblemCrowding + ProblemTurnover
               + ProblemParents + ProblemDrugs
               + ProblemGangs + ProblemCrime
               + ProblemWeapons + ProblemAttacks
"""

full_model = smf.ols(full_formula, data=df).fit(cov_type='HC3')
print(full_model.summary())


                            OLS Regression Results                            
Dep. Variable:           reading_test   R-squared:                       0.292
Model:                            OLS   Adj. R-squared:                  0.289
Method:                 Least Squares   F-statistic:                     98.78
Date:                Fri, 05 Dec 2025   Prob (F-statistic):               0.00
Time:                        04:04:36   Log-Likelihood:                -28766.
No. Observations:                8105   AIC:                         5.760e+04
Df Residuals:                    8072   BIC:                         5.783e+04
Df Model:                          32                                         
Covariance Type:                  HC3                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            81.8026      1.65

In [11]:
# ============================================
# 6. Example: Interaction term (MomCollege × LowIncome)
# ============================================

# Define a low-income dummy (for example: bottom 25% of family income)
income_q1 = df['family_income'].quantile(0.25)
df['LowIncome'] = (df['family_income'] <= income_q1).astype(int)

# Interaction: MomCollege × LowIncome
df['MomCollege_LowInc'] = df['MomCollege'] * df['LowIncome']

interaction_formula = """
reading_test ~ MomHS + MomSomeCol + MomCollege
               + MomCollege_LowInc
               + Female
               + RaceBlack + RaceHispanic + RaceOther
               + log_family_income
               + BothParents
               + NumSiblings
               + NonEnglishHome
"""

interaction_model = smf.ols(interaction_formula, data=df).fit(cov_type='HC3')
print(interaction_model.summary())


                            OLS Regression Results                            
Dep. Variable:           reading_test   R-squared:                       0.264
Model:                            OLS   Adj. R-squared:                  0.263
Method:                 Least Squares   F-statistic:                     227.9
Date:                Fri, 05 Dec 2025   Prob (F-statistic):               0.00
Time:                        04:05:08   Log-Likelihood:                -28918.
No. Observations:                8105   AIC:                         5.786e+04
Df Residuals:                    8092   BIC:                         5.795e+04
Df Model:                          12                                         
Covariance Type:                  HC3                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            78.6869      1.74

In [12]:
# Same full model but with math_test as the dependent variable

full_formula_math = full_formula.replace("reading_test", "math_test")

full_model_math = smf.ols(full_formula_math, data=df).fit(cov_type='HC3')
print(full_model_math.summary())


                            OLS Regression Results                            
Dep. Variable:              math_test   R-squared:                       0.264
Model:                            OLS   Adj. R-squared:                  0.261
Method:                 Least Squares   F-statistic:                     85.96
Date:                Fri, 05 Dec 2025   Prob (F-statistic):               0.00
Time:                        04:05:26   Log-Likelihood:                -28918.
No. Observations:                8105   AIC:                         5.790e+04
Df Residuals:                    8072   BIC:                         5.813e+04
Df Model:                          32                                         
Covariance Type:                  HC3                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            83.3277      1.74