In [39]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [40]:
# table 1 - household data_v1.dta
data_paths = [
    "household-data_v1.dta"
]

In [41]:
df1 = [pd.read_stata(path) for path in data_paths]
df1 = pd.concat(df1)

df1

Unnamed: 0,hhid,weight,stateid,districtid,urban,brahmin,landowner,men_age,men_age2,men_age3,...,landless,marginalfarmer,smallfarmer,semi_medfarmer,mediumfarmer,largefarmer,farmer_type,hh_total_consumption,kids,marriagepolicy
0,20101021,611.10,2,201,0,0.0,1,37.0,1369.0,50653.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,24420.0,3.0,0.0
1,20101051,611.10,2,201,0,0.0,0,35.5,1532.5,73733.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,165888.0,1.0,0.0
2,20101071,611.10,2,201,0,0.0,1,32.0,1024.0,32768.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,202968.0,3.0,0.0
3,20101081,611.10,2,201,0,0.0,1,39.0,1521.0,59319.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,38808.0,1.0,0.0
4,20101082,611.10,2,201,0,0.0,1,24.0,576.0,13824.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,28896.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30902,333011100,5234.43,33,3330,1,0.0,0,47.0,2209.0,103823.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,25800.0,0.0,0.0
30903,333011120,5234.43,33,3330,1,0.0,0,46.0,2116.0,97336.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,26352.0,4.0,0.0
30904,333011130,5234.43,33,3330,1,0.0,0,52.0,3104.0,203008.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,21300.0,2.0,1.0
30905,333011140,5234.43,33,3330,1,0.0,0,20.0,400.0,8000.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,21996.0,0.0,0.0


In [42]:
# Drop observations where the value of the variable "state" is equal to 19, 18, or 32
df1 = df1[~df1['stateid'].isin([19, 18, 32])]

# Create a new variable "old" which is the sum of "num_old_men" and "num_old_women"
df1['old'] = df1['num_old_men'] + df1['num_old_women']

# Drop observations where "eligwomanage" is less than 15
df1 = df1[df1['eligwomanage'] >= 15]

# Drop observations where "marriageage" is less than 6 or greater than 30
df1 = df1[(df1['marriageage'] >= 6) & (df1['marriageage'] <= 30)]

# Drop observations where "hhsize" is greater than 15
df1 = df1[df1['hhsize'] <= 15]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['old'] = df1['num_old_men'] + df1['num_old_women']


In [43]:
# table 1 column 1

# drop all NAs
df1.dropna(subset=['eligwomaneducation', 'marriagepolicy', 'marriageage', 'BPL', 'stateid', 'yom', 'men_age', 'women_age', 'caste', 'hh_total_income', 'districtid'], inplace=True)

# Define formula
formula = "eligwomaneducation ~ marriagepolicy + marriageage + BPL + C(stateid) + C(yom) + men_age + women_age + C(caste) + hh_total_income + C(districtid)"

# Define the subset of data based on conditions
subset_data = df1[(df1['hindu'] == 1) & (df1['urban'] == 0) & (df1['hh_total_income'] > 1000) & (df1['hh_total_income'] < 160000) & (df1['hhsize'] <= 15) & (df1['landowner'] == 1)]

# Fit the regression model
model = smf.wls(formula, data=subset_data, weights=subset_data['weight']).fit(cov_type='cluster', cov_kwds={'groups': subset_data['districtid']})

# Display the summary of the regression results
print(model.summary())

# 0.48 matches 

                            WLS Regression Results                            
Dep. Variable:     eligwomaneducation   R-squared:                       0.385
Model:                            WLS   Adj. R-squared:                  0.365
Method:                 Least Squares   F-statistic:                -1.485e+13
Date:                Thu, 22 Feb 2024   Prob (F-statistic):               1.00
Time:                        20:52:07   Log-Likelihood:                -25270.
No. Observations:                9270   AIC:                         5.112e+04
Df Residuals:                    8981   BIC:                         5.318e+04
Df Model:                         288                                         
Covariance Type:              cluster                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 1.20



In [44]:
# table 1 column 2

# drop all NAs
df1.dropna(subset=['eligwomaneducation', 'marriagepolicy', 'marriageage', 'BPL', 'stateid', 'yom', 'men_age', 'women_age', 'caste', 'hh_total_income', 'districtid', 
                  'landdowrypractice', 'samecastehusband', 'sameeconstatus', 'betteroffstatus', 'worseoffstatus', 'husbandbloodrelative'], inplace=True)

# Define formula
formula = "eligwomaneducation ~ marriagepolicy + marriageage + BPL + C(stateid) + C(yom) + men_age + women_age + C(caste) + hh_total_income + C(districtid) + landdowrypractice + samecastehusband + sameeconstatus + betteroffstatus + worseoffstatus + husbandbloodrelative"

# Define the subset of data based on conditions
subset_data = df1[(df1['hindu'] == 1) & (df1['urban'] == 0) & (df1['hh_total_income'] > 1000) & (df1['hh_total_income'] < 160000) & (df1['hhsize'] <= 15) & (df1['landowner'] == 1)]

# Fit the regression model
model = smf.wls(formula, data=subset_data, weights=subset_data['weight']).fit(cov_type='cluster', cov_kwds={'groups': subset_data['districtid']})

# Display the summary of the regression results
print(model.summary())

# 0.48 marriagepolicy matches 

                            WLS Regression Results                            
Dep. Variable:     eligwomaneducation   R-squared:                       0.402
Model:                            WLS   Adj. R-squared:                  0.375
Method:                 Least Squares   F-statistic:                -3.187e+13
Date:                Thu, 22 Feb 2024   Prob (F-statistic):               1.00
Time:                        20:52:08   Log-Likelihood:                -17674.
No. Observations:                6542   AIC:                         3.592e+04
Df Residuals:                    6258   BIC:                         3.784e+04
Df Model:                         283                                         
Covariance Type:              cluster                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                -0.77



In [45]:
# table 1 column 3

# drop all NAs
df1.dropna(subset=['eligwomaneducation', 'marriagepolicy', 'marriageage', 'BPL', 'stateid', 'yom', 'men_age', 'women_age', 'landowner', 'caste', 'farmer_type', 'household_occtype', 
                   'landdowrypractice', 'samecastehusband', 'sameeconstatus', 'betteroffstatus', 'worseoffstatus', 'husbandbloodrelative' , 'districtid'], inplace=True)

# Define formula
formula = "eligwomaneducation ~ marriagepolicy + marriageage + BPL + C(stateid) + C(yom) + men_age + women_age + landowner + C(caste) + C(farmer_type) + C(household_occtype) + landdowrypractice + samecastehusband + sameeconstatus + betteroffstatus + worseoffstatus + husbandbloodrelative + C(districtid)"

# Define the subset of data based on conditions
subset_data = df1[(df1['hindu'] == 1) & (df1['urban'] == 0) & (df1['hh_total_income'] > 1000) & (df1['hh_total_income'] < 160000) & (df1['hhsize'] <= 15) & (df1['landowner'] == 1)]

# Fit the regression model
model = smf.wls(formula, data=subset_data, weights=subset_data['weight']).fit(cov_type='cluster', cov_kwds={'groups': subset_data['districtid']})

# Display the summary of the regression results
print(model.summary())

# 0.51 marriagepolicy matches 

                            WLS Regression Results                            
Dep. Variable:     eligwomaneducation   R-squared:                       0.392
Model:                            WLS   Adj. R-squared:                  0.363
Method:                 Least Squares   F-statistic:                 2.455e+13
Date:                Thu, 22 Feb 2024   Prob (F-statistic):               0.00
Time:                        20:52:08   Log-Likelihood:                -17388.
No. Observations:                6410   AIC:                         3.537e+04
Df Residuals:                    6115   BIC:                         3.736e+04
Df Model:                         294                                         
Covariance Type:              cluster                                         
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         



In [46]:
# table 3 - household data_v1.dta
data_paths = [
    "household-data_v1.dta"
]

df3 = [pd.read_stata(path) for path in data_paths]
df3 = pd.concat(df3)
df3

Unnamed: 0,hhid,weight,stateid,districtid,urban,brahmin,landowner,men_age,men_age2,men_age3,...,landless,marginalfarmer,smallfarmer,semi_medfarmer,mediumfarmer,largefarmer,farmer_type,hh_total_consumption,kids,marriagepolicy
0,20101021,611.10,2,201,0,0.0,1,37.0,1369.0,50653.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,24420.0,3.0,0.0
1,20101051,611.10,2,201,0,0.0,0,35.5,1532.5,73733.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,165888.0,1.0,0.0
2,20101071,611.10,2,201,0,0.0,1,32.0,1024.0,32768.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,202968.0,3.0,0.0
3,20101081,611.10,2,201,0,0.0,1,39.0,1521.0,59319.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,38808.0,1.0,0.0
4,20101082,611.10,2,201,0,0.0,1,24.0,576.0,13824.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,28896.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30902,333011100,5234.43,33,3330,1,0.0,0,47.0,2209.0,103823.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,25800.0,0.0,0.0
30903,333011120,5234.43,33,3330,1,0.0,0,46.0,2116.0,97336.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,26352.0,4.0,0.0
30904,333011130,5234.43,33,3330,1,0.0,0,52.0,3104.0,203008.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,21300.0,2.0,1.0
30905,333011140,5234.43,33,3330,1,0.0,0,20.0,400.0,8000.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,21996.0,0.0,0.0


In [47]:
# Drop observations where state is 19, 18, or 32
df3 = df3[df3['stateid'] != 19]
df3 = df3[df3['stateid'] != 18]
df3 = df3[df3['stateid'] != 32]

# Create a new variable 'old' by summing 'num_old_men' and 'num_old_women'
df3['old'] = df3['num_old_men'] + df3['num_old_women']

# Drop observations where 'eligwomanage' is less than 15
df3 = df3[df3['eligwomanage'] >= 15]

# Drop observations where 'marriageage' is less than 6 or greater than 30
df3 = df3[(df3['marriageage'] >= 6) & (df3['marriageage'] <= 30)]

# Drop observations where 'hhsize' is greater than 15
df3 = df3[df3['hhsize'] <= 15]

In [48]:
# Generate a new column 'falsemarriagepolicy' and initialize it with 0
df3['falsemarriagepolicy'] = 0

# Replace values in 'falsemarriagepolicy' based on conditions
df3.loc[(df3['yom'] > 1982) & (df3['stateid'] == 28), 'falsemarriagepolicy'] = 1
df3.loc[(df3['yom'] > 1982) & (df3['stateid'] == 33), 'falsemarriagepolicy'] = 1
df3.loc[(df3['yom'] > 1982) & (df3['stateid'] == 27), 'falsemarriagepolicy'] = 1
df3.loc[(df3['yom'] > 1982) & (df3['stateid'] == 29), 'falsemarriagepolicy'] = 1


In [49]:
# filter

df3 = df3[~((df3['yom'] >= 1986) & (df3['stateid'] == 28))]
df3 = df3[~((df3['yom'] >= 1989) & (df3['stateid'] == 33))]
df3 = df3[~((df3['yom'] >= 1994) & (df3['stateid'] == 27))]
df3 = df3[~((df3['yom'] >= 1994) & (df3['stateid'] == 29))]

In [50]:
# table 2 column 1 
# drop all NAs
df3.dropna(subset=['eligwomaneducation', 'falsemarriagepolicy', 'marriageage', 'BPL', 'stateid', 'yom', 'men_age', 'women_age', 'landowner', 'caste', 'hh_total_income', 'household_occtype', 'landdowrypractice', 'samecastehusband', 'sameeconstatus', 'betteroffstatus', 'worseoffstatus', 'districtid'], inplace=True)

formula = "eligwomaneducation ~ falsemarriagepolicy + marriageage + BPL + C(yom) + men_age + women_age + C(caste) + hh_total_income + C(household_occtype) + landdowrypractice + samecastehusband + sameeconstatus + betteroffstatus + C(districtid)"

# Subset the data based on conditions
subset_data = df3[(df3['hindu'] == 1) & (df3['urban'] == 0) & (df3['landowner'] == 1) & (df3['hh_total_income'] > 1000) & (df3['hh_total_income'] < 160000) & (df3['hhsize'] <= 15)]

# Fit the regression model
model = smf.wls(formula, data=subset_data, weights=subset_data['weight']).fit(cov_type='cluster', cov_kwds={'groups': subset_data['districtid']})

# Display the summary of the regression results
print(model.summary())

# 0.08 falsemarriagepolicy matches

                            WLS Regression Results                            
Dep. Variable:     eligwomaneducation   R-squared:                       0.381
Model:                            WLS   Adj. R-squared:                  0.355
Method:                 Least Squares   F-statistic:                -1.337e+13
Date:                Thu, 22 Feb 2024   Prob (F-statistic):               1.00
Time:                        20:52:08   Log-Likelihood:                -20189.
No. Observations:                7506   AIC:                         4.097e+04
Df Residuals:                    7208   BIC:                         4.304e+04
Df Model:                         297                                         
Covariance Type:              cluster                                         
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         



In [51]:
data_paths = [
    "household-data_v1.dta"
]

df4 = [pd.read_stata(path) for path in data_paths]
df4 = pd.concat(df4)
df4

Unnamed: 0,hhid,weight,stateid,districtid,urban,brahmin,landowner,men_age,men_age2,men_age3,...,landless,marginalfarmer,smallfarmer,semi_medfarmer,mediumfarmer,largefarmer,farmer_type,hh_total_consumption,kids,marriagepolicy
0,20101021,611.10,2,201,0,0.0,1,37.0,1369.0,50653.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,24420.0,3.0,0.0
1,20101051,611.10,2,201,0,0.0,0,35.5,1532.5,73733.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,165888.0,1.0,0.0
2,20101071,611.10,2,201,0,0.0,1,32.0,1024.0,32768.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,202968.0,3.0,0.0
3,20101081,611.10,2,201,0,0.0,1,39.0,1521.0,59319.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,38808.0,1.0,0.0
4,20101082,611.10,2,201,0,0.0,1,24.0,576.0,13824.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,28896.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30902,333011100,5234.43,33,3330,1,0.0,0,47.0,2209.0,103823.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,25800.0,0.0,0.0
30903,333011120,5234.43,33,3330,1,0.0,0,46.0,2116.0,97336.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,26352.0,4.0,0.0
30904,333011130,5234.43,33,3330,1,0.0,0,52.0,3104.0,203008.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,21300.0,2.0,1.0
30905,333011140,5234.43,33,3330,1,0.0,0,20.0,400.0,8000.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,21996.0,0.0,0.0


In [52]:
# Drop observations based on conditions
df4 = df4[~((df4['stateid'] == 19) | (df4['stateid'] == 18) | (df4['stateid'] == 32) | (df4['northeaststate'] == 1))]

# Create a new column 'old' by summing 'num_old_men' and 'num_old_women'
df4['old'] = df4['num_old_men'] + df4['num_old_women']

# Drop observations where 'eligwomanage' is less than 15
df4 = df4[df4['eligwomanage'] >= 15]

# Drop observations where 'marriageage' is less than 6 or greater than 30
df4 = df4[(df4['marriageage'] >= 6) & (df4['marriageage'] <= 30)]

# Drop observations where 'hhsize' is greater than 15
df4 = df4[df4['hhsize'] <= 15]

# Drop observations where 'muslim' is 0
df4 = df4[df4['muslim'] != 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['old'] = df4['num_old_men'] + df4['num_old_women']


In [53]:
# table 3 column 2 
# drop all NAs
df4.dropna(subset=['eligwomaneducation', 'marriagepolicy', 'marriageage', 'BPL', 'stateid', 'yom', 'men_age', 'women_age', 'hh_total_income', 'household_occtype', 'landdowrypractice', 'samecastehusband', 'sameeconstatus', 'betteroffstatus', 'worseoffstatus', 'districtid'], inplace=True)

formula = "eligwomaneducation ~ marriagepolicy + marriageage + BPL + C(yom) + men_age + women_age + hh_total_income + C(household_occtype) + landdowrypractice + samecastehusband + sameeconstatus + betteroffstatus + C(districtid)"

# Subset the data based on conditions
subset_data = df4[(df4['urban'] == 0) & (df4['landowner'] == 1) & (df4['hh_total_income'] > 1000) & (df4['hh_total_income'] < 160000) & (df4['hhsize'] <= 15)]

# Fit the regression model
model = smf.wls(formula, data=subset_data, weights=subset_data['weight']).fit(cov_type='cluster', cov_kwds={'groups': subset_data['districtid']})

# Display the summary of the regression results
print(model.summary())

# 0.26 marriagepolicy matches

                            WLS Regression Results                            
Dep. Variable:     eligwomaneducation   R-squared:                       0.586
Model:                            WLS   Adj. R-squared:                  0.400
Method:                 Least Squares   F-statistic:                 3.454e+10
Date:                Thu, 22 Feb 2024   Prob (F-statistic):               0.00
Time:                        20:52:09   Log-Likelihood:                -1080.5
No. Observations:                 484   AIC:                             2463.
Df Residuals:                     333   BIC:                             3095.
Df Model:                         150                                         
Covariance Type:              cluster                                         
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

