In [13]:
import pandas as pd

In [14]:
# Load the 1314_sec_2ab Stata file, keeping the raw numeric answer codes
# rather than converting labelled values to categoricals.
df1 = pd.read_stata(
    filepath_or_buffer="../00_source_data/1314_sec_2ab.dta",
    convert_categoricals=False,
)

In [15]:
# Peek at ten randomly chosen rows of the raw frame.
df1.sample(n=10)

Unnamed: 0,hhcode,psu,province,region,idc,s2aq01,s2aq02,s2aq03,s2bq01,s2bq02,...,s2bq19d,s2bq19e,s2bq19f,s2bq19g,s2bq19h,s2bq19i,filter__,hhcode_new,stratum,psu_new
6479,1221100000.0,12211002,1,1,54,,,,1,10.0,...,,,,,,,0,1221010000.0,12210,12210102
33737,2333100000.0,23331002,2,1,2,2.0,,1.0,1,,...,,,,,,,0,2331030000.0,23310,23310302
27583,2214100000.0,22141001,2,1,52,,,,3,,...,60.0,0.0,0.0,0.0,500.0,2000.0,0,2211040000.0,22110,22110401
2008,1133100000.0,11331002,1,1,3,1.0,1.0,1.0,3,,...,0.0,0.0,0.0,0.0,160.0,1070.0,0,1131030000.0,11310,11310302
57087,2824100000.0,28241001,2,1,57,,,,3,,...,0.0,0.0,0.0,0.0,700.0,2200.0,0,2821040000.0,28210,28210401
31394,2313100000.0,23131004,2,1,2,1.0,1.0,1.0,3,,...,750.0,0.0,0.0,10000.0,700.0,16750.0,0,2311030000.0,23110,23110304
74986,3251100000.0,32511001,3,1,53,,,,3,,...,0.0,0.0,0.0,0.0,0.0,600.0,0,3251010000.0,32510,32510101
30016,2311210000.0,23112101,2,2,4,1.0,1.0,1.0,2,,...,,,,,,,0,2312110000.0,23121,23121101
102765,4414100000.0,44141001,4,1,5,2.0,,1.0,1,3.0,...,,,,,,,0,4401040000.0,44010,44010401
85342,3421200000.0,34212001,3,2,1,1.0,1.0,1.0,2,,...,,,,,,,0,3402010000.0,34020,34020101


In [16]:
# Cast hhcode to string so it can be sliced by position and used as a join key.
df1["hhcode"] = df1["hhcode"].astype("str")

# Derive yes/no flags from the s2bq01 answer code.
# `df1["s2bq01"] == 1 | 2` parses as `df1["s2bq01"] == (1 | 2)`, i.e. `== 3`,
# because `|` binds tighter than `==`; `isin` tests for either code explicitly.
# NOTE(review): confirm against the survey codebook that codes 1 and 2 are the
# intended "ever admitted" answers (code 3 is treated as "currently enrolled" below).
df1["ever_admitted"] = "no"
df1.loc[df1["s2bq01"].isin([1, 2]), "ever_admitted"] = "yes"

df1["currently_enrolled"] = "no"
df1.loc[df1["s2bq01"] == 3, "currently_enrolled"] = "yes"

In [17]:
# Keep only the identifier, enrollment-flag and location columns of interest;
# .copy() detaches the subset from df1.
df1_rel = df1.loc[
    :,
    ["hhcode", "idc", "ever_admitted", "currently_enrolled", "region", "province", "stratum"],
].copy()

In [18]:
# Load the household roster Stata file with pandas' default read options.
df10 = pd.read_stata(filepath_or_buffer="../00_source_data/1314_roster.dta")

In [19]:
# Cast hhcode to string and expose the raw roster columns s1aq04 / s1aq07
# under the readable names "sex" and "marital_status".
df10 = df10.assign(
    hhcode=df10["hhcode"].astype(str),
    sex=df10["s1aq04"],
    marital_status=df10["s1aq07"],
)

In [20]:
# Independent copy of the roster restricted to the join keys and demographics.
df10_rel = df10.loc[:, ["hhcode", "age", "idc", "sex", "marital_status"]].copy()

In [21]:
# Inner-join the education answers with the roster on household code + idc,
# so only rows present in both frames are kept.
years13_14 = df1_rel.merge(df10_rel, how="inner", on=["hhcode", "idc"])

In [22]:
# Label the region codes once (1 -> "urban", 2 -> "rural"); any other code is
# left unchanged. `replace` returns a new column instead of writing strings
# into an integer column row by row.
years13_14["region"] = years13_14["region"].replace({1: "urban", 2: "rural"})

# Keep individuals older than 4 and at most 10 years old.
# NOTE(review): an earlier comment here said "4 - 15"; the bounds below are the
# ones the code actually applies - confirm the intended age range.
# .copy() makes the filtered frame independent, so the column assignments
# below do not operate on a slice of the unfiltered frame.
age_in_range = (years13_14["age"] > 4) & (years13_14["age"] <= 10)
years13_14 = years13_14.loc[age_in_range].copy()

# The first four characters of the household code, stored as an integer.
years13_14["subprovince code"] = years13_14["hhcode"].str[:4].astype("int")

# Province codes -> names; codes not listed in the mapping become NaN.
years13_14["province"] = years13_14["province"].map(
    {1: "K.P.K", 2: "Punjab", 3: "Sindh", 4: "Balochistan"}
)

# Recode marital status labels to 1-5. The result is assigned back: calling
# replace(..., inplace=True) on a column selection is chained in-place mutation
# and is not guaranteed to update the parent frame.
# NOTE(review): " divorced" carries a leading space - presumably it matches the
# label stored in the .dta file; verify.
years13_14["marital_status"] = years13_14["marital_status"].replace(
    [
        "unmarried/never married",
        "currently married",
        "widow",
        " divorced",
        "nikkah has been solemnised but the rukhsati has not taken place code",
    ],
    [1, 2, 3, 4, 5],
)

# Year column holding the lower year of the 2013-14 survey round.
years13_14["year"] = 2013

# The numeric stratum becomes "code".
years13_14 = years13_14.rename(columns={"stratum": "code"})

In [23]:
# Read only the "stratum" and "code" columns of the stratum lookup table.
strat = pd.read_csv(
    filepath_or_buffer="../00_source_data/1314_stratum.csv",
    usecols=["stratum", "code"],
)

In [24]:
# Attach the stratum lookup via an inner join on "code"; rows whose code has
# no match in the lookup are dropped.
years13_14 = years13_14.merge(strat, how="inner", on=["code"])

In [25]:
# Remove the numeric "code" column. `columns=` is used because passing the axis
# positionally (`drop('code', 1)`) is no longer accepted by pandas 2.x.
years13_14 = years13_14.drop(columns="code")

In [26]:
# Rename the "stratum" column to "subprovince".
years13_14 = years13_14.rename(columns={"stratum": "subprovince"})

In [27]:
# Show one random row as a quick sanity check of the cleaned frame.
years13_14.sample(n=1)

Unnamed: 0,hhcode,idc,ever_admitted,currently_enrolled,region,province,age,sex,marital_status,subprovince code,year,subprovince
26710,3543200211.0,53,no,no,rural,Sindh,5,male,1,3543,2013,Karachi East


In [28]:
# PRE-PROCESSING FOR DIFF-IN-DIFF DATA
# Recode currently_enrolled from "yes"/"no" to 1/0. The result is assigned back
# (instead of replace(..., inplace=True) on the column selection, which is
# chained in-place mutation and may not update the frame) and cast to int so
# the column is numeric for the sum below. Re-running this cell is harmless:
# values that are already 1/0 pass through unchanged.
years13_14["currently_enrolled"] = (
    years13_14["currently_enrolled"].replace({"yes": 1, "no": 0}).astype(int)
)

group_keys = ["sex", "subprovince", "region"]

# Number of sampled individuals per sex / subprovince / region group.
df_grp_1 = years13_14.groupby(group_keys)["currently_enrolled"].count().reset_index()
# Number of currently enrolled individuals per group (sum of the 1/0 flag).
df_grp_2 = years13_14.groupby(group_keys)["currently_enrolled"].sum().reset_index()
# Put counts (suffix _x) and enrolled totals (suffix _y) side by side;
# indicator=True adds a "_merge" column recording where each row came from.
df_grp_merge = pd.merge(df_grp_1, df_grp_2, on=group_keys, indicator=True)
# Every row should be reported as "both".
df_grp_merge["_merge"].value_counts()

both          472
right_only      0
left_only       0
Name: _merge, dtype: int64

In [29]:
# Enrollment rate = enrolled total (_y) divided by group size (_x); afterwards
# give the suffixed columns descriptive names and discard the merge indicator.
df_grp_merge = (
    df_grp_merge.assign(
        rate_enrollment=lambda grp: grp["currently_enrolled_y"] / grp["currently_enrolled_x"]
    )
    .rename(
        columns={
            "currently_enrolled_x": "sample_population",
            "currently_enrolled_y": "enrolled_total",
        }
    )
    .drop(columns=["_merge"])
)
df_grp_merge.sample(n=5)

Unnamed: 0,sex,subprovince,region,sample_population,enrolled_total,rate_enrollment
90,male,Kashmore,rural,65,48.0,0.738462
70,male,Jehlum,rural,56,47.0,0.839286
84,male,Karachi South,rural,71,65.0,0.915493
233,male,Zhob,urban,316,145.0,0.458861
86,male,Karachi West,rural,76,55.0,0.723684


In [30]:
# Write the aggregated difference-in-difference table to the analysis folder
# (the row index is written as the first CSV column).
df_grp_merge.to_csv(
    path_or_buf="../20_analysis/enrollment_clean/years13_14_merge_diff.csv"
)

In [31]:
# TODO: run df_grp_merge.isna().any() to check for missing values before submitting the CSV

In [32]:
# Persist the cleaned individual-level frame (row index included as the first column).
years13_14.to_csv(path_or_buf="clean_rashaad/years13_14.csv")