## Individual Level Data Set for EDA and Assumptions Testing

### 1. Ingest Data

In [1]:
# INGEST, PRE-PROCESS, CONCATENATE AND CARRY OUT EDA FOR INDIVIDUAL LEVEL DATA
# 1. INGEST DATA

# import modules and packages
import pandas as pd
import numpy as np
import altair as alt
from scipy import stats
import statsmodels.api as sm
import scipy.stats
from matplotlib import pyplot as plt
import seaborn as sns

# ingest data sets
y04 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_clarissa/years04_05_.csv")
y05 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_clarissa/years05_06_.csv")
y06 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_clarissa/years06_07_.csv")
y07 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_john/years07_08_f.csv")
y08 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_john/years08_09_f.csv")
y10 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_john/years10_11_f.csv")
y11 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_john/years11_12_f.csv")
y12 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_rashaad/years12_13.csv")
y13 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_rashaad/years13_14.csv")
y14 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_rashaad/years14_15.csv")
y15 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_preet/years15_16_.csv")
y18 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_preet/years18_19_.csv")
y19 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_preet/years19_20_.csv")

### 2. Clean Data

In [2]:
# 2. CLEAN DATA
# 2004 - data cleaning for consistency
# drop unecessary columns
y04 = y04.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y04["hhcode"] = y04["hhcode"].astype("str")
y04['hhcode']= y04['hhcode'].apply(lambda x: x[0:-2])
# make sure first letter in province is capital
y04['province']= y04['province'].str.title()
# reindex columns for consistency across all data sets
y04i = y04.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y04i.sample(5)


Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
10864,2041000409,12,5,male,1,yes,yes,urban,2041,Sindh,Mir Pur Khas,2004
13415,2102002514,13,4,male,1,yes,yes,rural,2102,Sindh,Hyderabad,2004
12101,2072001715,14,7,male,1,yes,yes,rural,2072,Sindh,Shikarpur,2004
23487,4112001711,12,3,male,1,yes,yes,rural,4112,Balochistan,Khuzdar,2004
8377,1292001813,5,7,female,1,no,no,rural,1292,Punjab,Rajaanpur,2004


In [3]:
# 2005 - data cleaning for consistency
# drop unecessary columns
y05 = y05.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y05["hhcode"] = y05["hhcode"].astype("str")
y05['hhcode']= y05['hhcode'].apply(lambda x: x[0:-2])
# reindex columns for consistency across all data sets
y05i = y05.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y05i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
22788,2152040104,8,52,female,1,yes,yes,rural,2152,Sindh,Tharparkar,2005
37396,4071240210,7,53,male,1,yes,yes,urban,4071,Balochistan,Quetta,2005
11443,1262020103,15,52,female,1,yes,no,rural,1262,Punjab,Khanewal,2005
9530,1202020213,14,3,male,1,yes,no,rural,1202,Punjab,Kasur,2005
11315,1252030202,8,53,female,1,no,no,rural,1252,Punjab,Multan,2005


In [4]:
# 2006 - data cleaning for consistency
# drop unecessary columns
y06 = y06.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y06["hhcode"] = y06["hhcode"].astype("str")
y06['hhcode']= y06['hhcode'].apply(lambda x: x[0:-2])
# convert integer values of region variable to string labels
y06.loc[y06["region"]==1, "region"] = 'urban'
y06.loc[y06["region"]==2, "region"] = 'rural'
# convert "integer-like" values of educational enrollment status to string labels
y06.loc[y06["currently_enrolled"]=="1.0", "currently_enrolled"] = 'yes'
y06.loc[y06["currently_enrolled"]=="2.0", "currently_enrolled"] = 'no' 
# convert integer values of ever admitted variable to string labels
y06.loc[y06["ever_admitted"]==1, "ever_admitted"] = 'yes'
y06.loc[y06["ever_admitted"]==2, "ever_admitted"] = 'no'
# reindex columns
y06i = y06.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y06i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
110321,3142002107,11,5,male,1,yes,yes,rural,3142,NWFP,D.I.Khan,2005
99205,3072000508,12,4,male,1,yes,yes,rural,3072,NWFP,Bonair,2005
49949,1322002206,9,5,male,1,no,no,rural,1322,Punjab,Bahawalpur,2005
130349,4102000914,14,5,female,1,yes,yes,rural,4102,Balochistan,Mastung,2005
46367,1302002008,9,6,female,1,yes,yes,rural,1302,Punjab,Layyah,2005


In [5]:
# 2007 - data cleaning for consistency
# drop unecessary columns
y07 = y07.drop(["Unnamed: 0"], axis=1)
# convert integer value of region variable to string label
y07.loc[y07["region"]=="3", "region"] = 'urban'
# reindex columns
y07i = y07.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y07i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
82427,1292020210,7,54,male,1,no,no,rural,1292,Punjab,D.G.Khan,2007
273021,4052030214,6,54,female,1,no,no,rural,4052,Balochistan,Zhob (Div),2007
94932,1352030110,14,1,male,1,no,no,rural,1352,Punjab,R.Y.Khan,2007
211165,3142020106,5,55,male,1,yes,yes,rural,3142,KPK,Tank,2007
214506,3152040205,4,52,female,1,yes,yes,rural,3152,KPK,D.I.Khan,2007


In [6]:
# 2008 - data cleaning for consistency
# drop unecessary columns
y08 = y08.drop(["Unnamed: 0"], axis=1)
# reindex columns
y08i = y08.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y08i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
10133,1061000502,6,8,female,1,yes,,urban,1061,Punjab,Bhakhar,2008
39652,1181207406,7,5,male,1,yes,,urban,1181,Punjab,Lahore,2008
22235,1111001810,10,7,male,1,yes,,urban,1111,Punjab,T.T. Singh,2008
75870,2012002007,12,5,female,1,no,,rural,2012,Sindh,Khairpur,2008
37570,1181201711,4,6,male,1,yes,,urban,1181,Punjab,Lahore,2008


In [7]:
# 2010 - data cleaning for consistency
# drop unecessary columns
y10 = y10.drop(["Unnamed: 0"], axis=1)
y10i = y10.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y10i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
156481,32320000210,6,6,male,1,yes,yes,rural,3232,KPK,Mardan,2010
92677,21110000109,12,3,female,1,no,no,urban,2111,Sindh,Dadu,2010
64796,13220000112,6,8,male,1,no,no,rural,1322,Punjab,Bahawalnagar,2010
15993,10912003311,6,3,male,1,yes,yes,urban,1091,Punjab,Faisalabad,2010
31047,11710001010,15,3,female,1,yes,yes,urban,1171,Punjab,Mandi Bahuddin,2010


In [8]:
# 2011 - data cleaning for consistency
# drop unecessary columns
y11 = y11.drop(["Unnamed: 0"], axis=1)
y11i = y11.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y11i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
5447,1131120111,10,3,male,1,yes,yes,urban,1131,Punjab,Gujaranwala,2011
28220,3182030216,14,3,male,1,yes,yes,rural,3182,KPK,Abbottabad,2011
20167,2172040209,5,54,male,1,yes,yes,rural,2172,Sindh,Badin,2011
25417,3081210104,13,3,male,1,yes,yes,urban,3081,KPK,Peshawar,2011
31911,4051040303,6,57,female,1,no,no,urban,4051,Balochistan,Zhob,2011


In [9]:
# 2012 data cleaning for consistency
# drop unnecessary columns
y12 = y12.drop(["Unnamed: 0"], axis=1)
# convert hhcode to non-decimal string
y12["hhcode"] = y12["hhcode"].astype("str")
y12['hhcode']= y12['hhcode'].apply(lambda x: x[0:-2])
# reindex columns
y12i = y12.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y12i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
10556,1421000103,11,7,female,1,yes,yes,urban,1421,K.P.K,Karak,2012
147647,4622000708,5,7,female,1,no,no,rural,4622,Balochistan,Qilla saifullah,2012
105752,3412000403,6,7,male,1,no,no,rural,3412,Sindh,Mir pur khas,2012
66030,2622001409,6,6,male,1,no,no,rural,2622,Punjab,Lodhrean,2012
67279,2631202410,13,5,male,1,yes,yes,urban,2631,Punjab,Multan,2012


In [10]:
# data cleaning for consistency
# drop unnecessary columns
y13 = y13.drop(["Unnamed: 0"], axis=1)
# convert hhcode to non-decimal string
y13["hhcode"] = y13["hhcode"].astype("str")
y13['hhcode']= y13['hhcode'].apply(lambda x: x[0:-2])
# rename stratum to subprovince for consistency
y13.rename(columns={"stratum":"subprovince"}, inplace=True)
# reindex columns
y13i = y13.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y13i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
23486,2513200109,13,53,female,1,no,no,rural,2513,Punjab,Kasur,2013
38770,3251100210,9,55,female,1,no,no,urban,3251,Sindh,Nawabshah,2013
29330,2832200204,11,53,female,1,yes,yes,rural,2832,Punjab,Rajanpur,2013
30204,2831100111,13,3,male,1,yes,yes,urban,2831,Punjab,Layyah,2013
27033,2732200205,15,53,female,1,yes,yes,rural,2732,Punjab,Khanewal,2013


In [11]:
# data cleaning for consistency
# drop unnecessary columns
y14 = y14.drop(["Unnamed: 0"], axis=1)
# convert hhcode to non-decimal string
y14["hhcode"] = y14["hhcode"].astype("str")
y14['hhcode']= y14['hhcode'].apply(lambda x: x[0:-2])
# reindex columns
y14i = y14.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y14i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
133807,3441003608,8,5,male,1,yes,yes,urban,3441,Sindh,Tharparkar,2014
6543,1151002415,7,6,female,1,no,no,urban,1151,K.P.K,Shangla,2014
62707,2542000803,11,5,male,1,yes,yes,rural,2542,Punjab,Nankana sahib,2014
80085,2831000113,14,6,female,1,yes,no,urban,2831,Punjab,Layyah,2014
20382,1321002216,13,2,female,1,yes,yes,urban,1321,K.P.K,Swabi,2014


In [12]:
# data cleaning for consistency
# drop unnecessary columns
y15 = y15.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y15["hhcode"] = y15["hhcode"].astype("str")
y15['hhcode']= y15['hhcode'].apply(lambda x: x[0:-2])
# rename Year for consistency
y15.rename(columns={"Year":"year"}, inplace=True)
# reindex columns
y15i = y15.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y15i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
97331,4501012507,14,53,female,1,no,no,rural,4501,Balochistan,Kalat,2015
18216,1402021012,10,52,female,1,yes,yes,urban,1402,KPK,Peshawar,2015
3624,1102022210,8,54,male,1,yes,yes,urban,1102,KPK,Bonair,2015
44824,2602020703,7,52,female,1,yes,yes,urban,2602,Punjab,Pakpatten,2015
41896,2502030401,7,53,male,1,yes,yes,urban,2502,Punjab,Sheikhupura,2015


In [13]:
# data cleaning for consistency
# drop unnecessary columns
y18 = y18.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y18["hhcode"] = y18["hhcode"].astype("str")
y18['hhcode']= y18['hhcode'].apply(lambda x: x[0:-2])
# rename Year for consistency
y18.rename(columns={"Year":"year"}, inplace=True)
y18i = y18.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y18i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
4060,1131100806,5,53,female,1,yes,yes,rural,1131,KPK,Lower Dir,2018
50737,3202101007,5,52,male,1,no,no,urban,3202,Sindh,Ghotki,2018
61292,3502107502,12,4,male,1,yes,yes,urban,3502,Sindh,Karachi South,2018
71252,3502404601,6,54,female,1,yes,yes,urban,3502,Sindh,Karachi South,2018
53698,3302104311,8,52,male,1,yes,yes,urban,3302,Sindh,Tando Muhammad Khan,2018


In [14]:
# data cleaning for consistency
# drop unnecessary columns
y19 = y19.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y19["hhcode"] = y19["hhcode"].astype("str")
y19['hhcode']= y19['hhcode'].apply(lambda x: x[0:-2])
# rename Year for consistency
y19.rename(columns={"Year":"year"}, inplace=True)
y19i = y19.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y19i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
890,1091010,4,6,male,1,no,no,rural,1091,KPK,Hangu,2019
119755,1261015,15,4,female,1,no,no,rural,1261,KPK,Shangla,2019
266480,4231004,7,10,male,1,no,no,rural,4231,Balochistan,Pishin,2019
35080,2241065,5,6,male,1,no,no,rural,2241,Punjab,Muzaffar Garh,2019
152832,2111025,4,8,male,1,no,no,rural,2111,Punjab,Hafizabad,2019


### 3. Concatenate Individual Data to One Data Set

In [15]:
# 3. CONCATENATE INDIVIDUAL DATA TO ONE DATA SET
# concatenate data sets
df = pd.concat([y04i,y05i,y06i,y07i,y08i,y10i,y11i,y12i,y13i,y14i,y15i,y18i,y19i],axis=0)

# correct mispelt values of province columns
df["province"] = df["province"].replace('Nwfp', "KPK")
df["province"] = df["province"].replace('NWFP', "KPK")
df["province"] = df["province"].replace('NWFP ', "KPK")
df["province"] = df["province"].replace('NWFP ', "NWFP")
df["province"] = df["province"].replace('Punjab ', "Punjab")
df["province"] = df["province"].replace('K.P.K', "KPK")
df["province"] = df["province"].replace('Islamabad', "Punjab")

# preview a sample of the data
df.sample(5)


Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
9178,1202011109,9,52,male,1,yes,yes,urban,1202,KPK,Tor Garh,2015
158440,32420001701,11,6,male,1,yes,yes,rural,3242,KPK,Swabi,2010
198236,3082020104,5,53,female,1,yes,yes,rural,3082,KPK,Charsada,2007
43186,2702200802,12,2,male,1,yes,yes,urban,2702,Punjab,Vehari,2018
112423,3162002706,14,2,female,1,yes,no,rural,3162,KPK,Manshera,2005


In [16]:
# check data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1761674 entries, 0 to 280899
Data columns (total 12 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   hhcode              object
 1   age                 int64 
 2   idc                 int64 
 3   sex                 object
 4   marital_status      int64 
 5   ever_admitted       object
 6   currently_enrolled  object
 7   region              object
 8   subprovince code    int64 
 9   province            object
 10  subprovince         object
 11  year                int64 
dtypes: int64(5), object(7)
memory usage: 174.7+ MB


In [17]:
# check the distribution of numeric variables
df.describe()

Unnamed: 0,age,idc,marital_status,subprovince code,year
count,1761674.0,1761674.0,1761674.0,1761674.0,1761674.0
mean,9.31388,18.09143,1.001784,2497.202,2011.486
std,3.36419,21.70291,0.07355944,1073.295,4.740053
min,4.0,1.0,1.0,1001.0,2004.0
25%,6.0,4.0,1.0,1332.0,2007.0
50%,9.0,6.0,1.0,2302.0,2011.0
75%,12.0,52.0,1.0,3212.0,2015.0
max,15.0,93.0,5.0,6114.0,2019.0


### 4. Preliminary EDA

In [18]:
# 4. PRELIMINARY EDA
# extract the number of observations in the data set
f'There are {df.shape[0]} observations in the data set.'

'There are 1761674 observations in the data set.'

In [19]:
# check value counts of sex
df.sex.value_counts(normalize=True).reset_index()

Unnamed: 0,index,sex
0,male,0.527689
1,female,0.472311


In [20]:
# check value counts of educational enrollment
df.currently_enrolled.value_counts(normalize=True).reset_index()

Unnamed: 0,index,currently_enrolled
0,yes,0.581142
1,no,0.418858


In [21]:
# check value counts of ever admitted variable
df.ever_admitted.value_counts().reset_index()

Unnamed: 0,index,ever_admitted
0,yes,1105772
1,no,655882


In [22]:
# check value counts of region variable
df.region.value_counts().reset_index()

Unnamed: 0,index,region
0,rural,1009639
1,urban,752035


In [23]:
df.marital_status.value_counts().reset_index()

Unnamed: 0,index,marital_status
0,1,1760188
1,2,877
2,5,495
3,3,57
4,4,57


In [24]:
# EDA FOR OVERALL DATA SET
# Pre-process Data for EDA
# convert string categorical variables to integer labels
df_1 = df.copy()
# convert sex
df_1.loc[df_1["sex"] == "male", "sex"] = 0
df_1.loc[df_1["sex"] == "female", "sex"] = 1
# convert educational enrollment
df_1.loc[df_1["currently_enrolled"] == "no", "currently_enrolled"] = 0
df_1.loc[df_1["currently_enrolled"] == "yes", "currently_enrolled"] = 1
# convert ever admitted
df_1.loc[df_1["ever_admitted"] == "no", "ever_admitted"] = 0
df_1.loc[df_1["ever_admitted"] == "yes", "ever_admitted"] = 1
# convert region
df_1.loc[df_1["region"] == "rural", "region"] = 0
df_1.loc[df_1["region"] == "urban", "region"] = 1

# preview data
df_1.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
117438,3222001605,4,8,0,1,0,0.0,0,3222,KPK,LakkiMarwat,2005
129315,1321034,8,4,1,1,0,0.0,0,1321,KPK,Upper Dir,2019
27760,1132001001,8,6,1,1,1,,0,1132,Punjab,Gujrat,2008
60662,13020002813,6,7,0,1,1,1.0,0,1302,Punjab,Muraffar Garh,2010
9905,1091001407,6,5,1,1,1,1.0,1,1091,Punjab,Faisalabad,2005


### 5. Final EDA (treatment variable --> sex)

In [79]:
df_04 = df_1[df_1.year == 2019]

In [80]:
# 5. FINAL EDA
# For Difference-in-Difference Analysis, 
# check for balance across the treatment arm, ie. sex, 
# for age, ever_admitted and region
#age_mean = []
#ever_admitted_mean = []
#region_mean = []
for i in ["age", "ever_admitted", "region"]:
    female = df_04.loc[df_04.sex == 1, i].mean()
    male = df_04.loc[df_04.sex == 0, i].mean()
    pvalue = stats.ttest_ind(df_04.loc[df_04.sex == 1, i].values,df_04.loc[df_04.sex == 0, i].values,
    ).pvalue
    print(f"For {i}, the mean for females in the survey is {female:.3f},")
    print(f"the mean for males in the survey is {male:.3f},")
    print(f"and the p-value for this difference is {pvalue:.3f}")
    print("\n")

For age, the mean for females in the survey is 9.026,
the mean for males in the survey is 9.075,
and the p-value for this difference is 0.000


For ever_admitted, the mean for females in the survey is 0.617,
the mean for males in the survey is 0.718,
and the p-value for this difference is 0.000


For region, the mean for females in the survey is 0.264,
the mean for males in the survey is 0.260,
and the p-value for this difference is 0.010




In [27]:
f"We see that age, whether a student has been admitted in an educational institution, and region are statistically significantly different across both male and \
females for a majority of the years in the data set. This would invalidate causal inference analysis on the on enrollment between men and women. We could match men and women \
for years where there are baseline difference but we would be reducing the statistical power of our analysis. So we decided to investigate the causal impact of the taliban attacks \
on women in rural areas controlled by the taliban compared to women in rural areas not controlled by the taliban."


'We see that age, whether a student has been admitted in an educational institution, and region are statistically significantly different across both male and females for a majority of the years in the data set. This would invalidate causal inference analysis on the on enrollment between men and women. We could match men and women for years where there are baseline difference but we would be reducing the statistical power of our analysis. So we decided to investigate the causal impact of the taliban attacks on women in rural areas controlled by the taliban compared to women in rural areas not controlled by the taliban.'

In [28]:
# crosstab of treatment variable versus response variable
pd.crosstab(df["currently_enrolled"],df["sex"],margins=True)

sex,female,male,All
currently_enrolled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,355907,302757,658664
yes,387815,526043,913858
All,743722,828800,1572522


In [29]:
# normalization for all variables have been done by rows so fractions
# are in terms of 
# normalized crosstab of treatment variable versus response variable
pd.crosstab(df["currently_enrolled"],df["sex"],margins=True,normalize="index")

sex,female,male
currently_enrolled,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.540347,0.459653
yes,0.424371,0.575629
All,0.472949,0.527051


In [30]:
# crosstab of treatment variable versus region
pd.crosstab(df["region"],df["sex"],margins=True)

sex,female,male,All
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rural,472793,536846,1009639
urban,359265,392770,752035
All,832058,929616,1761674


In [31]:
# normalized crosstab of treatment variable versus region
pd.crosstab(df["region"],df["sex"],margins=True,normalize="index")

sex,female,male
region,Unnamed: 1_level_1,Unnamed: 2_level_1
rural,0.468279,0.531721
urban,0.477724,0.522276
All,0.472311,0.527689


In [32]:
# crosstab of treatment variable versus ever_admitted
pd.crosstab(df["ever_admitted"],df["sex"],margins=True)

sex,female,male,All
ever_admitted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,355778,300104,655882
yes,476269,629503,1105772
All,832047,929607,1761654


In [33]:
df_1.groupby(["year","sex"])["currently_enrolled"].mean().reset_index()

Unnamed: 0,year,sex,currently_enrolled
0,2004,0,0.654774
1,2004,1,0.465389
2,2005,0,0.649012
3,2005,1,0.489086
4,2007,0,0.33495
5,2007,1,0.326438
6,2008,0,
7,2008,1,
8,2010,0,0.695205
9,2010,1,0.562928


In [34]:
# normalized crosstab of treatment variable versus ever_admitted
pd.crosstab(df["ever_admitted"],df["sex"],margins=True,normalize="index")

sex,female,male
ever_admitted,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.542442,0.457558
yes,0.430712,0.569288
All,0.47231,0.52769


In [35]:
# crosstab of treatment variable versus marital status
pd.crosstab(df["marital_status"],df["region"],margins=True)

region,rural,urban,All
marital_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1008610,751578,1760188
2,621,256,877
3,26,31,57
4,36,21,57
5,346,149,495
All,1009639,752035,1761674


In [81]:
# normalized crosstab of treatment variable versus marital status
# these values vary so much that we shall not test for whether the 
# difference is statistically different
pd.crosstab(df["marital_status"],df["region"],margins=True,normalize="columns")

region,rural,urban,All
marital_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.998981,0.999392,0.999156
2,0.000615,0.00034,0.000498
3,2.6e-05,4.1e-05,3.2e-05
4,3.6e-05,2.8e-05,3.2e-05
5,0.000343,0.000198,0.000281


In [37]:
# crosstab of treatment variable province
pd.crosstab(df["province"],df["sex"],margins=True)

sex,female,male,All
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Balochistan,134927,165442,300369
KPK,176611,195168,371779
Punjab,332783,353984,686767
Sindh,187737,215022,402759
All,832058,929616,1761674


In [38]:
# normalized crosstab of treatment variable province
pd.crosstab(df["province"],df["sex"],margins=True,normalize="index")

sex,female,male
province,Unnamed: 1_level_1,Unnamed: 2_level_1
Balochistan,0.449204,0.550796
KPK,0.475043,0.524957
Punjab,0.484565,0.515435
Sindh,0.466127,0.533873
All,0.472311,0.527689


In [39]:
# crosstab of treatment variable year
pd.crosstab(df["year"],df["sex"],margins=True)

sex,female,male,All
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004,11947,13510,25457
2005,85535,96385,181920
2007,139218,148276,287494
2008,88336,100816,189152
2010,89285,101875,191160
2011,16443,17751,34194
2012,71154,80503,151657
2013,32732,36446,69178
2014,75855,86977,162832
2015,48803,53106,101909


In [40]:
# normalized crosstab of treatment variable year
pd.crosstab(df["year"],df["sex"],margins=True,normalize="index")

sex,female,male
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2004,0.469301,0.530699
2005,0.470179,0.529821
2007,0.484247,0.515753
2008,0.467011,0.532989
2010,0.467069,0.532931
2011,0.480874,0.519126
2012,0.469177,0.530823
2013,0.473156,0.526844
2014,0.465848,0.534152
2015,0.478888,0.521112


### 6. Final EDA (treatment variable --> taliban controlled areas versus non-taliban controlled areas in rural areas)

In [41]:
# ### 6. FINAL EDA (TREATMENT VARIABLE --> taliban controlled areas versus non-taliban controlled in rural areas)
# a. Pre-processing of subprovince name ---> Ensure Consistency of Subprovince Names
# Replaces:
df_1["subprovince"] = df_1["subprovince"].replace("Ättock", "Attock")
df_1["subprovince"] = df_1["subprovince"].replace("Abbottabad", "Abbotabad")
df_1["subprovince"] = df_1["subprovince"].replace("Bahawalnagar", "Bahawal Nagar")
df_1["subprovince"] = df_1["subprovince"].replace("Bahawalnager", "Bahawal Nagar")

df_1["subprovince"] = df_1["subprovince"].replace("Bhakhar", "Bhakkar")
df_1["subprovince"] = df_1["subprovince"].replace("Bhakar", "Bhakkar")
df_1["subprovince"] = df_1["subprovince"].replace("Baddin", "Badin")

df_1["subprovince"] = df_1["subprovince"].replace("Barkhen", "Barkhan")

df_1["subprovince"] = df_1["subprovince"].replace("Bhawalpur", "Bahawalpur")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/ Kachhi", "Bolan/Kachhi")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/Kachhi", "Bolan/Kachhi")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/ kachhi", "Bolan/Kachhi")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/Kachni", "Bolan/Kachhi")

df_1["subprovince"] = df_1["subprovince"].replace("Bonair", "Buner")
df_1["subprovince"] = df_1["subprovince"].replace("Bunair", "Buner")


df_1["subprovince"] = df_1["subprovince"].replace("Chaghi", "Chagai")
df_1["subprovince"] = df_1["subprovince"].replace("Chaghai", "Chagai")
df_1["subprovince"] = df_1["subprovince"].replace("Chaghi", "Chagai")
df_1["subprovince"] = df_1["subprovince"].replace("Charsada", "Charsadda")
df_1["subprovince"] = df_1["subprovince"].replace("D. G. Khan", "D.G. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("Dera Ghazi Khan", "D.G. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D. g. khan", "D.G. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D.G.Khan", "D.G. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D.g khan", "D.G. Khan")


df_1["subprovince"] = df_1["subprovince"].replace("D.I.Khan", "D.I. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D. i. khan", "D.I. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D. I. Khan", "D.I. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("Dera Ismail Khan", "D.I. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D.i.khan", "D.I. Khan")

df_1["subprovince"] = df_1["subprovince"].replace("Dera bugti", "Dera Bugti")
df_1["subprovince"] = df_1["subprovince"].replace("Deara Bughti", "Dera Bugti")
df_1["subprovince"] = df_1["subprovince"].replace("Dera Bughti", "Dera Bugti")

df_1["subprovince"] = df_1["subprovince"].replace("Gujranwala", "Gujaranwala")
df_1["subprovince"] = df_1["subprovince"].replace("Gawadar", "Gwadar")

df_1["subprovince"] = df_1["subprovince"].replace("Haifzabad", "Hafizabad")
df_1["subprovince"] = df_1["subprovince"].replace("Hafaizabad", "Hafizabad")
df_1["subprovince"] = df_1["subprovince"].replace("Hzara", "Hazara")
df_1["subprovince"] = df_1["subprovince"].replace("Pak Pattain", "Pakpattan")
df_1["subprovince"] = df_1["subprovince"].replace("Pakpaten", "Pakpattan")
df_1["subprovince"] = df_1["subprovince"].replace("Pakpatan", "Pakpattan")
df_1["subprovince"] = df_1["subprovince"].replace("Pakpatten", "Pakpattan")

df_1["subprovince"] = df_1["subprovince"].replace("pishine", "Pishin")
df_1["subprovince"] = df_1["subprovince"].replace("Pishine", "Pishin")
df_1["subprovince"] = df_1["subprovince"].replace("Pashin", "Pishin")
df_1["subprovince"] = df_1["subprovince"].replace("Pershawar", "Peshawar")
df_1["subprovince"] = df_1["subprovince"].replace("Qillah Abdullah", "Qilla Abdullah")
df_1["subprovince"] = df_1["subprovince"].replace("QillahAbdullah", "Qilla Abdullah")
df_1["subprovince"] = df_1["subprovince"].replace("Killa Abdullah", "Qilla Abdullah")
df_1["subprovince"] = df_1["subprovince"].replace("Qilla abdullah", "Qilla Abdullah")

df_1["subprovince"] = df_1["subprovince"].replace("Qillah Saifullah", "Qilla Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("Qillah Salifullah", "Qilla Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("QillahSaifullah", "Qilla Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("Killa Saifullah", "Qilla Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("Qilla saifullah", "Qilla Saifullah")

df_1["subprovince"] = df_1["subprovince"].replace("Quetta (Div)", "Quetta")
df_1["subprovince"] = df_1["subprovince"].replace("R.Y.Khan", "Rahim Yar Khan")
df_1["subprovince"] = df_1["subprovince"].replace("Rahim yar khan", "Rahim Yar Khan")
df_1["subprovince"] = df_1["subprovince"].replace("Rajaanpur", "Rajanpur")
df_1["subprovince"] = df_1["subprovince"].replace(
    "Shaheed Banazir Abad", "Shaheed Benazir Abad"
)
df_1["subprovince"] = df_1["subprovince"].replace(
    "Shaheed Benazirabad", "Shaheed Benazir Abad"
)
df_1["subprovince"] = df_1["subprovince"].replace(
    "Shaheed benazir abad", "Shaheed Benazir Abad"
)
df_1["subprovince"] = df_1["subprovince"].replace("Sheerani", "Sherani")
df_1["subprovince"] = df_1["subprovince"].replace("Sibbi (Div)", "Sibbi")
df_1["subprovince"] = df_1["subprovince"].replace("Sibi", "Sibbi")
df_1["subprovince"] = df_1["subprovince"].replace("Sijawal", "Sujawal")
df_1["subprovince"] = df_1["subprovince"].replace("Sukkar", "Sukkur")
df_1["subprovince"] = df_1["subprovince"].replace("T.T. Singh", "Toba Tek Singh")
df_1["subprovince"] = df_1["subprovince"].replace("T.T.Singh", "Toba Tek Singh")
df_1["subprovince"] = df_1["subprovince"].replace("T.t. singh", "Toba Tek Singh")
df_1["subprovince"] = df_1["subprovince"].replace("T.t singh", "Toba Tek Singh")
df_1["subprovince"] = df_1["subprovince"].replace("Sarghodha", "Sargodha")
df_1["subprovince"] = df_1["subprovince"].replace("Sheani", "Sherani")
df_1["subprovince"] = df_1["subprovince"].replace("Shikarpur", "Shiokarpur")

df_1["subprovince"] = df_1["subprovince"].replace("Tando Muhammad", "Tando Muhammad Khan")
df_1["subprovince"] = df_1["subprovince"].replace("Tando Muhd Khan", "Tando Muhammad Khan")
df_1["subprovince"] = df_1["subprovince"].replace(
    "Tando mohammad khan", "Tando Muhammad Khan"
)
df_1["subprovince"] = df_1["subprovince"].replace("Tando mohd khan", "Tando Muhammad Khan")

df_1["subprovince"] = df_1["subprovince"].replace("TandoAllah Yar", "Tando Allahyar")
df_1["subprovince"] = df_1["subprovince"].replace("Tando Allah Yar", "Tando Allahyar")
df_1["subprovince"] = df_1["subprovince"].replace("Tando allah yar", "Tando Allahyar")
df_1["subprovince"] = df_1["subprovince"].replace("Tor Garh", "Torghar")
df_1["subprovince"] = df_1["subprovince"].replace("Tor ghar", "Torghar")
df_1["subprovince"] = df_1["subprovince"].replace("Torgarh", "Torghar")

df_1["subprovince"] = df_1["subprovince"].replace("Umer kot", "Umer Kot")
df_1["subprovince"] = df_1["subprovince"].replace("Ümer kot", "Umer Kot")
df_1["subprovince"] = df_1["subprovince"].replace("Upper dir", "Upper Dir")
df_1["subprovince"] = df_1["subprovince"].replace("UpperDir", "Upper Dir")
df_1["subprovince"] = df_1["subprovince"].replace("Zhob (Div)", "Zhob")



In [42]:
# more pre-processing of subprovince names
df_1["subprovince"] = df_1["subprovince"].replace("Jaccobabad", "Jacobabad")
df_1["subprovince"] = df_1["subprovince"].replace("Jaffarabad", "Jafarabad")
df_1["subprovince"] = df_1["subprovince"].replace("Jafrabad", "Jafarabad")
df_1["subprovince"] = df_1["subprovince"].replace("Jhal magsi", "Jhal Magsi")
df_1["subprovince"] = df_1["subprovince"].replace("JhalMagsi", "Jhal Magsi")
df_1["subprovince"] = df_1["subprovince"].replace("Jehlum", "Jhelum")

df_1["subprovince"] = df_1["subprovince"].replace("Kachhi/ Bolan", "Kachhi")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/kachhi", "Kachhi")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/Kachhi", "Kachhi")

df_1["subprovince"] = df_1["subprovince"].replace("Kalat (Div)", "Kalat")
df_1["subprovince"] = df_1["subprovince"].replace("Karachi Central", "Karachi")
df_1["subprovince"] = df_1["subprovince"].replace("Karachi East", "Karachi")
df_1["subprovince"] = df_1["subprovince"].replace("Karachi Malir", "Karachi")
df_1["subprovince"] = df_1["subprovince"].replace("Karachi South", "Karachi")
df_1["subprovince"] = df_1["subprovince"].replace("Karachi West", "Karachi")
df_1["subprovince"] = df_1["subprovince"].replace("Kashmore", "Kashmor")
df_1["subprovince"] = df_1["subprovince"].replace("Kech", "Kech/Turbat")
df_1["subprovince"] = df_1["subprovince"].replace("Ketch/Turbat", "Kech/Turbat")
df_1["subprovince"] = df_1["subprovince"].replace("Keych/turbat", "Kech/Turbat")

df_1["subprovince"] = df_1["subprovince"].replace("Killa abdullah", "Qilla Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("Killa saifullah", "Killa Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("Lakki marwat", "Lakki Marwat")
df_1["subprovince"] = df_1["subprovince"].replace("LakkiMarwat", "Lakki Marwat")
df_1["subprovince"] = df_1["subprovince"].replace("Lasbella", "Lasbela")
df_1["subprovince"] = df_1["subprovince"].replace("Lasbilla", "Lasbela")
df_1["subprovince"] = df_1["subprovince"].replace("Layyah", "Layya")
df_1["subprovince"] = df_1["subprovince"].replace("Lodhrean", "Lodhran")

df_1["subprovince"] = df_1["subprovince"].replace("Lower dir", "Lower Dir")
df_1["subprovince"] = df_1["subprovince"].replace("LowerDir", "Lower Dir")
df_1["subprovince"] = df_1["subprovince"].replace("Makran (Div)", "Makran")
df_1["subprovince"] = df_1["subprovince"].replace("Malakand Protected", "Malakand")
df_1["subprovince"] = df_1["subprovince"].replace("Malakand Protected Area", "Malakand")
df_1["subprovince"] = df_1["subprovince"].replace("Mandi Bahuddin", "Mandi Bahauddin")
df_1["subprovince"] = df_1["subprovince"].replace("Mandi bahauddin", "Mandi Bahauddin")
df_1["subprovince"] = df_1["subprovince"].replace("Manshera", "Mansehra")
df_1["subprovince"] = df_1["subprovince"].replace("Mir pur Khas", "Mir Pur Khas")
df_1["subprovince"] = df_1["subprovince"].replace("MirPurKhas", "Mir Pur Khas")
df_1["subprovince"] = df_1["subprovince"].replace("Mirpur Khas", "Mir Pur Khas")
df_1["subprovince"] = df_1["subprovince"].replace("Mirpur khas", "Mir Pur Khas")
df_1["subprovince"] = df_1["subprovince"].replace("Mir pur khas", "Mir Pur Khas")
df_1["subprovince"] = df_1["subprovince"].replace("M ianwali", "Mianwali")
df_1["subprovince"] = df_1["subprovince"].replace("Mitiari", "Matiari")

df_1["subprovince"] = df_1["subprovince"].replace("MusaKhel", "Musa Khel")
df_1["subprovince"] = df_1["subprovince"].replace("Musa khel", "Musa Khel")
df_1["subprovince"] = df_1["subprovince"].replace("Musa", "Musa Khel")
df_1["subprovince"] = df_1["subprovince"].replace("Musakhel", "Musa Khel")
df_1["subprovince"] = df_1["subprovince"].replace("Muraffar Garh", "Muzaffar Garh")
df_1["subprovince"] = df_1["subprovince"].replace("Muzaffar garh", "Muzaffar Garh")
df_1["subprovince"] = df_1["subprovince"].replace("Muzaffargarh", "Muzaffar Garh")
df_1["subprovince"] = df_1["subprovince"].replace("Nankana Sahi", "Nankana Sahib")
df_1["subprovince"] = df_1["subprovince"].replace("Nankana sahib", "Nankana Sahib")
df_1["subprovince"] = df_1["subprovince"].replace("Naseerabad (Div)", "Nasirabad")
df_1["subprovince"] = df_1["subprovince"].replace("Nasirabad/ Tamboo", "Nasirabad")
df_1["subprovince"] = df_1["subprovince"].replace("Nasirabad/ tamboo", "Nasirabad")
df_1["subprovince"] = df_1["subprovince"].replace("Naushahro feroze", "Naushahro Feroze")
df_1["subprovince"] = df_1["subprovince"].replace("Nowshero Feroze", "Naushahro Feroze")
df_1["subprovince"] = df_1["subprovince"].replace("Nowshero Freoze", "Naushahro Feroze")
df_1["subprovince"] = df_1["subprovince"].replace("Nowsehra", "Nowshera")
df_1["subprovince"] = df_1["subprovince"].replace("Nawabsha", "Nawabshah")
df_1["subprovince"] = df_1["subprovince"].replace("Nowshero feroze", "Naushahro Feroze")
df_1["subprovince"] = df_1["subprovince"].replace("Nauski", "Nushki")

df_1["subprovince"] = df_1["subprovince"].replace("Umer kot", "Umer Kot")
df_1["subprovince"] = df_1["subprovince"].replace("Upper dir", "Upper Dir")
df_1["subprovince"] = df_1["subprovince"].replace("UpperDir", "Upper Dir")
df_1["subprovince"] = df_1["subprovince"].replace("Zhob (Div)", "Zhob")


In [43]:
# Subset for rural areas and for women
df_r = df_1[(df_1["region"]==0) & (df_1["sex"]==1)].copy()

In [44]:
# add an indicator variable for whether an area is a taliban controlled area or not
taliabn_dominance = [
    "South Waziristan",
    "North Waziristan",
    "Orakzai",
    "Kurram",
    "Khyber",
    "Mohmand",
    "Bajur",
    "Darra Adamkhel",
    "Swat",
    "Upper Dir",
    "Lower Dir",
    "Bannu",
    "Lakki Marwat",
    "Tank",
    "Peshawar",
    "Dera Ismail Khan",
    "Mardan",
    "Charsadda",
    "Kohat"
]
df_r.loc[df_r["subprovince"].isin(taliabn_dominance), "taliban"] = 1
df_r.loc[~df_r["subprovince"].isin(taliabn_dominance), "taliban"] = 0
df_r.taliban = df_r.taliban.astype("int")
# preview value counts of observations in taliban controlled areas versus
df_r.taliban.value_counts()

0    435053
1     37740
Name: taliban, dtype: int64

In [45]:
# preview data set
df_r.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year,taliban
275387,4041016,11,6,1,1,0,0.0,0,4041,Balochistan,Dera Bugti,2019,0
32349,4052020409,12,53,1,1,0,0.0,0,4052,Balochistan,Zhob,2011,0
265372,4231018,7,4,1,1,1,1.0,0,4231,Balochistan,Pishin,2019,0
17258,2081061,6,6,1,1,1,1.0,0,2081,Punjab,Faisalabad,2019,0
91219,2092000307,8,6,1,1,0,,0,2092,Sindh,Larkana,2008,0


In [46]:
# check for balance in the treatment group which comprises of taliban controlled groups and control group
# which comprises of areas not controlled by the taliban
# check for balance across the treatment arm, ie. taliban, 
# for age, ever_admitted and region
for i in ["age"]:
    taliban = df_r.loc[df_r.taliban == 1, i].mean()
    non_taliban = df_r.loc[df_r.taliban == 0, i].mean()
    pvalue = stats.ttest_ind(df_r.loc[df_r.taliban == 1, i].values,df_r.loc[df_r.taliban == 0, i].values,
    ).pvalue
    print(f"For {i}, the mean for taliban controlled areas in the survey is {taliban:.3f},")
    print(f"the mean for non-taliban controlled areas in the survey is {non_taliban:.3f},")
    print(f"and the p-value for this difference is {pvalue:.3f}")
    print("\n")

For age, the mean for taliban controlled areas in the survey is 9.149,
the mean for non-taliban controlled areas in the survey is 9.168,
and the p-value for this difference is 0.296




In [47]:
# check whether composition of intervention and comparision groups is stable 
# for repeated cross-sectional design
ctab = pd.crosstab(df_r["year"], df_r["taliban"], margins=True, normalize='index')
ctab

taliban,0,1
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2004,0.873027,0.126973
2005,0.896597,0.103403
2007,0.99014,0.00986
2008,0.961429,0.038571
2010,0.991007,0.008993
2011,0.992415,0.007585
2012,0.909115,0.090885
2013,0.875379,0.124621
2014,0.946188,0.053812
2015,0.920139,0.079861


In [48]:
chi2, p, dof, expected = scipy.stats.chi2_contingency(ctab.values)
f" The p-value between the treatment and control groups across cross-sections of the \
data is {p:0.3f}. So the composition of treatment and control groups is stable across \
cross-sections"


' The p-value between the treatment and control groups across cross-sections of the data is 0.998. So the composition of treatment and control groups is stable across cross-sections'

In [49]:
# so the difference is not statistically different across the 
# two groups for 

In [50]:
f"We see that age is not statistically significantly different between women in rural taliban controlled areas versus \
women in rural areas not controlled by the taliban. This shows that there are no baseline differences between these two groups and \
our approach to determine the causal inference of the terrorist attacks on the two groups using difference-in-difference is justified."

'We see that age is not statistically significantly different between women in rural taliban controlled areas versus women in rural areas not controlled by the taliban. This shows that there are no baseline differences between these two groups and our approach to determine the causal inference of the terrorist attacks on the two groups using difference-in-difference is justified.'