## Individual Level Data Set for EDA and Assumptions Testing

### 1. Ingest Data

In [1]:
# INGEST, PRE-PROCESS, CONCATENATE AND CARRY OUT EDA FOR INDIVIDUAL LEVEL DATA
# 1. INGEST DATA

# import modules and packages
import pandas as pd
import numpy as np
import altair as alt
from scipy import stats
import statsmodels.api as sm
import scipy.stats
from matplotlib import pyplot as plt
import seaborn as sns

# ingest data sets
y04 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_clarissa/years04_05_.csv")
y05 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_clarissa/years05_06_.csv")
y06 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_clarissa/years06_07_.csv")
y07 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_john/years07_08_f.csv")
y08 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_john/years08_09_f.csv")
y10 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_john/years10_11_f.csv")
y11 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_john/years11_12_f.csv")
y12 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_rashaad/years12_13.csv")
y13 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_rashaad/years13_14.csv")
y14 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_rashaad/years14_15.csv")
y15 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_preet/years15_16_.csv")
y18 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_preet/years18_19_.csv")
y19 = pd.read_csv("/Users/johnowusuduah/github/uds-2022-ids-701-team-3/10_data_cleaning/clean_preet/years19_20_.csv")

### 2. Clean Data

In [2]:
# 2. CLEAN DATA
# 2004 - data cleaning for consistency
# drop unecessary columns
y04 = y04.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y04["hhcode"] = y04["hhcode"].astype("str")
y04['hhcode']= y04['hhcode'].apply(lambda x: x[0:-2])
# make sure first letter in province is capital
y04['province']= y04['province'].str.title()
# reindex columns for consistency across all data sets
y04i = y04.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y04i.sample(5)


Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
15536,3022001705,10,6,female,1,yes,yes,rural,3022,Nwfp,UpperDir,2004
7685,1252003702,13,5,female,1,no,no,rural,1252,Punjab,Khanewal,2004
5614,1172001606,9,3,male,1,yes,yes,rural,1172,Punjab,Narowal,2004
5335,1162001304,13,6,male,1,yes,yes,rural,1162,Punjab,Mandi Bahuddin,2004
7845,1262002004,7,4,male,1,yes,yes,rural,1262,Punjab,Pakpatten,2004


In [3]:
# 2005 - data cleaning for consistency
# drop unecessary columns
y05 = y05.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y05["hhcode"] = y05["hhcode"].astype("str")
y05['hhcode']= y05['hhcode'].apply(lambda x: x[0:-2])
# reindex columns for consistency across all data sets
y05i = y05.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y05i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
32009,4012030105,13,7,male,1,yes,yes,rural,4012,Balochistan,Quetta,2005
3576,1072030208,14,2,male,1,yes,yes,rural,1072,Punjab,Bhakar,2005
19959,2082020106,4,59,male,1,no,no,rural,2082,Sindh,Larkana,2005
12112,1292040201,4,54,male,1,yes,yes,rural,1292,Punjab,D.G.Khan,2005
9743,1212010216,8,53,female,1,no,no,rural,1212,Punjab,Okara,2005


In [4]:
# 2006 - data cleaning for consistency
# drop unecessary columns
y06 = y06.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y06["hhcode"] = y06["hhcode"].astype("str")
y06['hhcode']= y06['hhcode'].apply(lambda x: x[0:-2])
# convert integer values of region variable to string labels
y06.loc[y06["region"]==1, "region"] = 'urban'
y06.loc[y06["region"]==2, "region"] = 'rural'
# convert "integer-like" values of educational enrollment status to string labels
y06.loc[y06["currently_enrolled"]=="1.0", "currently_enrolled"] = 'yes'
y06.loc[y06["currently_enrolled"]=="2.0", "currently_enrolled"] = 'no' 
# convert integer values of ever admitted variable to string labels
y06.loc[y06["ever_admitted"]==1, "ever_admitted"] = 'yes'
y06.loc[y06["ever_admitted"]==2, "ever_admitted"] = 'no'
# reindex columns
y06i = y06.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y06i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
33665,1212002704,8,5,male,1,yes,yes,rural,1212,Punjab,Sheikhupura,2005
46820,1312000706,12,5,male,1,no,no,rural,1312,Punjab,Muzaffar Garh,2005
114630,3192001401,14,4,male,1,yes,no,rural,3192,NWFP,Kohistan,2005
14999,1102003201,12,5,male,1,yes,yes,rural,1102,Punjab,Jhang,2005
49588,1322001302,5,6,female,1,no,no,rural,1322,Punjab,Bahawalpur,2005


In [5]:
# 2007 - data cleaning for consistency
# drop unecessary columns
y07 = y07.drop(["Unnamed: 0"], axis=1)
# convert integer value of region variable to string label
y07.loc[y07["region"]=="3", "region"] = 'urban'
# reindex columns
y07i = y07.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y07i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
110955,2031040211,10,6,male,1,no,no,urban,2031,Sindh,Hyderabad,2007
173222,3031040204,10,62,female,1,yes,yes,urban,3031,KPK,Kohat,2007
121930,2051140410,8,57,female,1,yes,yes,urban,2051,Punjab,Karachi,2007
217440,3162040104,12,52,female,1,yes,no,rural,3162,KPK,Abbotabad,2007
99132,2011040306,5,54,female,1,yes,no,urban,2011,Sindh,Sukkur,2007


In [6]:
# 2008 - data cleaning for consistency
# drop unecessary columns
y08 = y08.drop(["Unnamed: 0"], axis=1)
# reindex columns
y08i = y08.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y08i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
90391,2082002605,6,9,female,1,no,,rural,2082,Sindh,Shikarpur,2008
175699,4152002410,7,6,male,1,yes,,rural,4152,Balochistan,Washuk,2008
160404,4022001214,12,6,female,1,yes,,rural,4022,Balochistan,Pashin,2008
184307,4242001106,5,4,female,1,no,,rural,4242,Balochistan,Qillah Saifullah,2008
112478,2212000605,5,4,male,1,no,,rural,2212,Sindh,Tharparkar,2008


In [7]:
# 2010 - data cleaning for consistency
# drop unecessary columns
y10 = y10.drop(["Unnamed: 0"], axis=1)
y10i = y10.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y10i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
13979,10911000807,9,7,male,1,yes,yes,urban,1091,Punjab,Faisalabad,2010
116934,22311008008,13,4,female,1,no,no,urban,2231,Sindh,Karachi,2010
77214,20212000512,12,4,female,1,yes,yes,urban,2021,Sindh,Sukkur,2010
8097,10512000910,9,4,female,1,yes,yes,urban,1051,Punjab,Sargodha,2010
176278,41710000911,12,4,male,1,yes,yes,urban,4171,Balochistan,Lasbella,2010


In [8]:
# 2011 - data cleaning for consistency
# drop unecessary columns
y11 = y11.drop(["Unnamed: 0"], axis=1)
y11i = y11.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y11i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
7800,1181120103,13,5,male,1,no,no,urban,1181,Punjab,Islamabad,2011
21250,2202040214,8,55,male,1,yes,yes,rural,2202,Sindh,Mir Pur Khas,2011
31960,4051040312,7,55,male,1,yes,yes,urban,4051,Balochistan,Zhob,2011
9619,1242040205,14,52,female,1,no,no,rural,1242,Punjab,Multan,2011
10851,1312010212,10,52,female,1,yes,yes,rural,1312,Punjab,Bahawalpur,2011


In [9]:
# 2012 data cleaning for consistency
# drop unnecessary columns
y12 = y12.drop(["Unnamed: 0"], axis=1)
# convert hhcode to non-decimal string
y12["hhcode"] = y12["hhcode"].astype("str")
y12['hhcode']= y12['hhcode'].apply(lambda x: x[0:-2])
# reindex columns
y12i = y12.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y12i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
67685,2632001006,8,5,male,1,no,yes,rural,2632,Punjab,Multan,2012
87280,3132001813,6,11,male,1,no,yes,rural,3132,Sindh,Hyderabad,2012
37993,2231000201,10,5,female,1,no,yes,urban,2231,Punjab,Muzaffar garh,2012
97688,3311000810,11,3,female,1,no,no,urban,3311,Sindh,Jaccobabad,2012
108367,3422002816,10,5,female,1,yes,yes,rural,3422,Sindh,Sanghar,2012


In [10]:
# data cleaning for consistency
# drop unnecessary columns
y13 = y13.drop(["Unnamed: 0"], axis=1)
# convert hhcode to non-decimal string
y13["hhcode"] = y13["hhcode"].astype("str")
y13['hhcode']= y13['hhcode'].apply(lambda x: x[0:-2])
# rename stratum to subprovince for consistency
y13.rename(columns={"stratum":"subprovince"}, inplace=True)
# reindex columns
y13i = y13.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y13i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
65168,4424100301,5,54,female,1,no,no,urban,4424,Balochistan,Jhal Magsi,2013
21011,2454200508,8,53,female,1,yes,yes,rural,2454,Punjab,Mandi Bahuddin,2013
68283,4573100709,10,53,female,1,no,no,urban,4573,Balochistan,Mastung,2013
17133,2332200310,5,52,female,1,yes,yes,rural,2332,Punjab,Faisalabad,2013
18077,2332100215,6,54,male,1,yes,yes,urban,2332,Punjab,Jhang,2013


In [11]:
# data cleaning for consistency
# drop unnecessary columns
y14 = y14.drop(["Unnamed: 0"], axis=1)
# convert hhcode to non-decimal string
y14["hhcode"] = y14["hhcode"].astype("str")
y14['hhcode']= y14['hhcode'].apply(lambda x: x[0:-2])
# reindex columns
y14i = y14.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y14i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
143724,4241001905,5,4,female,1,no,no,urban,4241,Balochistan,Killa saifullah,2014
82413,2841003204,5,7,male,1,no,no,urban,2841,Punjab,Muzaffargarh,2014
122152,3351004603,12,4,male,1,yes,yes,urban,3351,Sindh,Tando mohammad khan,2014
103954,3141000205,9,8,female,1,yes,yes,urban,3141,Sindh,Larkana,2014
17948,1261002606,13,4,female,1,yes,no,urban,1261,K.P.K,Tor ghar,2014


In [12]:
# data cleaning for consistency
# drop unnecessary columns
y15 = y15.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y15["hhcode"] = y15["hhcode"].astype("str")
y15['hhcode']= y15['hhcode'].apply(lambda x: x[0:-2])
# rename Year for consistency
y15.rename(columns={"Year":"year"}, inplace=True)
# reindex columns
y15i = y15.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y15i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
10178,1202020809,12,52,female,1,yes,yes,urban,1202,KPK,Abbottabad,2015
59250,3102031113,5,52,female,1,yes,yes,urban,3102,Sindh,Shahdadkot,2015
22889,1602040507,15,2,male,1,yes,yes,urban,1602,KPK,Lakki Marwat,2015
14788,1302011213,5,65,male,1,yes,yes,urban,1302,KPK,Swabi,2015
31253,2302031405,15,53,female,1,yes,yes,urban,2302,Punjab,T.T. Singh,2015


In [13]:
# data cleaning for consistency
# drop unnecessary columns
y18 = y18.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y18["hhcode"] = y18["hhcode"].astype("str")
y18['hhcode']= y18['hhcode'].apply(lambda x: x[0:-2])
# rename Year for consistency
y18.rename(columns={"Year":"year"}, inplace=True)
y18i = y18.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y18i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
69049,3502309704,13,3,male,1,yes,yes,urban,3502,Sindh,Malir,2018
13252,1402203607,12,6,male,1,yes,yes,urban,1402,KPK,Khyber,2018
68019,3502305706,14,55,female,1,yes,yes,urban,3502,Sindh,Karachi East,2018
10540,1402100302,4,52,female,1,no,no,urban,1402,KPK,Khyber,2018
26211,2302101803,13,2,male,1,yes,yes,urban,2302,Punjab,Toba Tek Singh,2018


In [14]:
# data cleaning for consistency
# drop unnecessary columns
y19 = y19.drop(["id"], axis=1)
# convert hhcode to non-decimal string
y19["hhcode"] = y19["hhcode"].astype("str")
y19['hhcode']= y19['hhcode'].apply(lambda x: x[0:-2])
# rename Year for consistency
y19.rename(columns={"Year":"year"}, inplace=True)
y19i = y19.reindex(columns=["hhcode","age","idc","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province","subprovince","year"], copy=True)
# preview data
y19i.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
156768,2132004,15,5,female,1,yes,yes,urban,2132,Punjab,Jehlum,2019
11211,2021049,4,8,female,1,no,no,rural,2021,Punjab,Bahawalnagar,2019
91155,1121023,4,3,male,1,no,no,rural,1121,KPK,Khyber,2019
127959,1311009,10,5,female,1,no,no,rural,1311,KPK,Tor Garh,2019
36460,2251014,10,3,female,1,yes,yes,rural,2251,Punjab,Nankana Sahib,2019


### 3. Concatenate Individual Data to One Data Set and Filter Ages Between 4 and 10

In [15]:
# 3. CONCATENATE INDIVIDUAL DATA TO ONE DATA SET AND FILTER AGES BETWEEN 4 AND 10
# concatenate data sets
df = pd.concat([y04i,y05i,y06i,y07i,y08i,y10i,y11i,y12i,y13i,y14i,y15i,y18i,y19i],axis=0)

# correct mispelt values of province columns
df["province"] = df["province"].replace('Nwfp', "KPK")
df["province"] = df["province"].replace('NWFP', "KPK")
df["province"] = df["province"].replace('NWFP ', "KPK")
df["province"] = df["province"].replace('NWFP ', "NWFP")
df["province"] = df["province"].replace('Punjab ', "Punjab")
df["province"] = df["province"].replace('K.P.K', "KPK")
df["province"] = df["province"].replace('Islamabad', "Punjab")

# filter of children between the ages of 4 and 10
df = df[(df["age"]>4) & (df["age"]<10)].copy()

# preview a sample of the data
df.sample(5)


Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
107638,1211006,5,9,male,1,no,no,rural,1211,KPK,Mohmand,2019
46533,2902102304,6,54,female,1,yes,yes,urban,2902,Punjab,Rahim Yar Khan,2018
69950,4241017,8,4,male,1,yes,yes,rural,4241,Balochistan,Qilla Abdullah,2019
61408,1292001911,5,6,male,1,no,,rural,1292,Punjab,D.G. Khan,2008
171182,2182018,9,4,female,1,yes,yes,urban,2182,Punjab,Lahore,2019


In [16]:
# check data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 817869 entries, 2 to 280899
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   hhcode              817869 non-null  object
 1   age                 817869 non-null  int64 
 2   idc                 817869 non-null  int64 
 3   sex                 817869 non-null  object
 4   marital_status      817869 non-null  int64 
 5   ever_admitted       817860 non-null  object
 6   currently_enrolled  732568 non-null  object
 7   region              817869 non-null  object
 8   subprovince code    817869 non-null  int64 
 9   province            817869 non-null  object
 10  subprovince         817527 non-null  object
 11  year                817869 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 81.1+ MB


In [17]:
# check the distribution of numeric variables
df.describe()

Unnamed: 0,age,idc,marital_status,subprovince code,year
count,817869.0,817869.0,817869.0,817869.0,817869.0
mean,6.891179,22.167358,1.00006,2527.237138,2011.567674
std,1.378586,23.080148,0.013766,1073.911723,4.730123
min,5.0,1.0,1.0,1001.0,2004.0
25%,6.0,5.0,1.0,1352.0,2007.0
50%,7.0,7.0,1.0,2322.0,2011.0
75%,8.0,53.0,1.0,3223.0,2015.0
max,9.0,93.0,5.0,6114.0,2019.0


### 4. Preliminary EDA

In [18]:
# 4. PRELIMINARY EDA
# extract the number of observations in the data set
f'There are {df.shape[0]} observations in the data set.'

'There are 817869 observations in the data set.'

In [19]:
# check value counts of sex
df.sex.value_counts(normalize=True).reset_index()

Unnamed: 0,index,sex
0,male,0.52621
1,female,0.47379


In [20]:
# check value counts of educational enrollment
df.currently_enrolled.value_counts(normalize=True).reset_index()

Unnamed: 0,index,currently_enrolled
0,yes,0.598329
1,no,0.401671


In [21]:
# check value counts of ever admitted variable
df.ever_admitted.value_counts().reset_index()

Unnamed: 0,index,ever_admitted
0,yes,474226
1,no,343634


In [22]:
# check value counts of region variable
df.region.value_counts().reset_index()

Unnamed: 0,index,region
0,rural,476881
1,urban,340988


In [23]:
df.marital_status.value_counts().reset_index()

Unnamed: 0,index,marital_status
0,1,817851
1,3,8
2,5,7
3,2,2
4,4,1


In [24]:
# EDA FOR OVERALL DATA SET
# Pre-process Data for EDA
# convert string categorical variables to integer labels
df_1 = df.copy()
# convert sex
df_1.loc[df_1["sex"] == "male", "sex"] = 0
df_1.loc[df_1["sex"] == "female", "sex"] = 1
# convert educational enrollment
df_1.loc[df_1["currently_enrolled"] == "no", "currently_enrolled"] = 0
df_1.loc[df_1["currently_enrolled"] == "yes", "currently_enrolled"] = 1
# convert ever admitted
df_1.loc[df_1["ever_admitted"] == "no", "ever_admitted"] = 0
df_1.loc[df_1["ever_admitted"] == "yes", "ever_admitted"] = 1
# convert region
df_1.loc[df_1["region"] == "rural", "region"] = 0
df_1.loc[df_1["region"] == "urban", "region"] = 1

# preview data
df_1.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
46858,1122040104,9,53,0,1,1,0,0,1122,Punjab,Jhang,2007
32792,2402400102,6,57,1,1,1,1,1,2402,Punjab,Gujrat,2018
229675,3242020101,7,56,1,1,1,0,0,3242,KPK,LakkiMarwat,2007
15712,2222100115,8,53,0,1,1,1,1,2222,Punjab,Bhakhar,2013
94845,21210000908,6,7,1,1,1,1,1,2121,Sindh,Jamshoro,2010


### 5. Final EDA (treatment variable --> sex)

In [25]:
df_04 = df_1[df_1.year == 2019]

In [26]:
# 5. FINAL EDA
# For Difference-in-Difference Analysis, 
# check for balance across the treatment arm, ie. sex, 
# for age, ever_admitted and region
#age_mean = []
#ever_admitted_mean = []
#region_mean = []
for i in ["age", "ever_admitted", "region"]:
    female = df_04.loc[df_04.sex == 1, i].mean()
    male = df_04.loc[df_04.sex == 0, i].mean()
    pvalue = stats.ttest_ind(df_04.loc[df_04.sex == 1, i].values,df_04.loc[df_04.sex == 0, i].values,
    ).pvalue
    print(f"For {i}, the mean for females in the survey is {female:.3f},")
    print(f"the mean for males in the survey is {male:.3f},")
    print(f"and the p-value for this difference is {pvalue:.3f}")
    print("\n")

For age, the mean for females in the survey is 6.870,
the mean for males in the survey is 6.881,
and the p-value for this difference is 0.153


For ever_admitted, the mean for females in the survey is 0.619,
the mean for males in the survey is 0.702,
and the p-value for this difference is 0.000


For region, the mean for females in the survey is 0.254,
the mean for males in the survey is 0.251,
and the p-value for this difference is 0.234




In [27]:
f"We see that age, whether a student has been admitted in an educational institution, and region are statistically significantly different across both male and \
females for a majority of the years in the data set. This would invalidate causal inference analysis on the on enrollment between men and women. We could match men and women \
for years where there are baseline difference but we would be reducing the statistical power of our analysis. So we decided to investigate the causal impact of the taliban attacks \
on women in rural areas controlled by the taliban compared to women in rural areas not controlled by the taliban."


'We see that age, whether a student has been admitted in an educational institution, and region are statistically significantly different across both male and females for a majority of the years in the data set. This would invalidate causal inference analysis on the on enrollment between men and women. We could match men and women for years where there are baseline difference but we would be reducing the statistical power of our analysis. So we decided to investigate the causal impact of the taliban attacks on women in rural areas controlled by the taliban compared to women in rural areas not controlled by the taliban.'

In [28]:
# crosstab of treatment variable versus response variable
pd.crosstab(df["currently_enrolled"],df["sex"],margins=True)

sex,female,male,All
currently_enrolled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,155249,139002,294251
yes,192155,246162,438317
All,347404,385164,732568


In [29]:
# normalization for all variables have been done by rows so fractions
# are in terms of 
# normalized crosstab of treatment variable versus response variable
pd.crosstab(df["currently_enrolled"],df["sex"],margins=True,normalize="index")

sex,female,male
currently_enrolled,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.527607,0.472393
yes,0.438393,0.561607
All,0.474228,0.525772


In [30]:
# crosstab of treatment variable versus region
pd.crosstab(df["region"],df["sex"],margins=True)

sex,female,male,All
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rural,224273,252608,476881
urban,163225,177763,340988
All,387498,430371,817869


In [31]:
# normalized crosstab of treatment variable versus region
pd.crosstab(df["region"],df["sex"],margins=True,normalize="index")

sex,female,male
region,Unnamed: 1_level_1,Unnamed: 2_level_1
rural,0.470291,0.529709
urban,0.478683,0.521317
All,0.47379,0.52621


In [32]:
# crosstab of treatment variable versus ever_admitted
pd.crosstab(df["ever_admitted"],df["sex"],margins=True)

sex,female,male,All
ever_admitted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,178719,164915,343634
yes,208777,265449,474226
All,387496,430364,817860


In [33]:
df_1.groupby(["year","sex"])["currently_enrolled"].mean().reset_index()

Unnamed: 0,year,sex,currently_enrolled
0,2004,0,0.625517
1,2004,1,0.480572
2,2005,0,0.662636
3,2005,1,0.537512
4,2007,0,0.331634
5,2007,1,0.323851
6,2008,0,
7,2008,1,
8,2010,0,0.711212
9,2010,1,0.604087


In [34]:
# normalized crosstab of treatment variable versus ever_admitted
pd.crosstab(df["ever_admitted"],df["sex"],margins=True,normalize="index")

sex,female,male
ever_admitted,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.520085,0.479915
yes,0.440248,0.559752
All,0.473793,0.526207


In [35]:
# crosstab of treatment variable versus marital status
pd.crosstab(df["marital_status"],df["region"],margins=True)

region,rural,urban,All
marital_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,476877,340974,817851
2,1,1,2
3,0,8,8
4,0,1,1
5,3,4,7
All,476881,340988,817869


In [36]:
# normalized crosstab of treatment variable versus marital status
# these values vary so much that we shall not test for whether the 
# difference is statistically different
pd.crosstab(df["marital_status"],df["region"],margins=True,normalize="columns")

region,rural,urban,All
marital_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.999992,0.999959,0.999978
2,2e-06,3e-06,2e-06
3,0.0,2.3e-05,1e-05
4,0.0,3e-06,1e-06
5,6e-06,1.2e-05,9e-06


In [37]:
# crosstab of treatment variable province
pd.crosstab(df["province"],df["sex"],margins=True)

sex,female,male,All
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Balochistan,66007,77658,143665
KPK,82335,90732,173067
Punjab,148961,159082,308043
Sindh,90195,102899,193094
All,387498,430371,817869


In [38]:
# normalized crosstab of treatment variable province
pd.crosstab(df["province"],df["sex"],margins=True,normalize="index")

sex,female,male
province,Unnamed: 1_level_1,Unnamed: 2_level_1
Balochistan,0.459451,0.540549
KPK,0.475741,0.524259
Punjab,0.483572,0.516428
Sindh,0.467104,0.532896
All,0.47379,0.52621


In [39]:
# crosstab of treatment variable year
pd.crosstab(df["year"],df["sex"],margins=True)

sex,female,male,All
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004,5945,6764,12709
2005,39387,44083,83470
2007,60534,66302,126836
2008,40094,45207,85301
2010,40519,45265,85784
2011,7348,7961,15309
2012,35510,39419,74929
2013,16027,17844,33871
2014,39632,44227,83859
2015,21664,23256,44920


In [40]:
# normalized crosstab of treatment variable year
pd.crosstab(df["year"],df["sex"],margins=True,normalize="index")

sex,female,male
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2004,0.467779,0.532221
2005,0.47187,0.52813
2007,0.477262,0.522738
2008,0.47003,0.52997
2010,0.472337,0.527663
2011,0.479979,0.520021
2012,0.473915,0.526085
2013,0.473178,0.526822
2014,0.472603,0.527397
2015,0.48228,0.51772


### 6. Final EDA (treatment variable --> taliban controlled areas versus non-taliban controlled areas in rural areas)

In [41]:
# ### 6. FINAL EDA (TREATMENT VARIABLE --> taliban controlled areas versus non-taliban controlled in rural areas)
# a. Pre-processing of subprovince name ---> Ensure Consistency of Subprovince Names
# Replaces:
df_1["subprovince"] = df_1["subprovince"].replace("Ättock", "Attock")
df_1["subprovince"] = df_1["subprovince"].replace("Abbottabad", "Abbotabad")
df_1["subprovince"] = df_1["subprovince"].replace("Bahawalnagar", "Bahawal Nagar")
df_1["subprovince"] = df_1["subprovince"].replace("Bahawalnager", "Bahawal Nagar")

df_1["subprovince"] = df_1["subprovince"].replace("Bhakhar", "Bhakkar")
df_1["subprovince"] = df_1["subprovince"].replace("Bhakar", "Bhakkar")
df_1["subprovince"] = df_1["subprovince"].replace("Baddin", "Badin")

df_1["subprovince"] = df_1["subprovince"].replace("Barkhen", "Barkhan")

df_1["subprovince"] = df_1["subprovince"].replace("Bhawalpur", "Bahawalpur")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/ Kachhi", "Bolan/Kachhi")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/Kachhi", "Bolan/Kachhi")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/ kachhi", "Bolan/Kachhi")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/Kachni", "Bolan/Kachhi")

df_1["subprovince"] = df_1["subprovince"].replace("Bonair", "Buner")
df_1["subprovince"] = df_1["subprovince"].replace("Bunair", "Buner")


df_1["subprovince"] = df_1["subprovince"].replace("Chaghi", "Chagai")
df_1["subprovince"] = df_1["subprovince"].replace("Chaghai", "Chagai")
df_1["subprovince"] = df_1["subprovince"].replace("Chaghi", "Chagai")
df_1["subprovince"] = df_1["subprovince"].replace("Charsada", "Charsadda")
df_1["subprovince"] = df_1["subprovince"].replace("D. G. Khan", "D.G. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("Dera Ghazi Khan", "D.G. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D. g. khan", "D.G. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D.G.Khan", "D.G. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D.g khan", "D.G. Khan")


df_1["subprovince"] = df_1["subprovince"].replace("D.I.Khan", "D.I. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D. i. khan", "D.I. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D. I. Khan", "D.I. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("Dera Ismail Khan", "D.I. Khan")
df_1["subprovince"] = df_1["subprovince"].replace("D.i.khan", "D.I. Khan")

df_1["subprovince"] = df_1["subprovince"].replace("Dera bugti", "Dera Bugti")
df_1["subprovince"] = df_1["subprovince"].replace("Deara Bughti", "Dera Bugti")
df_1["subprovince"] = df_1["subprovince"].replace("Dera Bughti", "Dera Bugti")

df_1["subprovince"] = df_1["subprovince"].replace("Gujranwala", "Gujaranwala")
df_1["subprovince"] = df_1["subprovince"].replace("Gawadar", "Gwadar")

df_1["subprovince"] = df_1["subprovince"].replace("Haifzabad", "Hafizabad")
df_1["subprovince"] = df_1["subprovince"].replace("Hafaizabad", "Hafizabad")
df_1["subprovince"] = df_1["subprovince"].replace("Hzara", "Hazara")
df_1["subprovince"] = df_1["subprovince"].replace("Pak Pattain", "Pakpattan")
df_1["subprovince"] = df_1["subprovince"].replace("Pakpaten", "Pakpattan")
df_1["subprovince"] = df_1["subprovince"].replace("Pakpatan", "Pakpattan")
df_1["subprovince"] = df_1["subprovince"].replace("Pakpatten", "Pakpattan")

df_1["subprovince"] = df_1["subprovince"].replace("pishine", "Pishin")
df_1["subprovince"] = df_1["subprovince"].replace("Pishine", "Pishin")
df_1["subprovince"] = df_1["subprovince"].replace("Pashin", "Pishin")
df_1["subprovince"] = df_1["subprovince"].replace("Pershawar", "Peshawar")
df_1["subprovince"] = df_1["subprovince"].replace("Qillah Abdullah", "Qilla Abdullah")
df_1["subprovince"] = df_1["subprovince"].replace("QillahAbdullah", "Qilla Abdullah")
df_1["subprovince"] = df_1["subprovince"].replace("Killa Abdullah", "Qilla Abdullah")
df_1["subprovince"] = df_1["subprovince"].replace("Qilla abdullah", "Qilla Abdullah")

df_1["subprovince"] = df_1["subprovince"].replace("Qillah Saifullah", "Qilla Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("Qillah Salifullah", "Qilla Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("QillahSaifullah", "Qilla Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("Killa Saifullah", "Qilla Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("Qilla saifullah", "Qilla Saifullah")

df_1["subprovince"] = df_1["subprovince"].replace("Quetta (Div)", "Quetta")
df_1["subprovince"] = df_1["subprovince"].replace("R.Y.Khan", "Rahim Yar Khan")
df_1["subprovince"] = df_1["subprovince"].replace("Rahim yar khan", "Rahim Yar Khan")
df_1["subprovince"] = df_1["subprovince"].replace("Rajaanpur", "Rajanpur")
df_1["subprovince"] = df_1["subprovince"].replace(
    "Shaheed Banazir Abad", "Shaheed Benazir Abad"
)
df_1["subprovince"] = df_1["subprovince"].replace(
    "Shaheed Benazirabad", "Shaheed Benazir Abad"
)
df_1["subprovince"] = df_1["subprovince"].replace(
    "Shaheed benazir abad", "Shaheed Benazir Abad"
)
df_1["subprovince"] = df_1["subprovince"].replace("Sheerani", "Sherani")
df_1["subprovince"] = df_1["subprovince"].replace("Sibbi (Div)", "Sibbi")
df_1["subprovince"] = df_1["subprovince"].replace("Sibi", "Sibbi")
df_1["subprovince"] = df_1["subprovince"].replace("Sijawal", "Sujawal")
df_1["subprovince"] = df_1["subprovince"].replace("Sukkar", "Sukkur")
df_1["subprovince"] = df_1["subprovince"].replace("T.T. Singh", "Toba Tek Singh")
df_1["subprovince"] = df_1["subprovince"].replace("T.T.Singh", "Toba Tek Singh")
df_1["subprovince"] = df_1["subprovince"].replace("T.t. singh", "Toba Tek Singh")
df_1["subprovince"] = df_1["subprovince"].replace("T.t singh", "Toba Tek Singh")
df_1["subprovince"] = df_1["subprovince"].replace("Sarghodha", "Sargodha")
df_1["subprovince"] = df_1["subprovince"].replace("Sheani", "Sherani")
df_1["subprovince"] = df_1["subprovince"].replace("Shikarpur", "Shiokarpur")

df_1["subprovince"] = df_1["subprovince"].replace("Tando Muhammad", "Tando Muhammad Khan")
df_1["subprovince"] = df_1["subprovince"].replace("Tando Muhd Khan", "Tando Muhammad Khan")
df_1["subprovince"] = df_1["subprovince"].replace(
    "Tando mohammad khan", "Tando Muhammad Khan"
)
df_1["subprovince"] = df_1["subprovince"].replace("Tando mohd khan", "Tando Muhammad Khan")

df_1["subprovince"] = df_1["subprovince"].replace("TandoAllah Yar", "Tando Allahyar")
df_1["subprovince"] = df_1["subprovince"].replace("Tando Allah Yar", "Tando Allahyar")
df_1["subprovince"] = df_1["subprovince"].replace("Tando allah yar", "Tando Allahyar")
df_1["subprovince"] = df_1["subprovince"].replace("Tor Garh", "Torghar")
df_1["subprovince"] = df_1["subprovince"].replace("Tor ghar", "Torghar")
df_1["subprovince"] = df_1["subprovince"].replace("Torgarh", "Torghar")

df_1["subprovince"] = df_1["subprovince"].replace("Umer kot", "Umer Kot")
df_1["subprovince"] = df_1["subprovince"].replace("Ümer kot", "Umer Kot")
df_1["subprovince"] = df_1["subprovince"].replace("Upper dir", "Upper Dir")
df_1["subprovince"] = df_1["subprovince"].replace("UpperDir", "Upper Dir")
df_1["subprovince"] = df_1["subprovince"].replace("Zhob (Div)", "Zhob")



In [42]:
# more pre-processing of subprovince names
df_1["subprovince"] = df_1["subprovince"].replace("Jaccobabad", "Jacobabad")
df_1["subprovince"] = df_1["subprovince"].replace("Jaffarabad", "Jafarabad")
df_1["subprovince"] = df_1["subprovince"].replace("Jafrabad", "Jafarabad")
df_1["subprovince"] = df_1["subprovince"].replace("Jhal magsi", "Jhal Magsi")
df_1["subprovince"] = df_1["subprovince"].replace("JhalMagsi", "Jhal Magsi")
df_1["subprovince"] = df_1["subprovince"].replace("Jehlum", "Jhelum")

df_1["subprovince"] = df_1["subprovince"].replace("Kachhi/ Bolan", "Kachhi")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/kachhi", "Kachhi")
df_1["subprovince"] = df_1["subprovince"].replace("Bolan/Kachhi", "Kachhi")

df_1["subprovince"] = df_1["subprovince"].replace("Kalat (Div)", "Kalat")
df_1["subprovince"] = df_1["subprovince"].replace("Karachi Central", "Karachi")
df_1["subprovince"] = df_1["subprovince"].replace("Karachi East", "Karachi")
df_1["subprovince"] = df_1["subprovince"].replace("Karachi Malir", "Karachi")
df_1["subprovince"] = df_1["subprovince"].replace("Karachi South", "Karachi")
df_1["subprovince"] = df_1["subprovince"].replace("Karachi West", "Karachi")
df_1["subprovince"] = df_1["subprovince"].replace("Kashmore", "Kashmor")
df_1["subprovince"] = df_1["subprovince"].replace("Kech", "Kech/Turbat")
df_1["subprovince"] = df_1["subprovince"].replace("Ketch/Turbat", "Kech/Turbat")
df_1["subprovince"] = df_1["subprovince"].replace("Keych/turbat", "Kech/Turbat")

df_1["subprovince"] = df_1["subprovince"].replace("Killa abdullah", "Qilla Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("Killa saifullah", "Killa Saifullah")
df_1["subprovince"] = df_1["subprovince"].replace("Lakki marwat", "Lakki Marwat")
df_1["subprovince"] = df_1["subprovince"].replace("LakkiMarwat", "Lakki Marwat")
df_1["subprovince"] = df_1["subprovince"].replace("Lasbella", "Lasbela")
df_1["subprovince"] = df_1["subprovince"].replace("Lasbilla", "Lasbela")
df_1["subprovince"] = df_1["subprovince"].replace("Layyah", "Layya")
df_1["subprovince"] = df_1["subprovince"].replace("Lodhrean", "Lodhran")

df_1["subprovince"] = df_1["subprovince"].replace("Lower dir", "Lower Dir")
df_1["subprovince"] = df_1["subprovince"].replace("LowerDir", "Lower Dir")
df_1["subprovince"] = df_1["subprovince"].replace("Makran (Div)", "Makran")
df_1["subprovince"] = df_1["subprovince"].replace("Malakand Protected", "Malakand")
df_1["subprovince"] = df_1["subprovince"].replace("Malakand Protected Area", "Malakand")
df_1["subprovince"] = df_1["subprovince"].replace("Mandi Bahuddin", "Mandi Bahauddin")
df_1["subprovince"] = df_1["subprovince"].replace("Mandi bahauddin", "Mandi Bahauddin")
df_1["subprovince"] = df_1["subprovince"].replace("Manshera", "Mansehra")
df_1["subprovince"] = df_1["subprovince"].replace("Mir pur Khas", "Mir Pur Khas")
df_1["subprovince"] = df_1["subprovince"].replace("MirPurKhas", "Mir Pur Khas")
df_1["subprovince"] = df_1["subprovince"].replace("Mirpur Khas", "Mir Pur Khas")
df_1["subprovince"] = df_1["subprovince"].replace("Mirpur khas", "Mir Pur Khas")
df_1["subprovince"] = df_1["subprovince"].replace("Mir pur khas", "Mir Pur Khas")
df_1["subprovince"] = df_1["subprovince"].replace("M ianwali", "Mianwali")
df_1["subprovince"] = df_1["subprovince"].replace("Mitiari", "Matiari")

df_1["subprovince"] = df_1["subprovince"].replace("MusaKhel", "Musa Khel")
df_1["subprovince"] = df_1["subprovince"].replace("Musa khel", "Musa Khel")
df_1["subprovince"] = df_1["subprovince"].replace("Musa", "Musa Khel")
df_1["subprovince"] = df_1["subprovince"].replace("Musakhel", "Musa Khel")
df_1["subprovince"] = df_1["subprovince"].replace("Muraffar Garh", "Muzaffar Garh")
df_1["subprovince"] = df_1["subprovince"].replace("Muzaffar garh", "Muzaffar Garh")
df_1["subprovince"] = df_1["subprovince"].replace("Muzaffargarh", "Muzaffar Garh")
df_1["subprovince"] = df_1["subprovince"].replace("Nankana Sahi", "Nankana Sahib")
df_1["subprovince"] = df_1["subprovince"].replace("Nankana sahib", "Nankana Sahib")
df_1["subprovince"] = df_1["subprovince"].replace("Naseerabad (Div)", "Nasirabad")
df_1["subprovince"] = df_1["subprovince"].replace("Nasirabad/ Tamboo", "Nasirabad")
df_1["subprovince"] = df_1["subprovince"].replace("Nasirabad/ tamboo", "Nasirabad")
df_1["subprovince"] = df_1["subprovince"].replace("Naushahro feroze", "Naushahro Feroze")
df_1["subprovince"] = df_1["subprovince"].replace("Nowshero Feroze", "Naushahro Feroze")
df_1["subprovince"] = df_1["subprovince"].replace("Nowshero Freoze", "Naushahro Feroze")
df_1["subprovince"] = df_1["subprovince"].replace("Nowsehra", "Nowshera")
df_1["subprovince"] = df_1["subprovince"].replace("Nawabsha", "Nawabshah")
df_1["subprovince"] = df_1["subprovince"].replace("Nowshero feroze", "Naushahro Feroze")
df_1["subprovince"] = df_1["subprovince"].replace("Nauski", "Nushki")

df_1["subprovince"] = df_1["subprovince"].replace("Umer kot", "Umer Kot")
df_1["subprovince"] = df_1["subprovince"].replace("Upper dir", "Upper Dir")
df_1["subprovince"] = df_1["subprovince"].replace("UpperDir", "Upper Dir")
df_1["subprovince"] = df_1["subprovince"].replace("Zhob (Div)", "Zhob")


In [43]:
# Subset for rural areas and for women
df_r = df_1[(df_1["region"]==0) & (df_1["sex"]==1)].copy()

In [44]:
# add an indicator variable for whether an area is a taliban controlled area or not
taliabn_dominance = [
    "South Waziristan",
    "North Waziristan",
    "Orakzai",
    "Kurram",
    "Khyber",
    "Mohmand",
    "Bajur",
    "Darra Adamkhel",
    "Swat",
    "Upper Dir",
    "Lower Dir",
    "Bannu",
    "Lakki Marwat",
    "Tank",
    "Peshawar",
    "Dera Ismail Khan",
    "Mardan",
    "Charsadda",
    "Kohat"
]
df_r.loc[df_r["subprovince"].isin(taliabn_dominance), "taliban"] = 1
df_r.loc[~df_r["subprovince"].isin(taliabn_dominance), "taliban"] = 0
df_r.taliban = df_r.taliban.astype("int")
# preview value counts of observations in taliban controlled areas versus
df_r.taliban.value_counts()

0    206090
1     18183
Name: taliban, dtype: int64

In [45]:
# preview data set
df_r.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year,taliban
88630,1101018,7,3,1,1,1,1,0,1101,KPK,Haripur,2019,0
91626,1342020116,6,52,1,1,0,0,0,1342,Punjab,Bahawal Nagar,2007,0
236931,4012010306,8,55,1,1,1,1,0,4012,Balochistan,Quetta (Div),2007,0
150785,31820000502,7,5,1,1,1,1,0,3182,KPK,Batagram,2010,0
231712,3161013,6,5,1,1,1,1,0,3161,Sindh,Matiari,2019,0


In [46]:
# check for balance in the treatment group which comprises of taliban controlled groups and control group
# which comprises of areas not controlled by the taliban
# check for balance across the treatment arm, ie. taliban, 
# for age, ever_admitted and region
for i in ["age"]:
    taliban = df_r.loc[df_r.taliban == 1, i].mean()
    non_taliban = df_r.loc[df_r.taliban == 0, i].mean()
    pvalue = stats.ttest_ind(df_r.loc[df_r.taliban == 1, i].values,df_r.loc[df_r.taliban == 0, i].values,
    ).pvalue
    print(f"For {i}, the mean for taliban controlled areas in the survey is {taliban:.3f},")
    print(f"the mean for non-taliban controlled areas in the survey is {non_taliban:.3f},")
    print(f"and the p-value for this difference is {pvalue:.3f}")
    print("\n")

For age, the mean for taliban controlled areas in the survey is 6.860,
the mean for non-taliban controlled areas in the survey is 6.867,
and the p-value for this difference is 0.532




In [47]:
# check whether composition of intervention and comparision groups is stable 
# for repeated cross-sectional design
ctab = pd.crosstab(df_r["year"], df_r["taliban"], margins=True, normalize='index')
ctab

taliban,0,1
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2004,0.875332,0.124668
2005,0.895733,0.104267
2007,0.989078,0.010922
2008,0.961292,0.038708
2010,0.990867,0.009133
2011,0.992974,0.007026
2012,0.909643,0.090357
2013,0.869639,0.130361
2014,0.948333,0.051667
2015,0.922731,0.077269


In [48]:
chi2, p, dof, expected = scipy.stats.chi2_contingency(ctab.values)
f" The p-value between the treatment and control groups across cross-sections of the \
data is {p:0.3f}. So the composition of treatment and control groups is stable across \
cross-sections"


' The p-value between the treatment and control groups across cross-sections of the data is 0.998. So the composition of treatment and control groups is stable across cross-sections'

In [49]:
# so the difference is not statistically different across the 
# two groups for 

In [50]:
f"We see that age is not statistically significantly different between women in rural taliban controlled areas versus \
women in rural areas not controlled by the taliban. This shows that there are no baseline differences between these two groups and \
our approach to determine the causal inference of the terrorist attacks on the two groups using difference-in-difference is justified."

'We see that age is not statistically significantly different between women in rural taliban controlled areas versus women in rural areas not controlled by the taliban. This shows that there are no baseline differences between these two groups and our approach to determine the causal inference of the terrorist attacks on the two groups using difference-in-difference is justified.'