## PSLM 2007-2008

## Ingest Education Data Set

In [1]:
# Ingest data on education
import pandas as pd
pd.set_option('display.max_columns', None)

C = pd.read_stata(
    "/Users/johnowusuduah/Downloads/data_in_stata_2007_08/0708_sec2a.dta", convert_categoricals=False
    )
print(C.shape)
C.sample(5)


(96463, 29)


Unnamed: 0,hhcode,sec,idc,s2aq01,s2aq02,s2aq03,s2bq01,s2bq02,s2bq03,s2bq04,s2bq05,s2bq06,s2bq07,s2bq08,s2bq09,s2bq10,s2bq11,s2bq12,s2bq13,s2bq14,s2bq15,s2bq16,s2bq17,s2bq18,s2bq19a,s2bq19b,s2bq19c,province,region
63055,3021030000.0,02A,2,1.0,1.0,1.0,2.0,,1.0,2.0,7.0,5.0,2.0,,,11.0,,,,,,,,,,,,3,1
73177,3102030000.0,02A,53,,,,3.0,,,,,,,,,,2.0,1.0,6.0,1.0,1.0,0.0,0.0,1.0,2000.0,1200.0,3200.0,3,2
2059,1021030000.0,02A,52,,,,3.0,,,,,,,,,,1.0,2.0,8.0,0.0,,,,1.0,30.0,100.0,130.0,1,1
89637,4042020000.0,02A,1,2.0,,2.0,1.0,,,,,,,,,,,,,,,,,,,,,4,2
50681,2061220000.0,02A,2,2.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,2,1


In [2]:
# s2bq01 is the variable that has 
# 1 - never attended school/institution
# 2 - attended school/institution in the past
# 3 - currently attending school/institution
C.s2bq01.value_counts()

1.0    41671
2.0    28415
3.0    26365
5.0        1
Name: s2bq01, dtype: int64

In [3]:
# PRE-PROCESS DATA FROM EDUCATION SECTION
# create "ever_admitted" column
yes_admitted = [2,3]
C.loc[C["s2bq01"].isin(yes_admitted), "ever_admitted"] = 'yes'
C.loc[C["s2bq01"] == 1, "ever_admitted"] = 'no'
# create "currently_enrolled" column
C.loc[C["s2bq01"] == 3, "currently_enrolled"] = 'yes'
C.loc[C["s2bq01"] != 3, "currently_enrolled"] = 'no'

# Optional Columns from Education Section
# Note that "max_level_achieved" and "why_not" have integer encoded, refer to questionare
# create "max_level_achieved" column --> integer encoding already done
C["max_level_achieved"] = C["s2bq05"]

# create "why_not" column --> integer encoding already done
C["why_not"] = C["s2bq02"]
# preview data
C.sample(5)

Unnamed: 0,hhcode,sec,idc,s2aq01,s2aq02,s2aq03,s2bq01,s2bq02,s2bq03,s2bq04,s2bq05,s2bq06,s2bq07,s2bq08,s2bq09,s2bq10,s2bq11,s2bq12,s2bq13,s2bq14,s2bq15,s2bq16,s2bq17,s2bq18,s2bq19a,s2bq19b,s2bq19c,province,region,ever_admitted,currently_enrolled,max_level_achieved,why_not
77977,3202010000.0,02A,51,2.0,,2.0,1.0,,,,,,,,,,,,,,,,,,,,,3,2,no,no,,
77625,3192010000.0,02A,1,2.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,3,2,no,no,,
36412,1342010000.0,02A,52,2.0,,1.0,1.0,17.0,,,,,,,,,,,,,,,,,,,,1,2,no,no,,17.0
88545,4032041000.0,02A,52,2.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,4,2,no,no,,
83100,4012040000.0,02A,53,,,,1.0,10.0,,,,,,,,,,,,,,,,,,,,4,2,no,no,,10.0


In [4]:
# subset relevant columns
C_relevant = C[
    [
        "hhcode",
        "idc",
        "ever_admitted",
        "max_level_achieved",
        "currently_enrolled",
        "why_not",
        "region",
        "province",
    ]
].copy()

C_relevant


Unnamed: 0,hhcode,idc,ever_admitted,max_level_achieved,currently_enrolled,why_not,region,province
0,1.011010e+09,1,yes,5.0,no,,1,1
1,1.011010e+09,2,yes,2.0,no,,1,1
2,1.011010e+09,51,no,,no,,1,1
3,1.011010e+09,52,no,,no,11.0,1,1
4,1.011010e+09,53,no,,no,,1,1
...,...,...,...,...,...,...,...,...
96458,4.071340e+09,53,yes,,yes,,1,4
96459,4.071340e+09,1,yes,14.0,no,,1,4
96460,4.071340e+09,51,yes,16.0,no,,1,4
96461,4.071340e+09,52,yes,,yes,,1,4


## Ingest Demography Data Set

In [5]:
# Ingest data on demographics
B = pd.read_stata(
    "/Users/johnowusuduah/Downloads/data_in_stata_2007_08/0708_roster.dta", convert_categoricals=False
)
B.sample(5)

Unnamed: 0,hhcode,sec,idc,s1aq02,s1aq03,s1aq04,age,s1aq05a,s1aq05b,s1aq05c,s1aq06,s1aq07,s1aq08,s1aq09,s1aq10,province,region,psu
92267,4012030000.0,01A,2,3,1,1,25,0,0,1982,1,,1,51,1,4,2,40120304
71870,3032020000.0,01A,52,8,2,1,28,0,0,1979,2,2.0,99,99,1,3,2,30320201
13563,1091220000.0,01A,4,3,1,1,14,0,0,1993,1,,1,51,1,1,1,10912202
101440,4052010000.0,01A,53,3,2,1,3,0,0,2004,1,,1,51,1,4,2,40520102
52380,2051221000.0,01A,53,3,2,1,7,1,1,2000,1,,1,51,1,2,1,20512206


In [6]:
# Change the names of the questions for readability
B['sex'] = B['s1aq03']
B['marital_status'] = B['s1aq06']

B_relevant = B[['hhcode', 'idc', 'age', 'sex', 'marital_status']].copy()

In [7]:
# Made the Household Code column a str type
B["hhcode"] = B["hhcode"].astype("str")
C["hhcode"] = C["hhcode"].astype("str")

In [8]:
# MERGE
years07_08 = pd.merge(B_relevant, C_relevant, on=["hhcode"], how='inner', indicator=True)

# Evaluate merge 
years07_08._merge.value_counts()

both          820891
left_only          0
right_only         0
Name: _merge, dtype: int64

In [9]:
# Select people in school enrollment age (I would say 4-10 years old)
years07_08 = years07_08.loc[(years07_08["age"] >= 4)&(years07_08["age"] <= 10)]
years07_08.sample(5)

Unnamed: 0,hhcode,idc_x,age,sex,marital_status,idc_y,ever_admitted,max_level_achieved,currently_enrolled,why_not,region,province,_merge
736491,4032020000.0,52,7,1,1,1,no,,no,,2,4,both
696955,4012030000.0,56,5,2,1,2,yes,,yes,,2,4,both
644915,3202010000.0,52,9,1,1,52,no,,no,11.0,2,3,both
794692,4061040000.0,53,6,1,1,5,yes,,yes,,1,4,both
532694,3042010000.0,56,5,2,1,56,yes,,yes,,2,3,both


In [10]:
years07_08.province.value_counts()

1    57231
3    45723
2    43445
4    34474
Name: province, dtype: int64

In [11]:
# Lets see which hh codes we have:
years07_08["hhcode"] = years07_08["hhcode"].astype("str")
years07_08['subprovince code']= years07_08['hhcode'].apply(lambda x: x[0:4])
# remove the decimal characters from the string values
years07_08['hhcode']= years07_08['hhcode'].apply(lambda x: x[0:-2])
years07_08["subprovince code"] = years07_08["subprovince code"].astype("int")
# reformat region values
years07_08.loc[(years07_08["region"] == 1), 'region'] = 'urban'
years07_08.loc[(years07_08["region"] == 2), 'region'] = 'rural'
# reformat sex values
years07_08.loc[(years07_08["sex"] == 1), 'sex'] = 'male'
years07_08.loc[(years07_08["sex"] == 2), 'sex'] = 'female'
# reformat marital status values
#years07_08.loc[(years07_08["marital_status"] == 1), 'marital_status'] = 'unmarried/never married'
#years07_08.loc[(years07_08["marital_status"] == 2), 'marital_status'] = 'married'
#years07_08.loc[(years07_08["marital_status"] == 3), 'marital_status'] = 'widow/widower'
#years07_08.loc[(years07_08["marital_status"] == 4), 'marital_status'] = 'divorced'
#years07_08.loc[(years07_08["marital_status"] == 5), 'marital_status'] = 'Nikkah solemnised Rukshati not yet'


years07_08.head()

Unnamed: 0,hhcode,idc_x,age,sex,marital_status,idc_y,ever_admitted,max_level_achieved,currently_enrolled,why_not,region,province,_merge,subprovince code
73,1011010102,55,10,female,1,1,yes,18.0,no,,urban,1,both,1011
74,1011010102,55,10,female,1,2,yes,,yes,,urban,1,both,1011
75,1011010102,55,10,female,1,51,yes,12.0,no,,urban,1,both,1011
76,1011010102,55,10,female,1,52,yes,,yes,,urban,1,both,1011
77,1011010102,55,10,female,1,53,yes,,yes,,urban,1,both,1011


In [12]:
# Now we import the stratum name - code data (ie. encodings of province, 
# sub-province code and subprovince names)
strat_name = pd.read_csv(
    "/Users/johnowusuduah/Downloads/data_in_stata_2007_08/stratum0708.csv"
).iloc[:,0:3]
strat_name.sample(5)

Unnamed: 0,province,subprovince code,subprovince
54,Punjab,1142,Gujrat
70,Punjab,1302,Rajaanpur
116,Balochistan,4022,Sibbi (Div)
64,Punjab,1242,Sahiwal
31,NWFP,3061,Bannu


In [13]:
# MERGE
years07_08_ = pd.merge(years07_08, strat_name, on=["subprovince code"], how='inner')

# Evaluate merge 
years07_08_._merge.value_counts()

both          178876
left_only          0
right_only         0
Name: _merge, dtype: int64

In [14]:
# preview stat of data
years07_08_.sample(5)

Unnamed: 0,hhcode,idc_x,age,sex,marital_status,idc_y,ever_admitted,max_level_achieved,currently_enrolled,why_not,region,province_x,_merge,subprovince code,province_y,subprovince
42598,1222040406,53,6,female,1,1,no,,no,,rural,1,both,1222,Punjab,Sheikhupura
168859,4052020301,53,8,male,1,3,yes,,yes,,rural,4,both,4052,Balochistan,Zhob (Div)
34877,1162020109,55,6,male,1,3,yes,2.0,no,,rural,1,both,1162,Punjab,Haifzabad
144410,4011010101,53,5,male,1,3,yes,,yes,,urban,4,both,4011,Balochistan,Quetta
64212,2022020110,52,9,female,1,54,yes,,yes,,rural,2,both,2022,Sindh,Sukkar


In [15]:
# FINAL DATA PRE-PROCESSING
# subset columns of interest
years07_08_f = years07_08_[["hhcode","age","idc_x","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province_y","subprovince"]].copy()
# rename index and province variables
years07_08_f = years07_08_f.rename(columns={"idc_x":"idc","province_y":"province"})
# add year to data
years07_08_f["year"] = 2007
# change NWFP to KPK
years07_08_f.loc[years07_08_f["province"] == "NWFP", "province"] = "KPK"
# reformat columns/variables for consistency across all data sets
years07_08_f["sex"] = years07_08_f["sex"].astype("category")
# preview sample of data set
years07_08_f.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
38400,1202020211,8,54,female,1,yes,no,rural,1202,Punjab,Kasur,2007
20230,1101230207,5,53,male,1,yes,yes,urban,1101,Punjab,Faisalabad,2007
13913,1072020209,10,53,female,1,no,no,rural,1072,Punjab,Bhakar,2007
134253,3162030209,7,53,female,1,no,no,rural,3162,KPK,Manshera,2007
170316,4052030313,9,52,male,1,no,no,rural,4052,Balochistan,Zhob (Div),2007


In [16]:
# check format of pre-processed data set
years07_08_f.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178876 entries, 0 to 178875
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   hhcode              178876 non-null  object  
 1   age                 178876 non-null  int8    
 2   idc                 178876 non-null  int8    
 3   sex                 178876 non-null  category
 4   marital_status      178876 non-null  int8    
 5   ever_admitted       178862 non-null  object  
 6   currently_enrolled  178876 non-null  object  
 7   region              178876 non-null  object  
 8   subprovince code    178876 non-null  int64   
 9   province            178876 non-null  object  
 10  subprovince         178876 non-null  object  
 11  year                178876 non-null  int64   
dtypes: category(1), int64(2), int8(3), object(6)
memory usage: 13.0+ MB


In [17]:
years07_08_f.to_csv("~/Downloads/revised_age/years07_08_.csv")