## PSLM 2007-2008

## Ingest Education Data Set

In [1]:
# Ingest data on education
import pandas as pd
pd.set_option('display.max_columns', None)

C = pd.read_stata(
    "/Users/johnowusuduah/Downloads/data_in_stata_2007_08/0708_sec2a.dta", convert_categoricals=False
    )
print(C.shape)
C.sample(5)


(96463, 29)


Unnamed: 0,hhcode,sec,idc,s2aq01,s2aq02,s2aq03,s2bq01,s2bq02,s2bq03,s2bq04,s2bq05,s2bq06,s2bq07,s2bq08,s2bq09,s2bq10,s2bq11,s2bq12,s2bq13,s2bq14,s2bq15,s2bq16,s2bq17,s2bq18,s2bq19a,s2bq19b,s2bq19c,province,region
66655,3051020000.0,02A,2,1.0,1.0,1.0,3.0,,,,,,,,,,2.0,1.0,5.0,9.0,1.0,8.0,6.0,1.0,6440.0,4000.0,10440.0,3,1
63623,3022040000.0,02A,2,1.0,1.0,1.0,2.0,,1.0,3.0,10.0,5.0,2.0,,,17.0,,,,,,,,,,,,3,2
88354,4032040000.0,02A,1,2.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,4,2
74503,3132010000.0,02A,53,,,,3.0,,,,,,,,,,2.0,1.0,5.0,0.0,,,,1.0,0.0,1200.0,1200.0,3,2
78393,3212010000.0,02A,53,1.0,1.0,1.0,2.0,,1.0,2.0,5.0,5.0,,,,,,,,,,,,,,,,3,2


In [2]:
# s2bq01 is the variable that has 
# 1 - never attended school/institution
# 2 - attended school/institution in the past
# 3 - currently attending school/institution
C.s2bq01.value_counts()

1.0    41671
2.0    28415
3.0    26365
5.0        1
Name: s2bq01, dtype: int64

In [3]:
# PRE-PROCESS DATA FROM EDUCATION SECTION
# create "ever_admitted" column
yes_admitted = [2,3]
C.loc[C["s2bq01"].isin(yes_admitted), "ever_admitted"] = 'yes'
C.loc[C["s2bq01"] == 1, "ever_admitted"] = 'no'
# create "currently_enrolled" column
C.loc[C["s2bq01"] == 3, "currently_enrolled"] = 'yes'
C.loc[C["s2bq01"] != 3, "currently_enrolled"] = 'no'

# Optional Columns from Education Section
# Note that "max_level_achieved" and "why_not" have integer encoded, refer to questionare
# create "max_level_achieved" column --> integer encoding already done
C["max_level_achieved"] = C["s2bq05"]

# create "why_not" column --> integer encoding already done
C["why_not"] = C["s2bq02"]
# preview data
C.sample(5)

Unnamed: 0,hhcode,sec,idc,s2aq01,s2aq02,s2aq03,s2bq01,s2bq02,s2bq03,s2bq04,s2bq05,s2bq06,s2bq07,s2bq08,s2bq09,s2bq10,s2bq11,s2bq12,s2bq13,s2bq14,s2bq15,s2bq16,s2bq17,s2bq18,s2bq19a,s2bq19b,s2bq19c,province,region,ever_admitted,currently_enrolled,max_level_achieved,why_not
30020,1222040000.0,02A,57,1.0,1.0,1.0,2.0,,1.0,1.0,5.0,3.0,2.0,,,6.0,,,,,,,,,,,,1,2,yes,no,5.0,
66678,3051030000.0,02A,1,2.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,3,1,no,no,,
89398,4042010000.0,02A,1,1.0,1.0,1.0,2.0,,1.0,3.0,8.0,6.0,,,,,,,,,,,,,,,,4,2,yes,no,8.0,
41848,2031020000.0,02A,1,2.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,2,1,no,no,,
94126,4062020000.0,02A,1,2.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,4,2,no,no,,


In [4]:
# subset relevant columns
C_relevant = C[
    [
        "hhcode",
        "idc",
        "ever_admitted",
        "max_level_achieved",
        "currently_enrolled",
        "why_not",
        "region",
        "province",
    ]
].copy()

C_relevant


Unnamed: 0,hhcode,idc,ever_admitted,max_level_achieved,currently_enrolled,why_not,region,province
0,1.011010e+09,1,yes,5.0,no,,1,1
1,1.011010e+09,2,yes,2.0,no,,1,1
2,1.011010e+09,51,no,,no,,1,1
3,1.011010e+09,52,no,,no,11.0,1,1
4,1.011010e+09,53,no,,no,,1,1
...,...,...,...,...,...,...,...,...
96458,4.071340e+09,53,yes,,yes,,1,4
96459,4.071340e+09,1,yes,14.0,no,,1,4
96460,4.071340e+09,51,yes,16.0,no,,1,4
96461,4.071340e+09,52,yes,,yes,,1,4


## Ingest Demography Data Set

In [5]:
# Ingest data on demographics
B = pd.read_stata(
    "/Users/johnowusuduah/Downloads/data_in_stata_2007_08/0708_roster.dta", convert_categoricals=False
)
B.sample(5)

Unnamed: 0,hhcode,sec,idc,s1aq02,s1aq03,s1aq04,age,s1aq05a,s1aq05b,s1aq05c,s1aq06,s1aq07,s1aq08,s1aq09,s1aq10,province,region,psu
7825,1052030000.0,01A,51,2,2,1,26,0,0,1981,2,1.0,99,99,1,1,2,10520301
76563,3071020000.0,01A,6,4,1,1,16,0,0,1991,1,,2,51,1,3,1,30710202
53612,2051241000.0,01A,52,3,2,1,3,23,2,2005,1,,1,51,1,2,1,20512409
102408,4052030000.0,01A,53,3,2,1,20,0,0,1988,1,,1,51,1,4,2,40520302
43916,2012030000.0,01A,3,3,1,1,10,0,0,1997,1,,1,51,1,2,2,20120301


In [6]:
# Change the names of the questions for readability
B['sex'] = B['s1aq03']
B['marital_status'] = B['s1aq06']

B_relevant = B[['hhcode', 'idc', 'age', 'sex', 'marital_status']].copy()

In [7]:
# Made the Household Code column a str type
B["hhcode"] = B["hhcode"].astype("str")
C["hhcode"] = C["hhcode"].astype("str")

In [8]:
# MERGE
years07_08 = pd.merge(B_relevant, C_relevant, on=["hhcode"], how='inner', indicator=True)

# Evaluate merge 
years07_08._merge.value_counts()

both          820891
left_only          0
right_only         0
Name: _merge, dtype: int64

In [9]:
# Select people in school enrollment age (I would say 4-15 years old)
years07_08 = years07_08.loc[(years07_08["age"] >= 4)&(years07_08["age"] <= 15)]
years07_08.sample(5)

Unnamed: 0,hhcode,idc_x,age,sex,marital_status,idc_y,ever_admitted,max_level_achieved,currently_enrolled,why_not,region,province,_merge
657667,3222030000.0,56,8,2,1,1,yes,10.0,no,,2,3,both
128110,1111330000.0,55,8,1,1,54,yes,,yes,,1,1,both
478523,2152020000.0,52,12,2,1,2,no,,no,11.0,2,2,both
162426,1141110000.0,7,15,1,1,10,yes,5.0,no,,1,1,both
544080,3052010000.0,61,4,1,1,55,no,,no,10.0,2,3,both


In [10]:
years07_08.province.value_counts()

1    96112
3    71747
2    68267
4    54618
Name: province, dtype: int64

In [11]:
# Lets see which hh codes we have:
years07_08["hhcode"] = years07_08["hhcode"].astype("str")
years07_08['subprovince code']= years07_08['hhcode'].apply(lambda x: x[0:4])
# remove the decimal characters from the string values
years07_08['hhcode']= years07_08['hhcode'].apply(lambda x: x[0:-2])
years07_08["subprovince code"] = years07_08["subprovince code"].astype("int")
# reformat region values
years07_08.loc[(years07_08["region"] == 1), 'region'] = 'urban'
years07_08.loc[(years07_08["region"] == 2), 'region'] = 'rural'
# reformat sex values
years07_08.loc[(years07_08["sex"] == 1), 'sex'] = 'male'
years07_08.loc[(years07_08["sex"] == 2), 'sex'] = 'female'
# reformat marital status values
#years07_08.loc[(years07_08["marital_status"] == 1), 'marital_status'] = 'unmarried/never married'
#years07_08.loc[(years07_08["marital_status"] == 2), 'marital_status'] = 'married'
#years07_08.loc[(years07_08["marital_status"] == 3), 'marital_status'] = 'widow/widower'
#years07_08.loc[(years07_08["marital_status"] == 4), 'marital_status'] = 'divorced'
#years07_08.loc[(years07_08["marital_status"] == 5), 'marital_status'] = 'Nikkah solemnised Rukshati not yet'


years07_08.head()

Unnamed: 0,hhcode,idc_x,age,sex,marital_status,idc_y,ever_admitted,max_level_achieved,currently_enrolled,why_not,region,province,_merge,subprovince code
33,1011010102,2,12,male,1,1,yes,18.0,no,,urban,1,both,1011
34,1011010102,2,12,male,1,2,yes,,yes,,urban,1,both,1011
35,1011010102,2,12,male,1,51,yes,12.0,no,,urban,1,both,1011
36,1011010102,2,12,male,1,52,yes,,yes,,urban,1,both,1011
37,1011010102,2,12,male,1,53,yes,,yes,,urban,1,both,1011


In [12]:
# Now we import the stratum name - code data (ie. encodings of province, 
# sub-province code and subprovince names)
strat_name = pd.read_csv(
    "/Users/johnowusuduah/Downloads/data_in_stata_2007_08/stratum0708.csv"
).iloc[:,0:3]
strat_name.sample(5)

Unnamed: 0,province,subprovince code,subprovince
10,Punjab,2061,Hyderabad
103,NWFP,3132,Hangu
37,Balochistan,4051,Zhob
78,Sindh,2032,Nawabshah
65,Punjab,1252,Multan


In [13]:
# MERGE
years07_08_ = pd.merge(years07_08, strat_name, on=["subprovince code"], how='inner')

# Evaluate merge 
years07_08_._merge.value_counts()

both          287494
left_only          0
right_only         0
Name: _merge, dtype: int64

In [14]:
# preview stat of data
years07_08_.sample(5)

Unnamed: 0,hhcode,idc_x,age,sex,marital_status,idc_y,ever_admitted,max_level_achieved,currently_enrolled,why_not,region,province_x,_merge,subprovince code,province_y,subprovince
65919,1202030103,53,6,female,1,1,no,,no,,rural,1,both,1202,Punjab,Kasur
276438,4061010105,53,11,female,1,3,yes,3.0,no,,urban,4,both,4061,Balochistan,Nasirabad
104107,2021020205,56,8,male,1,51,yes,5.0,no,,urban,2,both,2021,Sindh,Larkana
177358,3041010104,55,10,female,1,1,yes,5.0,no,,urban,3,both,3041,NWFP,D.I. Khan
156219,2142020110,52,6,male,1,1,no,,no,,rural,2,both,2142,Sindh,Mir pur Khas


In [15]:
# FINAL DATA PRE-PROCESSING
# subset columns of interest
years07_08_f = years07_08_[["hhcode","age","idc_x","sex","marital_status","ever_admitted","currently_enrolled","region","subprovince code","province_y","subprovince"]].copy()
# rename index and province variables
years07_08_f = years07_08_f.rename(columns={"idc_x":"idc","province_y":"province"})
# add year to data
years07_08_f["year"] = 2007
# change NWFP to KPK
years07_08_f.loc[years07_08_f["province"] == "NWFP", "province"] = "KPK"
# reformat columns/variables for consistency across all data sets
years07_08_f["sex"] = years07_08_f["sex"].astype("category")
# preview sample of data set
years07_08_f.sample(5)

Unnamed: 0,hhcode,age,idc,sex,marital_status,ever_admitted,currently_enrolled,region,subprovince code,province,subprovince,year
139327,2071230104,7,56,male,1,yes,yes,urban,2071,Punjab,Sukkar,2007
69605,1222010311,4,55,female,1,no,no,rural,1222,Punjab,Sheikhupura,2007
29451,1091210305,5,52,male,1,no,no,urban,1091,Punjab,Lahore,2007
32822,1092040103,8,53,male,1,no,no,rural,1092,Punjab,Mianwali,2007
284878,4071210108,8,53,female,1,yes,yes,urban,4071,Punjab,Quetta,2007


In [16]:
# check format of pre-processed data set
years07_08_f.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 287494 entries, 0 to 287493
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   hhcode              287494 non-null  object  
 1   age                 287494 non-null  int8    
 2   idc                 287494 non-null  int8    
 3   sex                 287494 non-null  category
 4   marital_status      287494 non-null  int8    
 5   ever_admitted       287474 non-null  object  
 6   currently_enrolled  287494 non-null  object  
 7   region              287494 non-null  object  
 8   subprovince code    287494 non-null  int64   
 9   province            287494 non-null  object  
 10  subprovince         287494 non-null  object  
 11  year                287494 non-null  int64   
dtypes: category(1), int64(2), int8(3), object(6)
memory usage: 20.8+ MB


In [17]:
years07_08_f.to_csv("~/Downloads/data_in_stata_2007_08/years07_08_f.csv")