In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# Notebook-wide pandas display settings: keep wide frames on a single line and
# allow up to 500 rows / 500 columns before the repr is truncated.
for option_name, option_value in {
    'display.expand_frame_repr': False,
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}.items():
    pd.set_option(option_name, option_value)

# Show floats with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [2]:
# Load the South Africa IPUMS extract (Stata file) and show, per census year,
# how many non-missing values each column has.
ipums_extract_path = '/users/quasar/downloads/ipumsi_00029.dta'
data_SA = pd.read_stata(ipums_extract_path)
data_SA.groupby('year').count()

Unnamed: 0_level_0,country,sample,serial,hhwt,pernum,perwt,resident,age,sex,edattain,edattaind,yrschool,empstat,empstatd,indgen,classwk,classwkd,inctot
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1996,3621164,3621164,3621164,3621164,3621164,3621164,0,3621164,3621164,3621164,3621164,3621164,3621164,3621164,3621164,3621164,3621164,3621164
2001,3725655,3725655,3725655,3725655,3725655,3725655,0,3725655,3725655,3725655,3725655,3725655,3725655,3725655,3725655,3725655,3725655,3725655
2007,1047657,1047657,1047657,1047657,1047657,1047657,1047657,1047657,1047657,1047657,1047657,1047657,1047657,1047657,1047657,1047657,1047657,1047657
2011,4418594,4418594,4418594,4418594,4418594,4418594,0,4418594,4418594,4418594,4418594,4418594,4418594,4418594,0,0,0,4418594
2016,3328793,3328793,3328793,3328793,3328793,3328793,0,3328793,3328793,3328793,3328793,3328793,0,0,0,0,0,0


## Sector in economy - Agriculture, Industry, Unskilled services, Skilled services 

In [3]:
# Preview the first five records of the raw extract.
data_SA.head(n=5)

Unnamed: 0,country,year,sample,serial,hhwt,pernum,perwt,resident,age,sex,edattain,edattaind,yrschool,empstat,empstatd,indgen,classwk,classwkd,inctot
0,south africa,1996,south africa 1996,1000,10.23,1,10.49,,54,male,primary completed,primary (6 yrs) completed,7 years,inactive,"unable to work, disabled or health reasons",niu (not in universe),niu (not in universe),niu (not in universe),0.0
1,south africa,1996,south africa 1996,1000,10.23,2,11.8,,5,female,less than primary completed,no schooling,none or pre-school,niu (not in universe),niu (not in universe),niu (not in universe),niu (not in universe),niu (not in universe),0.0
2,south africa,1996,south africa 1996,1000,10.23,3,11.17,,44,female,primary completed,primary (6 yrs) completed,6 years,unemployed,"unemployed, not specified",niu (not in universe),niu (not in universe),niu (not in universe),0.0
3,south africa,1996,south africa 1996,1000,10.23,4,14.6,,less than 1 year,female,niu (not in universe),niu (not in universe),niu (not in universe),niu (not in universe),niu (not in universe),niu (not in universe),niu (not in universe),niu (not in universe),0.0
4,south africa,1996,south africa 1996,1000,10.23,5,11.04,,27,male,primary completed,lower secondary general completed,9 years,employed,"employed, not specified",construction,wage/salary worker,wage/salary worker,24000.0


In [4]:
# Collapse the detailed industry labels in `indgen` into four broad sectors.
# The list order is the category order of the resulting column, so
# 'Agriculture' stays the first category.
SECTORS = ['Agriculture', 'Industry', 'Unskilled service', 'Skilled service']
SECTOR_BY_INDUSTRY = {
    'agriculture, fishing, and forestry': 'Agriculture',
    'construction': 'Industry',
    'manufacturing': 'Industry',
    'mining and extraction': 'Industry',
    'hotels and restaurants': 'Unskilled service',
    'private household services': 'Unskilled service',
    'transportation, storage, and communications': 'Unskilled service',
    'wholesale and retail trade': 'Unskilled service',
    'education': 'Skilled service',
    'financial services and insurance': 'Skilled service',
    'health and social work': 'Skilled service',
    'public administration and defense': 'Skilled service',
    'other services': 'Skilled service',
    'business services and real estate': 'Skilled service',
    'electricity, gas, water and waste management': 'Skilled service',
}

# Drop persons whose industry is unknown or not in universe. Rows where
# `indgen` is missing altogether (NaN) are kept, as before.
indexNames = data_SA[data_SA['indgen'].isin(['unknown', 'niu (not in universe)'])].index
data_SA = data_SA.drop(indexNames)

# Map labels on plain objects instead of calling `replace(..., inplace=True)`
# on a column selection: merging categories through `replace` is deprecated
# for categorical columns, and in-place edits of a selection are not
# guaranteed to reach the parent frame. Labels that are already sector names
# pass through unchanged, so re-running this cell is harmless.
sector = data_SA['indgen'].astype(object).map(
    lambda label: SECTOR_BY_INDUSTRY.get(label, label)
)

# Fail loudly rather than silently turning an unmapped industry into NaN.
unexpected = set(sector.dropna().unique()) - set(SECTORS)
assert not unexpected, 'indgen labels without a sector mapping: %s' % sorted(unexpected)

# Rebuild the column with exactly the four sector categories. This also
# removes the now-empty 'unknown' / 'niu' categories, so they cannot turn into
# all-zero dummy columns later on.
data_SA['indgen'] = pd.Categorical(sector, categories=SECTORS, ordered=True)

print("\nSector by labour share % ")
# Values are fractions of each year's non-missing `indgen` rows (normalize=True).
share = data_SA.groupby('year')['indgen'].value_counts(normalize=True)
share


Sector by labour share % 


year                   
1996  Unskilled service   0.33
      Skilled service     0.30
      Industry            0.27
      Agriculture         0.10
2001  Unskilled service   0.33
      Skilled service     0.33
      Industry            0.24
      Agriculture         0.09
2007  Skilled service     0.31
      Unskilled service   0.31
      Industry            0.27
      Agriculture         0.11
2011  Agriculture          NaN
      Industry             NaN
      Unskilled service    NaN
      Skilled service      NaN
2016  Agriculture          NaN
      Industry             NaN
      Unskilled service    NaN
      Skilled service      NaN
Name: indgen, dtype: float64

## Mean of income (`inctot`) by year, sector and education

In [6]:
# Mean of `inctot` for every year x sector x education-attainment cell.
# The printed label now names the column actually aggregated (`inctot`) and
# all three grouping keys.
# NOTE(review): in the saved outputs these means are far above the matching
# medians (e.g. 131,141.59 vs 4,200.00 for the same cell). Check whether
# `inctot` holds missing-value sentinel codes before relying on the means.
# TODO confirm against the IPUMS codebook.
print("\nMean of inctot by year, sector and education\n")
print(data_SA.groupby(['year', 'indgen', 'edattain'])['inctot'].mean())


Mean of incwage by year and sector

year  indgen             edattain                   
1996  Agriculture        niu (not in universe)                  NaN
                         less than primary completed     131,141.59
                         primary completed               176,235.95
                         secondary completed             702,128.85
                         university completed            761,150.28
                         unknown                         655,894.50
      Industry           niu (not in universe)                  NaN
                         less than primary completed     184,552.52
                         primary completed               286,558.57
                         secondary completed             532,443.61
                         university completed            649,571.45
                         unknown                         840,082.29
      Unskilled service  niu (not in universe)                  NaN
                         l

## Mean (by class of worker) and median (by education) of income (`inctot`) by year and sector

In [8]:
# Mean of `inctot` for every year x sector x class-of-worker cell.
# The printed label now names the column actually aggregated (`inctot`) and
# all three grouping keys.
# NOTE(review): these means may be inflated by missing-value sentinel codes
# in `inctot`. TODO confirm against the IPUMS codebook.
print("\nMean of inctot by year, sector and class of worker\n")
print(data_SA.groupby(['year', 'indgen', 'classwk'])['inctot'].mean())


Mean of incwage by year and sector

year  indgen             classwk              
1996  Agriculture        niu (not in universe)            NaN
                         self-employed             594,954.22
                         wage/salary worker        160,801.13
                         unpaid worker             535,119.07
                         unknown/missing           333,084.49
      Industry           niu (not in universe)            NaN
                         self-employed             483,093.24
                         wage/salary worker        331,173.54
                         unpaid worker             575,673.54
                         unknown/missing           428,744.11
      Unskilled service  niu (not in universe)            NaN
                         self-employed             485,155.46
                         wage/salary worker        298,273.76
                         unpaid worker             510,446.50
                         unknown/missing        

In [10]:
# Median of `inctot` for every year x sector x education-attainment cell.
# The printed label now names the column actually aggregated (`inctot`) and
# all three grouping keys.
print("\nMedian of inctot by year, sector and education\n")
print(data_SA.groupby(['year', 'indgen', 'edattain'])['inctot'].median())


Median of incwage by year and sector

year  indgen             edattain                   
1996  Agriculture        niu (not in universe)                NaN
                         less than primary completed     4,200.00
                         primary completed               4,200.00
                         secondary completed            36,000.00
                         university completed           63,000.00
                         unknown                         4,200.00
      Industry           niu (not in universe)                NaN
                         less than primary completed     9,000.00
                         primary completed              15,000.00
                         secondary completed            24,000.00
                         university completed          114,000.00
                         unknown                        15,000.00
      Unskilled service  niu (not in universe)                NaN
                         less than primary complet

In [13]:
# List the distinct `yrschool` values currently present in the data.
data_SA.yrschool.unique()

['9 years', '7 years', '8 years', '6 years', 'none or pre-school', ..., '4 years', '2 years', '1 year', 'niu (not in universe)', 'some tertiary']
Length: 18
Categories (18, object): ['none or pre-school' < '1 year' < '2 years' < '3 years' ... 'not specified' < 'some tertiary' < 'unknown/missing' < 'niu (not in universe)']

In [17]:
# Recode the labelled `yrschool` categories to integers.
#   0-18  : years of schooling ('18 years or more' is coded 18)
#   90-99 : labels that are not a year count; the later regression sample
#           keeps only codes 0-18
YRSCHOOL_CODES = {'none or pre-school': 0, '1 year': 1, '18 years or more': 18}
YRSCHOOL_CODES.update({'{} years'.format(n): n for n in range(2, 18)})
YRSCHOOL_CODES.update({
    'not specified': 90,
    'some primary': 91,
    'some technical after primary': 92,
    'some secondary': 93,
    'some tertiary': 94,
    'adult literacy': 95,
    'special education': 96,
    'unknown/missing': 98,
    'niu (not in universe)': 99,
})

# `rename_categories` relabels the categorical in one pass and keeps the
# category order. Mapping keys that are not categories of the column are
# ignored, so the cell can be re-run safely. This replaces 28 separate
# `replace(..., inplace=True)` calls on a column selection, a pattern that is
# deprecated for categorical data.
data_SA['yrschool'] = data_SA['yrschool'].cat.rename_categories(YRSCHOOL_CODES)

In [18]:
# Check the distinct `yrschool` values after the recode.
data_SA['yrschool'].unique()

[9, 7, 8, 6, 0, ..., 4, 2, 1, 99, 94]
Length: 18
Categories (18, int64): [0 < 1 < 2 < 3 ... 90 < 94 < 98 < 99]

In [19]:
# Check the distinct sector values (and categories) left in `indgen`.
data_SA.indgen.unique()

['Industry', 'Agriculture', 'Unskilled service', 'Skilled service', NaN]
Categories (4, object): ['Agriculture' < 'Industry' < 'Unskilled service' < 'Skilled service']

In [20]:
# (rows, columns) of data_SA at this point in the notebook.
data_SA.shape

(9381952, 19)

In [21]:
# Calculating Mincer returns: use sector dummies and sex as controls.
#
# drop_first=True omits the first category of each encoded variable as the
# reference group, leaving `indgen_Industry`, `indgen_Unskilled service`,
# `indgen_Skilled service` and `sex_female`.
# The dummy dtype is pinned to uint8 so the regression design matrix stays
# numeric on pandas versions whose default dummy dtype is bool.
data_reg = pd.get_dummies(
    data_SA,
    columns=['indgen', 'sex'],
    drop_first=True,
    dtype=np.uint8,
)
data_reg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9381952 entries, 4 to 16141862
Data columns (total 21 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   country                   category
 1   year                      category
 2   sample                    category
 3   serial                    int32   
 4   hhwt                      float64 
 5   pernum                    int8    
 6   perwt                     float64 
 7   resident                  category
 8   age                       category
 9   edattain                  category
 10  edattaind                 category
 11  yrschool                  category
 12  empstat                   category
 13  empstatd                  category
 14  classwk                   category
 15  classwkd                  category
 16  inctot                    float64 
 17  indgen_Industry           uint8   
 18  indgen_Unskilled service  uint8   
 19  indgen_Skilled service    uint8   
 20  s

In [22]:
# (Removed) A string-quoted block that recoded an `educin` column used to sit
# here. It was never executed as code, and no `educin` column appears in this
# extract; years of schooling come from `yrschool` instead.

In [22]:
# Census years present in the regression frame.
data_reg.year.unique()

['1996', '2001', '2007', '2011', '2016']
Categories (5, object): ['1996' < '2001' < '2007' < '2011' < '2016']

In [23]:
# Keep only rows whose `yrschool` is one of the integer codes 0 through 18;
# every other code is filtered out here.
valid_school_years = list(range(19))
data_reg = data_reg.loc[data_reg['yrschool'].isin(valid_school_years)]

In [25]:
# Row count and column dtypes of data_reg after the yrschool filter.
data_reg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8637307 entries, 4 to 16141862
Data columns (total 21 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   country                   category
 1   year                      category
 2   sample                    category
 3   serial                    int32   
 4   hhwt                      float64 
 5   pernum                    int8    
 6   perwt                     float64 
 7   resident                  category
 8   age                       category
 9   edattain                  category
 10  edattaind                 category
 11  yrschool                  category
 12  empstat                   category
 13  empstatd                  category
 14  classwk                   category
 15  classwkd                  category
 16  inctot                    float64 
 17  indgen_Industry           uint8   
 18  indgen_Unskilled service  uint8   
 19  indgen_Skilled service    uint8   
 20  s

In [31]:
# Census years still listed for the filtered regression frame.
data_reg['year'].unique()

['1996', '2001', '2007', '2011', '2016']
Categories (5, object): ['1996' < '2001' < '2007' < '2011' < '2016']

In [34]:
# Restrict the sample to strictly positive `inctot`, because the regressions
# below use np.log(inctot) as the dependent variable. `> 0` (instead of
# `!= 0`) also removes missing and negative incomes, which would otherwise
# become NaN in the log and break the OLS fit.
data_reg = data_reg[data_reg['inctot'] > 0]

# Outcome and regressors, each carrying `year` so they can be split by census.
Y = data_reg[['year', 'inctot']]
X = data_reg[['year', 'sex_female', 'yrschool', 'indgen_Industry',
              'indgen_Unskilled service', 'indgen_Skilled service']]

(681012, 21)
(681012,)


In [37]:
# ---- Census year 1996 ----
# Select the 1996 rows of the regression sample and the matching outcome.
X_1996 = data_reg.loc[data_reg.year == '1996']
Y_1996 = Y.loc[Y.year == '1996', 'inctot']

sector_dummies = ['indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']
X_1996_1 = X_1996[['sex_female'] + sector_dummies]

print(X_1996.shape)
print(Y_1996.shape)

# Dependent variable for both specifications: log of income.
log_income_1996 = np.log(Y_1996)

# Specification 1: log income on sex and sector dummies only.
reg_1 = sm.OLS(endog=log_income_1996, exog=sm.add_constant(X_1996_1))
result_1 = reg_1.fit()
print(result_1.summary())

# Specification 2 (Mincer return): add years of schooling to the controls.
X_1996_2 = X_1996[['sex_female', 'yrschool'] + sector_dummies]

reg_2 = sm.OLS(endog=log_income_1996, exog=sm.add_constant(X_1996_2))
result_2 = reg_2.fit()
print(result_2.summary())

(681012, 21)
(681012,)


  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                 inctot   R-squared:                       0.120
Model:                            OLS   Adj. R-squared:                  0.120
Method:                 Least Squares   F-statistic:                 2.322e+04
Date:                Sun, 19 Dec 2021   Prob (F-statistic):               0.00
Time:                        18:53:57   Log-Likelihood:            -1.2558e+06
No. Observations:              681012   AIC:                         2.512e+06
Df Residuals:                  681007   BIC:                         2.512e+06
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                 inctot   R-squared:                       0.214
Model:                            OLS   Adj. R-squared:                  0.214
Method:                 Least Squares   F-statistic:                 3.706e+04
Date:                Sun, 19 Dec 2021   Prob (F-statistic):               0.00
Time:                        18:53:58   Log-Likelihood:            -1.2174e+06
No. Observations:              681012   AIC:                         2.435e+06
Df Residuals:                  681006   BIC:                         2.435e+06
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [38]:
# ---- Census year 2001 ----
# Select the 2001 rows of the regression sample and the matching outcome.
X_2001 = data_reg.loc[data_reg.year == '2001']
Y_2001 = Y.loc[Y.year == '2001', 'inctot']

sector_dummies = ['indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']
X_2001_1 = X_2001[['sex_female'] + sector_dummies]

print(X_2001.shape)
print(Y_2001.shape)

# Dependent variable for both specifications: log of income.
log_income_2001 = np.log(Y_2001)

# Specification 1: log income on sex and sector dummies only.
reg_3 = sm.OLS(endog=log_income_2001, exog=sm.add_constant(X_2001_1))
result_3 = reg_3.fit()
print(result_3.summary())

# Specification 2 (Mincer return): add years of schooling to the controls.
X_2001_2 = X_2001[['sex_female', 'yrschool'] + sector_dummies]

reg_4 = sm.OLS(endog=log_income_2001, exog=sm.add_constant(X_2001_2))
result_4 = reg_4.fit()
print(result_4.summary())

(694005, 21)
(694005,)


  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                 inctot   R-squared:                       0.189
Model:                            OLS   Adj. R-squared:                  0.189
Method:                 Least Squares   F-statistic:                 4.053e+04
Date:                Sun, 19 Dec 2021   Prob (F-statistic):               0.00
Time:                        18:54:05   Log-Likelihood:            -1.0781e+06
No. Observations:              694005   AIC:                         2.156e+06
Df Residuals:                  694000   BIC:                         2.156e+06
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                 inctot   R-squared:                       0.328
Model:                            OLS   Adj. R-squared:                  0.328
Method:                 Least Squares   F-statistic:                 6.786e+04
Date:                Sun, 19 Dec 2021   Prob (F-statistic):               0.00
Time:                        18:54:05   Log-Likelihood:            -1.0128e+06
No. Observations:              694005   AIC:                         2.026e+06
Df Residuals:                  693999   BIC:                         2.026e+06
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [39]:
# ---- Census year 2007 ----
# Select the 2007 rows of the regression sample and the matching outcome.
X_2007 = data_reg.loc[data_reg.year == '2007']
Y_2007 = Y.loc[Y.year == '2007', 'inctot']

sector_dummies = ['indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']
X_2007_1 = X_2007[['sex_female'] + sector_dummies]

print(X_2007.shape)
print(Y_2007.shape)

# Dependent variable for both specifications: log of income.
log_income_2007 = np.log(Y_2007)

# Specification 1: log income on sex and sector dummies only.
reg_5 = sm.OLS(endog=log_income_2007, exog=sm.add_constant(X_2007_1))
result_5 = reg_5.fit()
print(result_5.summary())

# Specification 2 (Mincer return): add years of schooling to the controls.
X_2007_2 = X_2007[['sex_female', 'yrschool'] + sector_dummies]

reg_6 = sm.OLS(endog=log_income_2007, exog=sm.add_constant(X_2007_2))
result_6 = reg_6.fit()
print(result_6.summary())

(191327, 21)
(191327,)
                            OLS Regression Results                            
Dep. Variable:                 inctot   R-squared:                       0.058
Model:                            OLS   Adj. R-squared:                  0.058
Method:                 Least Squares   F-statistic:                     2942.
Date:                Sun, 19 Dec 2021   Prob (F-statistic):               0.00
Time:                        18:55:00   Log-Likelihood:            -4.0519e+05
No. Observations:              191327   AIC:                         8.104e+05
Df Residuals:                  191322   BIC:                         8.104e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
c

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                 inctot   R-squared:                       0.133
Model:                            OLS   Adj. R-squared:                  0.133
Method:                 Least Squares   F-statistic:                     5885.
Date:                Sun, 19 Dec 2021   Prob (F-statistic):               0.00
Time:                        18:55:00   Log-Likelihood:            -3.9721e+05
No. Observations:              191327   AIC:                         7.944e+05
Df Residuals:                  191321   BIC:                         7.945e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [46]:
from statsmodels.iolib.summary2 import summary_col

# One column per fitted model: for each census year, the specification
# without schooling ("unadj.") followed by the one with it ("adj.").
fitted_models = [result_1, result_2, result_3, result_4, result_5, result_6]
column_labels = ['1996(unadj.)', '1996(adj.)',
                 '2001(unadj.)', '2001(adj.)',
                 '2007(unadj.)', '2007(adj.)']

# Row order of the table. With drop_omitted=True, regressors not named here
# are left out of the printed table.
row_order = ["const", "sex_female", "yrschool",
             "indgen_Industry", "indgen_Unskilled service",
             "indgen_Skilled service"]

results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    model_names=column_labels,
    regressor_order=row_order,
    drop_omitted=True,
    stars=True,
)
results_table.add_title(
    title="South Africa : Mincer Return - Results from Earnings Regressions",
    results=None,
)
print(results_table)

                South Africa : Mincer Return - Results from Earnings Regressions
                         1996(unadj.) 1996(adj.) 2001(unadj.) 2001(adj.) 2007(unadj.) 2007(adj.)
------------------------------------------------------------------------------------------------
const                    8.77***      8.13***    8.96***      8.24***    10.07***     8.97***   
                         (0.01)       (0.01)     (0.00)       (0.00)     (0.01)       (0.02)    
sex_female               -0.45***     -0.46***   -0.39***     -0.40***   -0.35***     -0.39***  
                         (0.00)       (0.00)     (0.00)       (0.00)     (0.01)       (0.01)    
yrschool                              0.14***                 0.13***                 0.16***   
                                      (0.00)                  (0.00)                  (0.00)    
indgen_Industry          1.12***      0.70***    1.09***      0.72***    0.77***      0.42***   
                         (0.01)       (0.01)  

In [47]:
# Emit the same comparison table as LaTeX source.
latex_table = results_table.as_latex()
print(latex_table)

\begin{table}
\caption{South Africa : Mincer Return - Results from Earnings Regressions}
\label{}
\begin{center}
\begin{tabular}{lllllll}
\hline
                          & 1996(unadj.) & 1996(adj.) & 2001(unadj.) & 2001(adj.) & 2007(unadj.) & 2007(adj.)  \\
\hline
const                     & 8.77***      & 8.13***    & 8.96***      & 8.24***    & 10.07***     & 8.97***     \\
                          & (0.01)       & (0.01)     & (0.00)       & (0.00)     & (0.01)       & (0.02)      \\
sex\_female               & -0.45***     & -0.46***   & -0.39***     & -0.40***   & -0.35***     & -0.39***    \\
                          & (0.00)       & (0.00)     & (0.00)       & (0.00)     & (0.01)       & (0.01)      \\
yrschool                  &              & 0.14***    &              & 0.13***    &              & 0.16***     \\
                          &              & (0.00)     &              & (0.00)     &              & (0.00)      \\
indgen\_Industry          & 1.12***      & 0.70***