In [40]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:,.2f}'.format

In [41]:
# reading dataset for india IPUMS
data_venezuela = pd.read_stata('/users/quasar/downloads/ipumsi_00020.dta')
data_venezuela.groupby('year').count()

Unnamed: 0_level_0,country,sample,serial,hhwt,pernum,perwt,age,sex,school,edattain,edattaind,yrschool,empstat,empstatd,indgen,classwk,classwkd,hrswork1,hrswork2,incearn
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1971,955389,955389,955389,955389,955389,955389,955389,955389,955389,955389,955389,955389,955389,955389,0,955389,955389,0,955389,955389
1981,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536,1204536
1990,1518664,1518664,1518664,1518664,1518664,1518664,1518664,1518664,1518664,1518664,1518664,1518664,1518664,1518664,1518664,1518664,1518664,0,0,1518664
2001,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821,1991821


## Sector in economy - Agriculture, Industry, Unskilled services, Skilled services 

In [42]:
data_venezuela.head()

Unnamed: 0,country,year,sample,serial,hhwt,pernum,perwt,age,sex,school,edattain,edattaind,yrschool,empstat,empstatd,indgen,classwk,classwkd,hrswork1,hrswork2,incearn
0,venezuela,1971,venezuela 1971,1000,10,1,8,36,male,"no, not specified",unknown,unknown/missing,unknown/missing,employed,at work,,wage/salary worker,"wage/salary worker, private employer",,"40-48 hours (except il1972, tt1980, tt1990, an...",1750
1,venezuela,1971,venezuela 1971,1000,10,2,8,35,female,"no, not specified",unknown,unknown/missing,unknown/missing,inactive,housework,,niu (not in universe),niu (not in universe),,niu (not in universe),99999999
2,venezuela,1971,venezuela 1971,1000,10,3,8,12,female,yes,less than primary completed,some primary completed,5 years,niu (not in universe),niu (not in universe),,niu (not in universe),niu (not in universe),,niu (not in universe),99999999
3,venezuela,1971,venezuela 1971,1000,10,4,10,11,female,yes,less than primary completed,some primary completed,5 years,niu (not in universe),niu (not in universe),,niu (not in universe),niu (not in universe),,niu (not in universe),99999999
4,venezuela,1971,venezuela 1971,1000,10,5,10,9,female,yes,less than primary completed,some primary completed,3 years,niu (not in universe),niu (not in universe),,niu (not in universe),niu (not in universe),,niu (not in universe),99999999


In [43]:
data_venezuela['indgen'].replace(to_replace=['agriculture, fishing, and forestry'], value='Agriculture', inplace=True)
data_venezuela['indgen'].replace(to_replace=['construction', 'manufacturing' ,'mining and extraction'], value='Industry', inplace=True)
data_venezuela['indgen'].replace(to_replace=['hotels and restaurants','private household services',
                    'transportation, storage, and communications', 'wholesale and retail trade'], 
                    value='Unskilled service', inplace=True)
data_venezuela['indgen'].replace(to_replace=['education', 'financial services and insurance','health and social work',
                    'public administration and defense',
                    'other services', 'business services and real estate', 
                    'electricity, gas, water and waste management' ], 
                    value='Skilled service', inplace=True)

indexNames = data_venezuela[data_venezuela['indgen'].isin(['unknown', 'niu (not in universe)'])].index
data_venezuela = data_venezuela.drop(indexNames)
data_venezuela['indgen'].replace(to_replace=['niu (not in universe)', 'unknown'], value='Agriculture', inplace=True)
print("\nSector by labour share % ")
share = data_venezuela.groupby('year')['indgen'].value_counts(normalize=True)
share


Sector by labour share % 


year                   
1971  Agriculture          NaN
      Industry             NaN
      Unskilled service    NaN
      Skilled service      NaN
1981  Skilled service     0.31
      Unskilled service   0.30
      Industry            0.27
      Agriculture         0.13
1990  Agriculture         0.29
      Skilled service     0.27
      Unskilled service   0.24
      Industry            0.19
2001  Unskilled service   0.40
      Skilled service     0.29
      Industry            0.20
      Agriculture         0.10
Name: indgen, dtype: float64

## Mean of incwage (Wage) by year and sector

In [44]:
data_venezuela.yrschool.unique()

['unknown/missing', '5 years', '3 years', '4 years', 'none or pre-school', ..., '18 years or more', '12 years', 'some primary', 'some tertiary', 'special education']
Length: 24
Categories (24, object): ['none or pre-school' < '1 year' < '2 years' < '3 years' ... 'some secondary' < 'some tertiary' < 'special education' < 'unknown/missing']

In [45]:
print("\nMean of incwage by year and sector\n")
print(data_venezuela.groupby(['year','indgen', 'edattain'])['incearn'].mean())


Mean of incwage by year and sector

year  indgen             edattain                   
1971  Agriculture        less than primary completed             NaN
                         primary completed                       NaN
                         secondary completed                     NaN
                         university completed                    NaN
                         unknown                                 NaN
      Industry           less than primary completed             NaN
                         primary completed                       NaN
                         secondary completed                     NaN
                         university completed                    NaN
                         unknown                                 NaN
      Unskilled service  less than primary completed             NaN
                         primary completed                       NaN
                         secondary completed                     NaN
             

## Median of incwage by year and sector 

In [46]:
print("\nMean of incwage by year and sector\n")
print(data_venezuela.groupby(['year','indgen', 'classwk'])['incearn'].mean())


Mean of incwage by year and sector

year  indgen             classwk              
1971  Agriculture        niu (not in universe)             NaN
                         self-employed                     NaN
                         wage/salary worker                NaN
                         unpaid worker                     NaN
                         unknown/missing                   NaN
      Industry           niu (not in universe)             NaN
                         self-employed                     NaN
                         wage/salary worker                NaN
                         unpaid worker                     NaN
                         unknown/missing                   NaN
      Unskilled service  niu (not in universe)             NaN
                         self-employed                     NaN
                         wage/salary worker                NaN
                         unpaid worker                     NaN
                         unknown/m

In [57]:
print("\nMedian of incwage by year and sector\n")
print(data_venezuela.groupby(['year','indgen', 'edattain'])['incearn'].median())


Median of incwage by year and sector

year  indgen             edattain                   
1971  Agriculture        less than primary completed         NaN
                         primary completed                   NaN
                         secondary completed                 NaN
                         university completed                NaN
                         unknown                             NaN
      Industry           less than primary completed         NaN
                         primary completed                   NaN
                         secondary completed                 NaN
                         university completed                NaN
                         unknown                             NaN
      Unskilled service  less than primary completed         NaN
                         primary completed                   NaN
                         secondary completed                 NaN
                         university completed                Na

In [58]:
data_venezuela.shape

(2642772, 21)

In [59]:
data_venezuela['yrschool'].replace(to_replace= ['none or pre-school'], value = 0, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['1 year'], value = 1, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['2 years'], value = 2, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['3 years'], value = 3 , inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['4 years'], value = 4, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['5 years'], value = 5, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['6 years'], value = 6 , inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['7 years'], value = 7, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['8 years'], value = 8, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['9 years'], value = 9 , inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['10 years'], value = 10, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['11 years'], value = 11, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['12 years'], value = 12, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['13 years'], value = 13, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['14 years'], value = 14, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['15 years'], value = 15, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['16 years'], value = 16, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['17 years'], value = 17, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['18 years or more'], value = 18 , inplace=True)

data_venezuela['yrschool'].replace(to_replace= ['not specified'], value = 90, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['some primary'], value = 91, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['some technical after primary'], value = 92, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['some secondary'], value = 93, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['some tertiary'], value = 94, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['adult literacy'], value = 95, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['special education'], value = 96, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['unknown/missing'], value = 98, inplace=True)
data_venezuela['yrschool'].replace(to_replace= ['niu (not in universe)'], value = 99, inplace=True)

In [60]:
# calculting mincer returns 
# use sector dummy and sex as control 

data_reg = pd.get_dummies(data_venezuela, columns=['indgen', 'sex'], drop_first=True)
data_reg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2642772 entries, 0 to 5670409
Data columns (total 23 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   country                   category
 1   year                      category
 2   sample                    category
 3   serial                    int32   
 4   hhwt                      int16   
 5   pernum                    int8    
 6   perwt                     int16   
 7   age                       category
 8   school                    category
 9   edattain                  category
 10  edattaind                 category
 11  yrschool                  category
 12  empstat                   category
 13  empstatd                  category
 14  classwk                   category
 15  classwkd                  category
 16  hrswork1                  category
 17  hrswork2                  category
 18  incearn                   int32   
 19  indgen_Industry           uint8   
 20  in

In [61]:
data_venezuela.edattain.unique()

['unknown', 'less than primary completed', 'primary completed', 'university completed', 'secondary completed']
Categories (5, object): ['less than primary completed' < 'primary completed' < 'secondary completed' < 'university completed' < 'unknown']

In [62]:
# transforming variables in educin 

# data_reg['edattain'].replace(to_replace=['illiterate, less than primary'], value = 0 , inplace=True)
data_reg['edattain'].replace(to_replace=['less than primary completed'], value = 2 , inplace=True)
data_reg['edattain'].replace(to_replace=['primary completed'], value = 5 , inplace=True)
# data_reg['edattain'].replace(to_replace=['middle'], value = 8 , inplace=True)
data_reg['edattain'].replace(to_replace=['secondary completed'], value = 10 , inplace=True)
# data_reg['edattain'].replace(to_replace=['higher secondary'], value = 12 , inplace=True)
data_reg['edattain'].replace(to_replace=['university completed'], value = 15 , inplace=True)
data_reg['edattain'].replace(to_replace=['unknown'], value = 99 , inplace=True)

In [63]:
data_venezuela.yrschool.unique()

[98, 5, 3, 4, 0, ..., 18, 12, 91, 94, 96]
Length: 24
Categories (24, int64): [0 < 1 < 2 < 3 ... 93 < 94 < 96 < 98]

In [64]:
data_reg = data_reg.loc[data_reg.yrschool.isin([4,  0,  2,  3,  1,  6,  5, 12, 11,  8,  9, 15,  7, 13, 10,
       14, 17, 18, 16])]

In [68]:
# income 
data_reg =  data_reg[data_reg['incearn'] != 0]

Y = data_reg[['year', 'incearn']]
X = data_reg[['year', 'sex_female','yrschool', 'indgen_Industry', 
                    'indgen_Unskilled service', 'indgen_Skilled service']]

(372956, 4)
(372956,)


In [69]:
# year 1981
Y_1981 = Y.loc[Y.year == '1981']['incearn']
X_1981 = data_reg.loc[data_reg.year == '1981']
X_1981_1 = X_1981[['sex_female',
                     'indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]

print(X_1981_1.shape)
print(Y_1981.shape)

# regression of log wage on sector and sex 
reg_1 = sm.OLS(endog= np.log(Y_1981), exog= sm.add_constant(X_1981_1))
result_1 = reg_1.fit()
print(result_1.summary())

# for calculting mincer returns - adding years of schooling 
X_1981_2 = X_1981[['sex_female', 'yrschool',
                     'indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]
                     
reg_2 = sm.OLS(endog= np.log(Y_1981), exog= sm.add_constant(X_1981_2))
result_2 = reg_2.fit()
print(result_2.summary())
result_2

(372956, 4)
(372956,)


  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                incearn   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     865.7
Date:                Sun, 19 Dec 2021   Prob (F-statistic):               0.00
Time:                        19:32:06   Log-Likelihood:            -9.0396e+05
No. Observations:              372956   AIC:                         1.808e+06
Df Residuals:                  372951   BIC:                         1.808e+06
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                incearn   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     1328.
Date:                Sun, 19 Dec 2021   Prob (F-statistic):               0.00
Time:                        19:32:07   Log-Likelihood:            -9.0239e+05
No. Observations:              372956   AIC:                         1.805e+06
Df Residuals:                  372950   BIC:                         1.805e+06
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x125dfc100>

In [70]:
# year 1990
Y_1990 = Y.loc[Y.year == '1990']['incearn']
X_1990 = data_reg.loc[data_reg.year == '1990']
X_1990_1 = X_1990[['sex_female',
                     'indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]

# regression of log wage on sector and sex 
reg_3 = sm.OLS(endog= np.log(Y_1990), exog= sm.add_constant(X_1990_1))
result_3 = reg_1.fit()
print(result_3.summary())

# for calculting mincer returns - adding years of schooling 
X_1990_2 = X_1990[['sex_female', 'yrschool',
                     'indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]
                     
reg_4 = sm.OLS(endog= np.log(Y_1990), exog= sm.add_constant(X_1990_2))
result_4 = reg_4.fit()
print(result_4.summary())
result_4

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                incearn   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     865.7
Date:                Sun, 19 Dec 2021   Prob (F-statistic):               0.00
Time:                        19:32:19   Log-Likelihood:            -9.0396e+05
No. Observations:              372956   AIC:                         1.808e+06
Df Residuals:                  372951   BIC:                         1.808e+06
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x125e76970>

In [71]:
# year 2001
Y_2001 = Y.loc[Y.year == '2001']['incearn']
X_2001 = data_reg.loc[data_reg.year == '2001']
X_2001_1 = X_2001[['sex_female',
                     'indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]

# regression of log wage on sector and sex 
reg_5 = sm.OLS(endog= np.log(Y_2001), exog= sm.add_constant(X_2001_1))
result_5 = reg_1.fit()
print(result_5.summary())

# for calculting mincer returns - adding years of schooling 
X_2001_2 = X_2001[['sex_female', 'yrschool',
                     'indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]
                     
reg_6 = sm.OLS(endog= np.log(Y_2001), exog= sm.add_constant(X_2001_2))
result_6 = reg_6.fit()
print(result_6.summary())

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                incearn   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     865.7
Date:                Sun, 19 Dec 2021   Prob (F-statistic):               0.00
Time:                        19:32:48   Log-Likelihood:            -9.0396e+05
No. Observations:              372956   AIC:                         1.808e+06
Df Residuals:                  372951   BIC:                         1.808e+06
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [79]:
from statsmodels.iolib.summary2 import summary_col

results_table  = summary_col(results = [result_1, result_2, result_3, result_4, result_5, result_6],
                              float_format= '%0.2f',
                              model_names=['1981(unadj.)', '1981(adj.)',
                                           '1990(unadj.)', '1990(adj.)',
                                           '2001(unadj.)', '2001(adj.)'],
                              regressor_order=["const" ,"sex_female", "yrschool", 
                                                  "indgen_Industry", "indgen_Unskilled service",
                                                  "indgen_Skilled service"],
                              drop_omitted=True,
                              stars = True)
results_table.add_title(title = "Venezuela : Mincer Return - Results from Earnings Regressions", results= None)
print(results_table)

                 Venezuela : Mincer Return - Results from Earnings Regressions
                         1981(unadj.) 1981(adj.) 1990(unadj.) 1990(adj.) 2001(unadj.) 2001(adj.)
------------------------------------------------------------------------------------------------
const                    8.88***      8.65***    8.88***      8.09***    8.88***      6.87***   
                         (0.01)       (0.01)     (0.01)       (0.00)     (0.01)       (0.02)    
sex_female               -0.23***     -0.25***   -0.23***     -0.27***   -0.23***     -0.52***  
                         (0.01)       (0.01)     (0.01)       (0.01)     (0.01)       (0.01)    
yrschool                              0.07***                 0.07***                 0.10***   
                                      (0.00)                  (0.00)                  (0.00)    
indgen_Industry          -0.82***     -1.05***   -0.82***     0.47***    -0.82***     -0.94***  
                         (0.02)       (0.02)    

In [80]:
print(results_table.as_latex())

\begin{table}
\caption{Venezuela : Mincer Return - Results from Earnings Regressions}
\label{}
\begin{center}
\begin{tabular}{lllllll}
\hline
                          & 1981(unadj.) & 1981(adj.) & 1990(unadj.) & 1990(adj.) & 2001(unadj.) & 2001(adj.)  \\
\hline
const                     & 8.88***      & 8.65***    & 8.88***      & 8.09***    & 8.88***      & 6.87***     \\
                          & (0.01)       & (0.01)     & (0.01)       & (0.00)     & (0.01)       & (0.02)      \\
sex\_female               & -0.23***     & -0.25***   & -0.23***     & -0.27***   & -0.23***     & -0.52***    \\
                          & (0.01)       & (0.01)     & (0.01)       & (0.01)     & (0.01)       & (0.01)      \\
yrschool                  &              & 0.07***    &              & 0.07***    &              & 0.10***     \\
                          &              & (0.00)     &              & (0.00)     &              & (0.00)      \\
indgen\_Industry          & -0.82***     & -1.05***  