In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
# reading dataset for india IPUMS
data_Vietnam = pd.read_stata('/users/quasar/downloads/ipumsi_00006.dta')
data_Vietnam.groupby('year').count()

Unnamed: 0_level_0,country,sample,serial,hhwt,pernum,perwt,age,sex,school,yrschool,educin,empstat,empstatd,indgen,classwk,classwkd,incwage
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1983,533077,533077,533077,533077,533077,533077,533077,533077,533077,533077,533077,533077,533077,533077,533077,533077,533077
1987,576197,576197,576197,576197,576197,576197,576197,576197,576197,0,576197,576197,576197,576197,576197,576197,576197
1993,491484,491484,491484,491484,491484,491484,491484,491484,491484,0,491484,491484,491484,491484,491484,491484,491484
2004,533151,533151,533151,533151,533151,533151,533151,533151,533151,0,533151,533151,533151,533151,533151,533151,533151


## Sector in economy - Agriculture, Industry, Unskilled services, Skilled services 

In [4]:
data_india.head(50)

Unnamed: 0,COUNTRY,YEAR,SAMPLE,SERIAL,HHWT,PERNUM,PERWT,AGE,SEX,LIT,EDATTAIN,EDATTAIND,YRSCHOOL,EMPSTAT,EMPSTATD,LABFORCE,OCCISCO,INDGEN,CLASSWK,CLASSWKD,INCWAGE
0,356,1983,356198341,1000,1159.0,1,1159.0,32,1,2,3,311,11.0,1,110,2,3,100,2,200,155.0
1,356,1983,356198341,1000,1159.0,2,1159.0,28,2,2,2,221,10.0,3,310,1,99,0,0,0,0.0
2,356,1983,356198341,1000,1159.0,3,1159.0,6,2,2,1,120,1.0,3,330,9,99,0,0,0,0.0
3,356,1983,356198341,2000,1159.0,1,1159.0,40,1,1,1,110,0.0,1,110,2,6,10,1,100,0.0
4,356,1983,356198341,2000,1159.0,2,1159.0,34,2,1,1,110,0.0,3,310,1,99,0,0,0,0.0
5,356,1983,356198341,2000,1159.0,3,1159.0,9,2,2,2,211,5.0,3,330,9,99,0,0,0,0.0
6,356,1983,356198341,2000,1159.0,4,1159.0,7,1,2,1,120,3.0,3,330,9,99,0,0,0,0.0
7,356,1983,356198341,3000,1159.0,1,1159.0,86,2,1,1,110,1.0,1,110,2,7,30,1,100,0.0
8,356,1983,356198341,4000,1159.0,1,1159.0,50,1,1,1,110,0.0,1,110,2,9,10,2,206,35.0
9,356,1983,356198341,4000,1159.0,2,1159.0,46,2,1,1,110,0.0,3,310,1,99,0,0,0,0.0


In [5]:
data_india['indgen'].replace(to_replace=['agriculture, fishing, and forestry'], value='Agriculture', inplace=True)
data_india['indgen'].replace(to_replace=['construction', 'manufacturing' ,'mining and extraction'], value='Industry', inplace=True)
data_india['indgen'].replace(to_replace=['hotels and restaurants','private household services',
                    'transportation, storage, and communications', 'wholesale and retail trade'], 
                    value='Unskilled service', inplace=True)
data_india['indgen'].replace(to_replace=['education', 'financial services and insurance','health and social work',
                    'public administration and defense',
                    'other services', 'business services and real estate', 
                    'electricity, gas, water and waste management' ], 
                    value='Skilled service', inplace=True)

indexNames = data_india[data_india['indgen'].isin(['unknown', 'niu (not in universe)'])].index
data_india = data_india.drop(indexNames)
data_india['indgen'].replace(to_replace=['niu (not in universe)', 'unknown'], value='Agriculture', inplace=True)
print("\nSector by labour share % ")
share = data_india.groupby('year')['indgen'].value_counts(normalize=True)
share

KeyError: 'indgen'

## Mean of incwage (Wage) by year and sector

In [5]:
print("\nMean of incwage by year and sector\n")
print(data_india.groupby(['year','indgen', 'educin'])['incwage'].mean())


Mean of incwage by year and sector

year  indgen             educin                       
1983  Agriculture        illiterate, less than primary   3,552.44
                         literate, less than primary     1,350.23
                         primary                         1,348.10
                         middle                              9.02
                         secondary                       2,796.10
                         higher secondary                     NaN
                         undergraduate or graduate          15.32
                         unknown                            11.36
      Industry           illiterate, less than primary   6,953.51
                         literate, less than primary     5,805.33
                         primary                         1,603.22
                         middle                          2,260.04
                         secondary                       3,054.23
                         higher secondary         

## Median of incwage by year and sector 

In [6]:
print("\nMean of incwage by year and sector\n")
print(data_india.groupby(['year','indgen', 'classwk'])['incwage'].mean())


Mean of incwage by year and sector

year  indgen             classwk              
1983  Agriculture        niu (not in universe)    3,789.27
                         self-employed            1,354.08
                         wage/salary worker       2,768.13
                         unpaid worker            5,193.31
                         other                       51.69
                         unknown/missing               NaN
      Industry           niu (not in universe)   10,025.66
                         self-employed            1,891.12
                         wage/salary worker       6,559.96
                         unpaid worker                3.78
                         other                       31.53
                         unknown/missing               NaN
      Unskilled service  niu (not in universe)       23.42
                         self-employed            1,430.28
                         wage/salary worker       1,881.04
                         unpaid

In [7]:
print("\nMedian of incwage by year and sector\n")
print(data_india.groupby(['year','indgen', 'educin'])['incwage'].median())


Median of incwage by year and sector

year  indgen             educin                       
1983  Agriculture        illiterate, less than primary       0.00
                         literate, less than primary         0.00
                         primary                             0.00
                         middle                              0.00
                         secondary                           0.00
                         higher secondary                     NaN
                         undergraduate or graduate           0.00
                         unknown                             0.00
      Industry           illiterate, less than primary      20.00
                         literate, less than primary        24.00
                         primary                            28.00
                         middle                             40.00
                         secondary                          84.00
                         higher secondary       

In [8]:
"""
import pandas as pd

data_india = pd.read_csv('/users/quasar/downloads/ipumsi_00009.csv')
data_india.head()
"""

"\nimport pandas as pd\n\ndata_india = pd.read_csv('/users/quasar/downloads/ipumsi_00009.csv')\ndata_india.head()\n"

In [9]:
data_india['educin'].unique()

['secondary', 'illiterate, less than primary', 'literate, less than primary', 'primary', 'middle', 'undergraduate or graduate', 'unknown', 'higher secondary']
Categories (8, object): ['illiterate, less than primary' < 'literate, less than primary' < 'primary' < 'middle' < 'secondary' < 'higher secondary' < 'undergraduate or graduate' < 'unknown']

In [10]:
data_india['yrschool'].unique()

['11 years', 'none or pre-school', '4 years', '5 years', '3 years', ..., '18 years or more', '14 years', '13 years', 'unknown/missing', NaN]
Length: 21
Categories (20, object): ['none or pre-school' < '1 year' < '2 years' < '3 years' ... '16 years' < '17 years' < '18 years or more' < 'unknown/missing']

In [11]:
data_india['indgen'].unique()

['Skilled service', 'Agriculture', 'Industry', 'Unskilled service']
Categories (4, object): ['Agriculture' < 'Industry' < 'Unskilled service' < 'Skilled service']

In [12]:
data_india.shape

(874745, 18)

In [13]:
# calculting mincer returns 
# use sector dummy and sex as control 

data_reg = pd.get_dummies(data_india, columns=['indgen', 'sex'], drop_first=True)
data_reg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 874745 entries, 0 to 2133899
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   country                   874745 non-null  category
 1   year                      874745 non-null  category
 2   sample                    874745 non-null  category
 3   serial                    874745 non-null  int32   
 4   hhwt                      874745 non-null  float64 
 5   pernum                    874745 non-null  int8    
 6   perwt                     874745 non-null  float64 
 7   age                       874745 non-null  category
 8   school                    874745 non-null  category
 9   yrschool                  214414 non-null  category
 10  educin                    874745 non-null  category
 11  empstat                   874745 non-null  category
 12  empstatd                  874745 non-null  category
 13  classwk                   87

In [14]:
# transforming variables in educin 
data_reg['educin'].replace(to_replace=['illiterate, less than primary'], value = 0 , inplace=True)
data_reg['educin'].replace(to_replace=['literate, less than primary'], value = 2 , inplace=True)
data_reg['educin'].replace(to_replace=['primary'], value = 5 , inplace=True)
data_reg['educin'].replace(to_replace=['middle'], value = 8 , inplace=True)
data_reg['educin'].replace(to_replace=['secondary'], value = 10 , inplace=True)
data_reg['educin'].replace(to_replace=['higher secondary'], value = 12 , inplace=True)
data_reg['educin'].replace(to_replace=['undergraduate or graduate'], value = 15 , inplace=True)
data_reg['educin'].replace(to_replace=['unknown'], value = 99 , inplace=True)

In [15]:
data_reg['year'].unique()

['1983', '1987', '1993', '2004']
Categories (4, object): ['1983' < '1987' < '1993' < '2004']

In [16]:
# Weekly income 
data_reg = data_reg.loc[data_reg.educin != 99]
data_reg =  data_reg[data_reg['incwage'] != 0]

Y = data_reg[['year', 'incwage']]
X = data_reg[['year', 'sex_female', 'sex_unknown','educin', 'indgen_Industry', 
                    'indgen_Unskilled service', 'indgen_Skilled service']]

# year 1987
Y_1987 = Y.loc[Y.year == '1987']['incwage']
X_1987 = data_reg.loc[data_reg.year == '1987']
X_1987_1 = X_1987[['sex_female',
                     'indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]

In [17]:
print(X_1987_1.shape)
print(Y_1987.shape)

(52172, 4)
(52172,)


In [18]:
# regression of log wage on sector and sex 
reg_1 = sm.OLS(endog= np.log(Y_1987), exog= sm.add_constant(X_1987_1))
result_1 = reg_1.fit()
print(result_1.summary())

                            OLS Regression Results                            
Dep. Variable:                incwage   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.361
Method:                 Least Squares   F-statistic:                     7355.
Date:                Sat, 18 Dec 2021   Prob (F-statistic):               0.00
Time:                        15:47:45   Log-Likelihood:                -67314.
No. Observations:               52172   AIC:                         1.346e+05
Df Residuals:                   52167   BIC:                         1.347e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

  x = pd.concat(x[::order], 1)


In [20]:
# for calculting mincer returns - adding years of schooling 
X_1987_2 = X_1987[['sex_female', 'educin',
                     'indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]

reg_2 = sm.OLS(endog= np.log(Y_1987), exog= sm.add_constant(X_1987_2))
result_2 = reg_2.fit()
print(result_2.summary())
result_2

                            OLS Regression Results                            
Dep. Variable:                incwage   R-squared:                       0.480
Model:                            OLS   Adj. R-squared:                  0.480
Method:                 Least Squares   F-statistic:                     9622.
Date:                Sat, 18 Dec 2021   Prob (F-statistic):               0.00
Time:                        15:48:24   Log-Likelihood:                -61933.
No. Observations:               52172   AIC:                         1.239e+05
Df Residuals:                   52166   BIC:                         1.239e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

  x = pd.concat(x[::order], 1)


<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x11ee6dd60>

In [21]:
Y_1993 = Y.loc[Y.year == '1993']['incwage']
X_1993 = data_reg.loc[data_reg.year == '1993']
X_1993_1 = X_1993[['sex_female', 'indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]
X_1993_2 = X_1993[['sex_female', 'educin','indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]  

reg_3 = sm.OLS(endog= np.log(Y_1993), exog= sm.add_constant(X_1993_1))
result_3 = reg_3.fit()
print(result_3.summary())

                            OLS Regression Results                            
Dep. Variable:                incwage   R-squared:                       0.322
Model:                            OLS   Adj. R-squared:                  0.322
Method:                 Least Squares   F-statistic:                 1.017e+04
Date:                Sat, 18 Dec 2021   Prob (F-statistic):               0.00
Time:                        15:48:28   Log-Likelihood:            -1.1354e+05
No. Observations:               85671   AIC:                         2.271e+05
Df Residuals:                   85666   BIC:                         2.271e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

  x = pd.concat(x[::order], 1)


In [22]:
reg_4 = sm.OLS(endog= np.log(Y_1993), exog= sm.add_constant(X_1993_2))
result_4 = reg_4.fit()
print(result_4.summary())

                            OLS Regression Results                            
Dep. Variable:                incwage   R-squared:                       0.401
Model:                            OLS   Adj. R-squared:                  0.401
Method:                 Least Squares   F-statistic:                 1.146e+04
Date:                Sat, 18 Dec 2021   Prob (F-statistic):               0.00
Time:                        15:48:31   Log-Likelihood:            -1.0825e+05
No. Observations:               85671   AIC:                         2.165e+05
Df Residuals:                   85665   BIC:                         2.166e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

  x = pd.concat(x[::order], 1)


In [23]:
Y_2004 = Y.loc[Y.year == '2004']['incwage']
X_2004 = data_reg.loc[data_reg.year == '2004']
X_2004_1 = X_2004[['sex_female', 'indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]
X_2004_2 = X_2004[['sex_female', 'educin','indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']] 

reg_5 = sm.OLS(endog= np.log(Y_2004), exog= sm.add_constant(X_2004_1))
result_5 = reg_5.fit()
print(result_5.summary())

                            OLS Regression Results                            
Dep. Variable:                incwage   R-squared:                       0.436
Model:                            OLS   Adj. R-squared:                  0.436
Method:                 Least Squares   F-statistic:                 1.680e+04
Date:                Sat, 18 Dec 2021   Prob (F-statistic):               0.00
Time:                        15:48:34   Log-Likelihood:            -1.0109e+05
No. Observations:               86809   AIC:                         2.022e+05
Df Residuals:                   86804   BIC:                         2.022e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

  x = pd.concat(x[::order], 1)


In [24]:
reg_6 = sm.OLS(endog= np.log(Y_2004), exog= sm.add_constant(X_2004_2))
result_6 = reg_6.fit()
print(result_6.summary())

                            OLS Regression Results                            
Dep. Variable:                incwage   R-squared:                       0.515
Model:                            OLS   Adj. R-squared:                  0.515
Method:                 Least Squares   F-statistic:                 1.844e+04
Date:                Sat, 18 Dec 2021   Prob (F-statistic):               0.00
Time:                        15:48:37   Log-Likelihood:                -94568.
No. Observations:               86809   AIC:                         1.891e+05
Df Residuals:                   86803   BIC:                         1.892e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

  x = pd.concat(x[::order], 1)


In [25]:
from statsmodels.iolib.summary2 import summary_col

results_table  = summary_col(results = [result_1, result_2, result_3, result_4, result_5, result_6],
                              float_format= '%0.2f',
                              model_names=['1987(unadj.)', '1987(adj.)',
                                           '1993(unadj.)', '1993(adj.)',
                                           '2004(unadj.)', '2004(adj.)'],
                              regressor_order=["const" ,"sex_female", "educin", 
                                                  "indgen_Industry", "indgen_Unskilled service",
                                                  "indgen_Skilled service"],
                              drop_omitted=True,
                              stars = True)
#results_table.add_title(title = "TABLE 2: Results from Earnings Regressions", results= None)
print(results_table)


                         1987(unadj.) 1987(adj.) 1993(unadj.) 1993(adj.) 2004(unadj.) 2004(adj.)
------------------------------------------------------------------------------------------------
const                    4.25***      4.04***    4.72***      4.56***    5.57***      5.36***   
                         (0.01)       (0.01)     (0.01)       (0.01)     (0.01)       (0.01)    
sex_female               -0.71***     -0.53***   -0.53***     -0.41***   -0.60***     -0.48***  
                         (0.01)       (0.01)     (0.01)       (0.01)     (0.01)       (0.01)    
educin                                0.09***                 0.08***                 0.07***   
                                      (0.00)                  (0.00)                  (0.00)    
indgen_Industry          0.67***      0.41***    0.72***      0.48***    0.60***      0.44***   
                         (0.01)       (0.01)     (0.01)       (0.01)     (0.01)       (0.01)    
indgen_Unskilled service 0.68

In [29]:
print(results_table.as_latex())

\begin{table}
\caption{}
\label{}
\begin{center}
\begin{tabular}{lllllll}
\hline
                          & 1987(unadj.) & 1987(adj.) & 1993(unadj.) & 1993(adj.) & 2004(unadj.) & 2004(adj.)  \\
\hline
const                     & 4.25***      & 4.04***    & 4.72***      & 4.56***    & 5.57***      & 5.36***     \\
                          & (0.01)       & (0.01)     & (0.01)       & (0.01)     & (0.01)       & (0.01)      \\
sex\_female               & -0.71***     & -0.53***   & -0.53***     & -0.41***   & -0.60***     & -0.48***    \\
                          & (0.01)       & (0.01)     & (0.01)       & (0.01)     & (0.01)       & (0.01)      \\
educin                    &              & 0.09***    &              & 0.08***    &              & 0.07***     \\
                          &              & (0.00)     &              & (0.00)     &              & (0.00)      \\
indgen\_Industry          & 0.67***      & 0.41***    & 0.72***      & 0.48***    & 0.60***      & 0.44***     \\


In [26]:
Y_2004 = Y.loc[Y.year == '2004']['incwage']
X_2004 = data_reg.loc[data_reg.year == '2004']
# X_2004_1 = X_2004[['sex_female', 'indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']]
X_2004_2 = X_2004[['age', 'sex_female', 'educin','indgen_Industry', 'indgen_Unskilled service', 'indgen_Skilled service']] 
X_2004_2['age'] = X_2004_2['age'].astype(int)
X_2004_2['age2'] = X_2004_2['age']**2
X_2004_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_2004_2['age'] = X_2004_2['age'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_2004_2['age2'] = X_2004_2['age']**2


Unnamed: 0,age,sex_female,educin,indgen_Industry,indgen_Unskilled service,indgen_Skilled service,age2
1600775,45,0,0,0,0,0,2025
1600805,20,0,0,0,0,0,400
1600806,15,0,2,0,0,0,225
1600828,20,0,5,0,0,0,400
1600830,45,0,0,0,0,0,2025
...,...,...,...,...,...,...,...
2133876,37,0,10,0,0,1,1369
2133885,43,1,0,0,0,0,1849
2133887,40,0,15,0,0,1,1600
2133890,42,0,8,0,1,0,1764


In [27]:
reg_7 = sm.OLS(endog= np.log(Y_2004), exog= sm.add_constant(X_2004_2))
result_7 = reg_7.fit()
print(result_7.summary())

                            OLS Regression Results                            
Dep. Variable:                incwage   R-squared:                       0.564
Model:                            OLS   Adj. R-squared:                  0.564
Method:                 Least Squares   F-statistic:                 1.601e+04
Date:                Sat, 18 Dec 2021   Prob (F-statistic):               0.00
Time:                        15:48:49   Log-Likelihood:                -89992.
No. Observations:               86809   AIC:                         1.800e+05
Df Residuals:                   86801   BIC:                         1.801e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

  x = pd.concat(x[::order], 1)
