In [8]:
import numpy as np
import pandas as pd
import patsy
import statsmodels.api as sm


In [19]:
# Read the data
# Link to the data:
# https://drive.google.com/file/d/15WsLH_wY-tQS1YAf-msWTmsBl77qUMrt/view?usp=sharing
# NOTE(review): absolute, machine-specific path; the file must be available at
# this location for the cell to run.
csv_path = '/Volumes/GoogleDrive/Mon Drive/Enquete_Emploi_2012/eemploi2012.csv'
# low_memory=False makes pandas parse the file in a single pass instead of in chunks.
eemploi2012df = pd.read_csv(csv_path, low_memory=False)
eemploi2012df.shape

(422133, 549)

In [26]:
# Keep only the variables used in the analysis, report how many values are
# missing in each of them, then drop every observation (row) that has at
# least one missing value.
kept_columns = [
    'SALRED', 'DDIPL', 'SEXE', 'NBHEUR', 'AG', 'CSER', 'AG5',
    'FORDAT', 'REG', 'MATRI', 'ANCENTR4', 'NAFG4N',
]
df = eemploi2012df.loc[:, kept_columns]
print(df.isna().sum())
df = df.dropna()
print(df.shape)

SALRED      362039
DDIPL         1041
SEXE             0
NBHEUR      376350
AG               0
CSER        195082
AG5              0
FORDAT       50232
REG              0
MATRI            3
ANCENTR4    223897
NAFG4N      217055
dtype: int64
(38988, 12)


In [28]:
# Dependent variable: log of SALRED per unit of NBHEUR.
# NOTE(review): presumably a log hourly wage -- confirm the units of SALRED
# and NBHEUR against the survey codebook.
df['lsal'] = np.log(df['SALRED'] / df['NBHEUR'])

# (2012 - AG) is the birth year implied by the age AG in the 2012 survey, so
# afe is FORDAT expressed as an age rather than as a calendar year.
# NOTE(review): presumably FORDAT is the year schooling ended, making afe the
# age at the end of schooling and exp the potential experience -- confirm.
df['afe'] = df['FORDAT'] - (2012 - df['AG'])
df['exp'] = df['AG'] - df['afe']

# One 0/1 indicator column per level of DDIPL and SEXE.
# dtype=int is explicit because pandas >= 2.0 returns boolean dummies by
# default; patsy would then treat SEXE_2 as categorical and name the design
# column 'SEXE_2[T.True]', breaking the later references to 'SEXE_2'.
dummies = pd.get_dummies(
    data=df[['DDIPL', 'SEXE']],
    columns=['DDIPL', 'SEXE'],
    dtype=int,
)

# Drop dummy columns left over from a previous run of this cell before
# concatenating, so that re-running the cell does not create duplicate
# column names (a no-op on the first run).
df = pd.concat(
    [df.drop(columns=dummies.columns, errors='ignore'), dummies],
    axis=1,
)
df.head()

Unnamed: 0,SALRED,DDIPL,SEXE,NBHEUR,AG,CSER,AG5,FORDAT,REG,MATRI,...,afe,exp,DDIPL_1.0,DDIPL_3.0,DDIPL_4.0,DDIPL_5.0,DDIPL_6.0,DDIPL_7.0,SEXE_1,SEXE_2
3,1379.0,7.0,2,151.0,36,5.0,30,1992.0,25,1.0,...,16.0,20.0,0,0,0,0,0,1,0,1
14,2258.0,6.0,1,140.0,53,4.0,50,1976.0,25,2.0,...,17.0,36.0,0,0,0,0,1,0,1,0
32,2472.0,7.0,1,151.0,45,6.0,40,1983.0,25,1.0,...,16.0,29.0,0,0,0,0,0,1,1,0
55,988.0,6.0,2,104.0,34,4.0,30,2000.0,31,2.0,...,22.0,12.0,0,0,0,0,1,0,0,1
58,1697.0,4.0,1,173.0,37,6.0,30,1995.0,31,2.0,...,20.0,17.0,0,0,1,0,0,0,1,0


In [29]:
# Design matrices for the regression of lsal on:
#   - the SEXE_2 indicator,
#   - the categorical controls; in patsy, `(a + b + ...)**2` expands to all
#     main effects plus all pairwise interactions of the listed terms,
#   - a fourth-degree polynomial in exp.
# patsy is imported in the notebook's import cell.
formula = """
lsal ~ SEXE_2
       + (C(DDIPL) + C(ANCENTR4) + C(NAFG4N) + C(REG) + C(CSER) + C(MATRI))**2 
       + exp + I(exp**2) + I(exp**3) + I(exp**4)      
"""
# return_type="dataframe" yields pandas DataFrames with named columns;
# Y holds the left-hand side and X the right-hand side (including the intercept).
Y, X = patsy.dmatrices(formula, df, return_type="dataframe")

In [30]:

# Full regression of Y on X by OLS, with an HC0 (heteroskedasticity-robust)
# covariance estimator; rows with missing values are dropped by the model.
ols_full_model = sm.OLS(Y, X, missing='drop')
linreg_ols1 = ols_full_model.fit(cov_type='HC0')
print(linreg_ols1.summary())


                            OLS Regression Results                            
Dep. Variable:                   lsal   R-squared:                       0.425
Model:                            OLS   Adj. R-squared:                  0.417
Method:                 Least Squares   F-statistic:                     360.1
Date:                Tue, 08 Mar 2022   Prob (F-statistic):               0.00
Time:                        11:27:54   Log-Likelihood:                -10580.
No. Observations:               38988   AIC:                         2.226e+04
Df Residuals:                   38440   BIC:                         2.695e+04
Df Model:                         547                                         
Covariance Type:                  HC0                                         
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Inte



In [31]:
# FVL application (the next cell partials W out of both D and Y)

# D: the regressor whose coefficient is of interest.
D = df['SEXE_2']
# W: every other column of the design matrix X (intercept and controls).
W = X.drop('SEXE_2', axis='columns')
# Y: the outcome, taken from df as a Series (replaces the patsy frame Y).
Y = df['lsal']



In [32]:
# 1:
# FVL operator: regress a variable on the controls W and keep what is left.
def partial_out(target, controls):
    """Regress `target` on `controls` by OLS (HC0 covariance, missing rows dropped).

    Returns a tuple `(fit, fitted, residual)`: the fitted results object, the
    in-sample predictions, and `target - fitted`.
    """
    fit = sm.OLS(endog=target, exog=controls, missing='drop').fit(cov_type='HC0')
    fitted = fit.predict()
    return fit, fitted, target - fitted


Dreg, Dhat, Dres = partial_out(D, W)
Yreg, Yhat, Yres = partial_out(Y, W)

# 2:
# Regress the residualised outcome on the residualised regressor. No constant
# is added here: Dres is the only column of the design matrix.
partial_model = sm.OLS(endog=Yres, exog=Dres, missing='drop')
Y_partialReg = partial_model.fit(cov_type='HC0')
print(Y_partialReg.summary())

                                 OLS Regression Results                                
Dep. Variable:                   lsal   R-squared (uncentered):                   0.020
Model:                            OLS   Adj. R-squared (uncentered):              0.019
Method:                 Least Squares   F-statistic:                              741.6
Date:                Tue, 08 Mar 2022   Prob (F-statistic):                   8.68e-162
Time:                        11:29:47   Log-Likelihood:                         -10580.
No. Observations:               38988   AIC:                                  2.116e+04
Df Residuals:                   38987   BIC:                                  2.117e+04
Df Model:                           1                                                  
Covariance Type:                  HC0                                                  
                 coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------