# Step 0 : We import the necessary modules and load the data files

In [39]:
import numpy as np
from scipy.stats import entropy
import pandas as pd
import math
import matplotlib.pyplot as plt
import wooldridge as woo
import statsmodels.formula.api as smf
import linearmodels.iv as iv

In [22]:
# Load the two prepared datasets from CSV files in the working directory.
# NOTE(review): the file names suggest two different normalisations of the
# same data (0-1 scaling vs. Gaussian standardisation) -- confirm.
dataFF_red_norm01, dataFF_red_norm_gauss = (
    pd.read_csv(csv_name)
    for csv_name in ('dataFF_red_norm01.csv', 'dataFF_red_norm_gauss.csv')
)

# Step 1 : We run naive regressions

In [71]:
# Naive OLS benchmark: regress `sum` on log(Nb_doct_30km) plus the control
# variables, using the default (non-robust) covariance estimator.
# NOTE(review): the "+1" inside log(Naissances+1) presumably guards against
# zero counts -- confirm.
reg = smf.ols(
    formula=(
        'sum~np.log(Nb_doct_30km)'
        '+share_F+APL+np.log(Standardized_population)+gender'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
)
results = reg.fit()
b = results.params

# Full statsmodels summary of the fit.
print(f'results.summary(): \n{results.summary()}\n')

# Compact regression table: coefficient, standard error, t statistic and
# p-value, each rounded to 4 decimals.
table = pd.DataFrame({'b': results.params.round(4),
                      'se': results.bse.round(4),
                      't': results.tvalues.round(4),
                      'pval': results.pvalues.round(4)})
print(f'table: \n{table}\n')

# LaTeX export. The tabular has five columns (the index plus the four value
# columns), so column_format carries one 'l' and four 'c' specifiers.
latex_table = table.to_latex(index=True, header=True, column_format='lcccc', float_format="%.4f")

print(latex_table)

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                    sum   R-squared:                       0.055
Model:                            OLS   Adj. R-squared:                  0.053
Method:                 Least Squares   F-statistic:                     23.62
Date:                Wed, 06 Nov 2024   Prob (F-statistic):           4.43e-56
Time:                        16:52:56   Log-Likelihood:                -19096.
No. Observations:                5294   AIC:                         3.822e+04
Df Residuals:                    5280   BIC:                         3.831e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------

In [72]:
# Naive OLS benchmark: regress `sum` on log(Nb_doct_D_30km) plus the control
# variables, using the default (non-robust) covariance estimator.
reg = smf.ols(
    formula=(
        'sum~np.log(Nb_doct_D_30km)'
        '+share_F+APL+np.log(Standardized_population)+gender'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
)
results = reg.fit()
b = results.params

# Full statsmodels summary of the fit.
print(f'results.summary(): \n{results.summary()}\n')

# Compact regression table: coefficient, standard error, t statistic and
# p-value, each rounded to 4 decimals.
table = pd.DataFrame({'b': results.params.round(4),
                      'se': results.bse.round(4),
                      't': results.tvalues.round(4),
                      'pval': results.pvalues.round(4)})
print(f'table: \n{table}\n')

# LaTeX export. The tabular has five columns (the index plus the four value
# columns), so column_format carries one 'l' and four 'c' specifiers.
latex_table = table.to_latex(index=True, header=True, column_format='lcccc', float_format="%.4f")

print(latex_table)

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                    sum   R-squared:                       0.054
Model:                            OLS   Adj. R-squared:                  0.051
Method:                 Least Squares   F-statistic:                     23.09
Date:                Wed, 06 Nov 2024   Prob (F-statistic):           9.94e-55
Time:                        16:53:03   Log-Likelihood:                -19100.
No. Observations:                5294   AIC:                         3.823e+04
Df Residuals:                    5280   BIC:                         3.832e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------

# Step 2 : We run first-stage regressions to find an instrument

In [96]:
# First-stage style regression: log(Nb_doct_30km) on the controls plus the
# candidate instrument Fibre. Standard errors are clustered on
# codecommunecoordstructure3.
reg = smf.ols(
    formula=(
        'np.log(Nb_doct_30km)~share_F+APL+np.log(Standardized_population)+gender'
        '+Fibre'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
)
results = reg.fit(
    cov_type='cluster',
    cov_kwds={'groups': dataFF_red_norm_gauss['codecommunecoordstructure3']},
)
b = results.params

# Full statsmodels summary of the fit.
print(f'results.summary(): \n{results.summary()}\n')

# Coefficient table (4-decimal rounding), kept as table1 for the later
# side-by-side comparison.
table1 = pd.DataFrame(
    {
        'Coefficient': results.params.round(4),
        'Std Error': results.bse.round(4),
        't-value': results.tvalues.round(4),
        'p-value': results.pvalues.round(4),
    }
)
print(f'table1: \n{table1}\n')

results.summary(): 
                             OLS Regression Results                             
Dep. Variable:     np.log(Nb_doct_30km)   R-squared:                       0.861
Model:                              OLS   Adj. R-squared:                  0.861
Method:                   Least Squares   F-statistic:                     550.0
Date:                  Thu, 07 Nov 2024   Prob (F-statistic):               0.00
Time:                          09:53:22   Log-Likelihood:                -4568.8
No. Observations:                  5294   AIC:                             9166.
Df Residuals:                      5280   BIC:                             9258.
Df Model:                            13                                         
Covariance Type:                cluster                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------

In [95]:
# First-stage style regression: log(Nb_doct_30km) on the controls plus both
# candidate instruments, Fibre and log(Deces+1). Standard errors are
# clustered on codecommunecoordstructure3.
reg = smf.ols(
    formula=(
        'np.log(Nb_doct_30km)~share_F+APL+np.log(Standardized_population)+gender'
        '+Fibre+np.log(Deces+1)'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
)
results = reg.fit(
    cov_type='cluster',
    cov_kwds={'groups': dataFF_red_norm_gauss['codecommunecoordstructure3']},
)
b = results.params

# Full statsmodels summary of the fit.
print(f'results.summary(): \n{results.summary()}\n')

# Coefficient table (4-decimal rounding), kept as table2 for the later
# side-by-side comparison.
table2 = pd.DataFrame(
    {
        'Coefficient': results.params.round(4),
        'Std Error': results.bse.round(4),
        't-value': results.tvalues.round(4),
        'p-value': results.pvalues.round(4),
    }
)
print(f'table2: \n{table2}\n')

results.summary(): 
                             OLS Regression Results                             
Dep. Variable:     np.log(Nb_doct_30km)   R-squared:                       0.862
Model:                              OLS   Adj. R-squared:                  0.862
Method:                   Least Squares   F-statistic:                     529.7
Date:                  Thu, 07 Nov 2024   Prob (F-statistic):               0.00
Time:                          09:53:18   Log-Likelihood:                -4550.2
No. Observations:                  5294   AIC:                             9130.
Df Residuals:                      5279   BIC:                             9229.
Df Model:                            14                                         
Covariance Type:                cluster                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------

In [94]:
# First-stage style regression: log(Nb_doct_30km) on the controls plus the
# candidate instrument log(Deces+1) only. Standard errors are clustered on
# codecommunecoordstructure3.
reg = smf.ols(
    formula=(
        'np.log(Nb_doct_30km)~share_F+APL+np.log(Standardized_population)+gender'
        '+np.log(Deces+1)'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
)
results = reg.fit(
    cov_type='cluster',
    cov_kwds={'groups': dataFF_red_norm_gauss['codecommunecoordstructure3']},
)
b = results.params

# Full statsmodels summary of the fit.
print(f'results.summary(): \n{results.summary()}\n')

# Coefficient table (4-decimal rounding), kept as table3 for the later
# side-by-side comparison.
table3 = pd.DataFrame(
    {
        'Coefficient': results.params.round(4),
        'Std Error': results.bse.round(4),
        't-value': results.tvalues.round(4),
        'p-value': results.pvalues.round(4),
    }
)
print(f'table3: \n{table3}\n')

results.summary(): 
                             OLS Regression Results                             
Dep. Variable:     np.log(Nb_doct_30km)   R-squared:                       0.862
Model:                              OLS   Adj. R-squared:                  0.862
Method:                   Least Squares   F-statistic:                     572.9
Date:                  Thu, 07 Nov 2024   Prob (F-statistic):               0.00
Time:                          09:53:14   Log-Likelihood:                -4551.9
No. Observations:                  5294   AIC:                             9132.
Df Residuals:                      5280   BIC:                             9224.
Df Model:                            13                                         
Covariance Type:                cluster                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------

In [97]:
# Comparison of the three instrument specifications for log(Nb_doct_30km).

# Put the three coefficient tables side by side; rows are aligned on the
# regressor names, so a regressor missing from one model shows up as NaN.
results_combined = pd.concat((table1, table2, table3), axis='columns')

# Export the combined table to LaTeX (3 decimals) and print it.
latex_table = results_combined.to_latex(
    index=True,
    float_format="%.3f",
    caption="Comparison of different instruments",
    label="tab:comparison_instruments",
)
print(latex_table)

\begin{table}
\caption{Comparison of different instruments}
\label{tab:comparison_instruments}
\begin{tabular}{lrrrrrrrrrrrr}
\toprule
 & Coefficient & Std Error & t-value & p-value & Coefficient & Std Error & t-value & p-value & Coefficient & Std Error & t-value & p-value \\
\midrule
Intercept & 5.869 & 1.528 & 3.842 & 0.000 & 5.795 & 1.503 & 3.856 & 0.000 & 5.693 & 1.511 & 3.769 & 0.000 \\
share_F & 0.078 & 0.074 & 1.056 & 0.291 & 0.063 & 0.073 & 0.860 & 0.390 & 0.062 & 0.073 & 0.843 & 0.399 \\
APL & 0.082 & 0.017 & 4.848 & 0.000 & 0.078 & 0.017 & 4.663 & 0.000 & 0.078 & 0.017 & 4.675 & 0.000 \\
np.log(Standardized_population) & -1.183 & 0.020 & -57.991 & 0.000 & -1.186 & 0.020 & -58.666 & 0.000 & -1.186 & 0.020 & -58.623 & 0.000 \\
gender & 0.008 & 0.017 & 0.476 & 0.634 & 0.006 & 0.017 & 0.357 & 0.721 & 0.006 & 0.017 & 0.360 & 0.719 \\
Fibre & 0.092 & 0.088 & 1.037 & 0.300 & 0.095 & 0.088 & 1.089 & 0.276 & NaN & NaN & NaN & NaN \\
np.log(MED14) & 0.997 & 0.180 & 5.543 & 0.000 & 0.95

In [98]:
# First-stage style regression: log(Nb_doct_D_30km) on the controls plus the
# candidate instrument Fibre. Standard errors are clustered on
# codecommunecoordstructure3.
reg = smf.ols(
    formula=(
        'np.log(Nb_doct_D_30km)~share_F+APL+np.log(Standardized_population)+gender'
        '+Fibre'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
)
results = reg.fit(
    cov_type='cluster',
    cov_kwds={'groups': dataFF_red_norm_gauss['codecommunecoordstructure3']},
)
b = results.params

# Full statsmodels summary of the fit.
print(f'results.summary(): \n{results.summary()}\n')

# Coefficient table (4-decimal rounding), kept as tabled1 for the later
# side-by-side comparison.
tabled1 = pd.DataFrame(
    {
        'Coefficient': results.params.round(4),
        'Std Error': results.bse.round(4),
        't-value': results.tvalues.round(4),
        'p-value': results.pvalues.round(4),
    }
)
print(f'tabled1: \n{tabled1}\n')

results.summary(): 
                              OLS Regression Results                              
Dep. Variable:     np.log(Nb_doct_D_30km)   R-squared:                       0.827
Model:                                OLS   Adj. R-squared:                  0.827
Method:                     Least Squares   F-statistic:                     471.4
Date:                    Thu, 07 Nov 2024   Prob (F-statistic):               0.00
Time:                            09:54:46   Log-Likelihood:                -5267.5
No. Observations:                    5294   AIC:                         1.056e+04
Df Residuals:                        5280   BIC:                         1.066e+04
Df Model:                              13                                         
Covariance Type:                  cluster                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------

In [99]:
# First-stage style regression: log(Nb_doct_D_30km) on the controls plus both
# candidate instruments, Fibre and log(Deces+1). Standard errors are
# clustered on codecommunecoordstructure3.
reg = smf.ols(
    formula=(
        'np.log(Nb_doct_D_30km)~share_F+APL+np.log(Standardized_population)+gender'
        '+Fibre+np.log(Deces+1)'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
)
results = reg.fit(
    cov_type='cluster',
    cov_kwds={'groups': dataFF_red_norm_gauss['codecommunecoordstructure3']},
)
b = results.params

# Full statsmodels summary of the fit.
print(f'results.summary(): \n{results.summary()}\n')

# Coefficient table (4-decimal rounding), kept as tabled2 for the later
# side-by-side comparison.
tabled2 = pd.DataFrame(
    {
        'Coefficient': results.params.round(4),
        'Std Error': results.bse.round(4),
        't-value': results.tvalues.round(4),
        'p-value': results.pvalues.round(4),
    }
)
print(f'tabled2: \n{tabled2}\n')

results.summary(): 
                              OLS Regression Results                              
Dep. Variable:     np.log(Nb_doct_D_30km)   R-squared:                       0.828
Model:                                OLS   Adj. R-squared:                  0.828
Method:                     Least Squares   F-statistic:                     448.0
Date:                    Thu, 07 Nov 2024   Prob (F-statistic):               0.00
Time:                            09:54:59   Log-Likelihood:                -5248.5
No. Observations:                    5294   AIC:                         1.053e+04
Df Residuals:                        5279   BIC:                         1.063e+04
Df Model:                              14                                         
Covariance Type:                  cluster                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------

In [100]:
# First-stage style regression: log(Nb_doct_D_30km) on the controls plus the
# candidate instrument log(Deces+1) only. Standard errors are clustered on
# codecommunecoordstructure3.
reg = smf.ols(
    formula=(
        'np.log(Nb_doct_D_30km)~share_F+APL+np.log(Standardized_population)+gender'
        '+np.log(Deces+1)'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
)
results = reg.fit(
    cov_type='cluster',
    cov_kwds={'groups': dataFF_red_norm_gauss['codecommunecoordstructure3']},
)
b = results.params

# Full statsmodels summary of the fit.
print(f'results.summary(): \n{results.summary()}\n')

# Coefficient table (4-decimal rounding), kept as tabled3 for the later
# side-by-side comparison.
tabled3 = pd.DataFrame({'Coefficient': results.params.round(4),
                        'Std Error': results.bse.round(4),
                        't-value': results.tvalues.round(4),
                        'p-value': results.pvalues.round(4)})
print(f'table: \n{tabled3}\n')

# LaTeX export of THIS cell's table (tabled3). Exporting `table` here would
# emit the naive-regression results and make the cell depend on whichever
# cell last assigned `table`.
# The tabular has five columns (index + four value columns), hence 'lcccc'.
latex_table = tabled3.to_latex(index=True, header=True, column_format='lcccc', float_format="%.4f")

print(latex_table)

results.summary(): 
                              OLS Regression Results                              
Dep. Variable:     np.log(Nb_doct_D_30km)   R-squared:                       0.828
Model:                                OLS   Adj. R-squared:                  0.828
Method:                     Least Squares   F-statistic:                     485.3
Date:                    Thu, 07 Nov 2024   Prob (F-statistic):               0.00
Time:                            09:55:20   Log-Likelihood:                -5252.4
No. Observations:                    5294   AIC:                         1.053e+04
Df Residuals:                        5280   BIC:                         1.062e+04
Df Model:                              13                                         
Covariance Type:                  cluster                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------

In [101]:
# Comparison of the three instrument specifications for log(Nb_doct_D_30km).

# Put the three coefficient tables side by side; rows are aligned on the
# regressor names.
results_combined = pd.concat((tabled1, tabled2, tabled3), axis='columns')

# Export the combined table to LaTeX and print it.
# NOTE(review): no float_format/caption/label is passed here, unlike the
# other comparison exports in this notebook -- confirm this is intended.
latex_table = results_combined.to_latex(
    index=True,
)
print(latex_table)  # show the LaTeX source in the cell output

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
 & Coefficient & Std Error & t-value & p-value & Coefficient & Std Error & t-value & p-value & Coefficient & Std Error & t-value & p-value \\
\midrule
Intercept & 1.047600 & 1.725300 & 0.607200 & 0.543700 & 0.961700 & 1.688800 & 0.569500 & 0.569000 & 0.789100 & 1.687000 & 0.467800 & 0.640000 \\
share_F & 0.097300 & 0.083900 & 1.159100 & 0.246400 & 0.080400 & 0.083600 & 0.961700 & 0.336200 & 0.078300 & 0.083600 & 0.936400 & 0.349100 \\
APL & 0.066200 & 0.017500 & 3.774600 & 0.000200 & 0.062000 & 0.017500 & 3.535600 & 0.000400 & 0.061700 & 0.017500 & 3.530400 & 0.000400 \\
np.log(Standardized_population) & -1.151500 & 0.023400 & -49.312100 & 0.000000 & -1.154800 & 0.023100 & -49.923800 & 0.000000 & -1.155100 & 0.023100 & -49.917900 & 0.000000 \\
gender & -0.018300 & 0.019500 & -0.941000 & 0.346700 & -0.020600 & 0.019500 & -1.056700 & 0.290600 & -0.020500 & 0.019500 & -1.051900 & 0.292800 \\
Fibre & 0.158100 & 0.097800 & 1.615900 & 0.106100 & 0.1623

# Step 3 : We run final regressions

In [103]:
# Final OLS specification: `sum` on log(Nb_doct_30km) plus the controls,
# with standard errors clustered on codecommunecoordstructure3.
reg = smf.ols(
    formula=(
        'sum~np.log(Nb_doct_30km)'
        '+share_F+APL+np.log(Standardized_population)+gender'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
)
results = reg.fit(
    cov_type='cluster',
    cov_kwds={'groups': dataFF_red_norm_gauss['codecommunecoordstructure3']},
)
b = results.params

# Full statsmodels summary of the fit.
print(f'results.summary(): \n{results.summary()}\n')

# Coefficient table (4-decimal rounding), kept as table_ols for the OLS vs
# 2SLS comparison.
# NOTE(review): the column names here (b/se/t/pval) differ from the
# 'Coefficient'/'Std Error'/... names used by the other tables.
table_ols = pd.DataFrame(
    {
        'b': results.params.round(4),
        'se': results.bse.round(4),
        't': results.tvalues.round(4),
        'pval': results.pvalues.round(4),
    }
)
print(f'table: \n{table_ols}\n')

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                    sum   R-squared:                       0.055
Model:                            OLS   Adj. R-squared:                  0.053
Method:                 Least Squares   F-statistic:                     23.81
Date:                Thu, 07 Nov 2024   Prob (F-statistic):           4.57e-54
Time:                        10:27:11   Log-Likelihood:                -19096.
No. Observations:                5294   AIC:                         3.822e+04
Df Residuals:                    5280   BIC:                         3.831e+04
Df Model:                          13                                         
Covariance Type:              cluster                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------

In [102]:
# 2SLS estimate: `sum` on the controls, with log(Nb_doct_30km) treated as
# endogenous and instrumented by log(Deces+1) (the bracketed term).
# Standard errors are clustered on codecommunecoordstructure3.
reg_iv = iv.IV2SLS.from_formula(
    formula=(
        ' sum ~ 1 + [np.log(Nb_doct_30km) ~ np.log(Deces+1)] '
        '+share_F+APL+np.log(Standardized_population)+gender'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
).fit(
    cov_type='clustered',
    clusters=dataFF_red_norm_gauss['codecommunecoordstructure3'],
)

# Print the fitted-model summary.
print(reg_iv)

# Collect the estimates in a table (unrounded) for the OLS vs 2SLS comparison.
table_iv = pd.DataFrame(
    {
        'Coefficient': reg_iv.params,
        'Std Error': reg_iv.std_errors,
        't-value': reg_iv.tstats,
        'p-value': reg_iv.pvalues,
    }
)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                    sum   R-squared:                     -0.0243
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0269
No. Observations:                5294   F-statistic:                    191.10
Date:                Thu, Nov 07 2024   P-value (F-stat)                0.0000
Time:                        10:27:00   Distribution:                 chi2(13)
Cov. Estimator:             clustered                                         
                                                                              
                                        Parameter Estimates                                        
                                 Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------------------
Intercept                          -90.415     23.372    -3.8685    

In [104]:
# Put the OLS and 2SLS tables for log(Nb_doct_30km) side by side, with rows
# aligned on the regressor names.
results_combined = pd.concat((table_ols, table_iv), axis='columns')

# Export the combined table to LaTeX (3 decimals) and print it.
latex_table = results_combined.to_latex(
    index=True,
    float_format="%.3f",
    caption="Comparison of OLS and 2SLS results",
    label="tab:comparaison_ols_2sls",
)
print(latex_table)

\begin{table}
\caption{Comparison of OLS and 2SLS results}
\label{tab:comparaison_ols_2sls}
\begin{tabular}{lrrrrrrrr}
\toprule
 & b & se & t & pval & Coefficient & Std Error & t-value & p-value \\
\midrule
Intercept & -64.424 & 14.556 & -4.426 & 0.000 & -90.415 & 23.372 & -3.868 & 0.000 \\
np.log(Nb_doct_30km) & 1.055 & 0.223 & 4.735 & 0.000 & 5.558 & 2.828 & 1.965 & 0.049 \\
share_F & -0.381 & 0.768 & -0.496 & 0.620 & -0.726 & 0.888 & -0.818 & 0.413 \\
APL & 0.329 & 0.126 & 2.619 & 0.009 & -0.040 & 0.267 & -0.150 & 0.881 \\
np.log(Standardized_population) & 1.957 & 0.337 & 5.808 & 0.000 & 7.286 & 3.355 & 2.172 & 0.030 \\
gender & -1.041 & 0.399 & -2.612 & 0.009 & -1.077 & 0.393 & -2.740 & 0.006 \\
np.log(MED14) & 3.628 & 1.690 & 2.147 & 0.032 & -0.937 & 3.372 & -0.278 & 0.781 \\
np.log(Superficie) & -0.256 & 0.193 & -1.322 & 0.186 & 0.897 & 0.744 & 1.206 & 0.228 \\
np.log(Ménages) & -4.448 & 1.287 & -3.456 & 0.001 & -1.982 & 2.235 & -0.887 & 0.375 \\
np.log(Chomeurs) & 0.073 & 0.758 

In [106]:
# Final OLS specification: `sum` on log(Nb_doct_D_30km) plus the controls,
# with standard errors clustered on codecommunecoordstructure3.
reg = smf.ols(
    formula=(
        'sum~np.log(Nb_doct_D_30km)'
        '+share_F+APL+np.log(Standardized_population)+gender'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
)
results = reg.fit(
    cov_type='cluster',
    cov_kwds={'groups': dataFF_red_norm_gauss['codecommunecoordstructure3']},
)
b = results.params

# Full statsmodels summary of the fit.
print(f'results.summary(): \n{results.summary()}\n')

# Coefficient table (4-decimal rounding), kept as table_D_OLS for the OLS vs
# 2SLS comparison.
table_D_OLS = pd.DataFrame(
    {
        'Coefficient': results.params.round(4),
        'Std Error': results.bse.round(4),
        't-value': results.tvalues.round(4),
        'p-value': results.pvalues.round(4),
    }
)
print(f'table: \n{table_D_OLS}\n')

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                    sum   R-squared:                       0.054
Model:                            OLS   Adj. R-squared:                  0.051
Method:                 Least Squares   F-statistic:                     23.33
Date:                Thu, 07 Nov 2024   Prob (F-statistic):           6.14e-53
Time:                        10:32:15   Log-Likelihood:                -19100.
No. Observations:                5294   AIC:                         3.823e+04
Df Residuals:                    5280   BIC:                         3.832e+04
Df Model:                          13                                         
Covariance Type:              cluster                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------

In [105]:
# 2SLS estimate: `sum` on the controls, with log(Nb_doct_D_30km) treated as
# endogenous and instrumented by log(Deces+1) (the bracketed term).
# Standard errors are clustered on codecommunecoordstructure3.
reg_iv = iv.IV2SLS.from_formula(
    formula=(
        ' sum ~ 1 + [np.log(Nb_doct_D_30km) ~ np.log(Deces+1)] '
        '+share_F+APL+np.log(Standardized_population)+gender'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
).fit(
    cov_type='clustered',
    clusters=dataFF_red_norm_gauss['codecommunecoordstructure3'],
)

# Print the fitted-model summary.
print(reg_iv)

# Collect the estimates in a table (unrounded) for the OLS vs 2SLS comparison.
table_D_iv = pd.DataFrame(
    {
        'Coefficient': reg_iv.params,
        'Std Error': reg_iv.std_errors,
        't-value': reg_iv.tstats,
        'p-value': reg_iv.pvalues,
    }
)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                    sum   R-squared:                     -0.0294
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0320
No. Observations:                5294   F-statistic:                    197.33
Date:                Thu, Nov 07 2024   P-value (F-stat)                0.0000
Time:                        10:32:11   Distribution:                 chi2(13)
Cov. Estimator:             clustered                                         
                                                                              
                                        Parameter Estimates                                        
                                 Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------------------
Intercept                          -62.587     16.270    -3.8467    

In [107]:
# Put the OLS and 2SLS tables for log(Nb_doct_D_30km) side by side, with rows
# aligned on the regressor names.
results_combined = pd.concat((table_D_OLS, table_D_iv), axis='columns')

# Export the combined table to LaTeX (3 decimals) and print it.
# NOTE(review): caption and label are identical to those of the
# log(Nb_doct_30km) OLS/2SLS export; if both tables go into the same LaTeX
# document the label will be multiply defined -- consider a distinct label.
latex_table = results_combined.to_latex(
    index=True,
    float_format="%.3f",
    caption="Comparison of OLS and 2SLS results",
    label="tab:comparaison_ols_2sls",
)
print(latex_table)

\begin{table}
\caption{Comparison of OLS and 2SLS results}
\label{tab:comparaison_ols_2sls}
\begin{tabular}{lrrrrrrrr}
\toprule
 & Coefficient & Std Error & t-value & p-value & Coefficient & Std Error & t-value & p-value \\
\midrule
Intercept & -59.034 & 14.516 & -4.067 & 0.000 & -62.587 & 16.270 & -3.847 & 0.000 \\
np.log(Nb_doct_D_30km) & 0.792 & 0.188 & 4.214 & 0.000 & 4.834 & 2.660 & 1.817 & 0.069 \\
share_F & -0.376 & 0.769 & -0.488 & 0.625 & -0.760 & 0.903 & -0.842 & 0.400 \\
APL & 0.363 & 0.124 & 2.921 & 0.004 & 0.096 & 0.221 & 0.436 & 0.663 \\
np.log(Standardized_population) & 1.621 & 0.296 & 5.477 & 0.000 & 6.276 & 3.065 & 2.047 & 0.041 \\
gender & -1.019 & 0.401 & -2.540 & 0.011 & -0.945 & 0.407 & -2.325 & 0.020 \\
np.log(MED14) & 3.687 & 1.694 & 2.177 & 0.029 & -1.468 & 3.878 & -0.378 & 0.705 \\
np.log(Superficie) & -0.252 & 0.196 & -1.284 & 0.199 & 1.144 & 0.920 & 1.244 & 0.214 \\
np.log(Ménages) & -4.368 & 1.317 & -3.317 & 0.001 & -1.012 & 3.179 & -0.318 & 0.750 \\
np.log(

In [84]:
# Final OLS specification: `sum` on log(same_gender_30km) plus the controls,
# with standard errors clustered on codecommunecoordstructure3.
# `gender` is listed twice in the formula; the fit reports 13 model degrees
# of freedom, i.e. the repeated term is only counted once.
reg = smf.ols(
    formula=(
        'sum~np.log(same_gender_30km)+gender'
        '+share_F+APL+np.log(Standardized_population)+gender'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
)
results = reg.fit(
    cov_type='cluster',
    cov_kwds={'groups': dataFF_red_norm_gauss['codecommunecoordstructure3']},
)
b = results.params

# Full statsmodels summary of the fit.
print(f'results.summary(): \n{results.summary()}\n')

# Coefficient table (4-decimal rounding), kept as table_g_OLS for the OLS vs
# 2SLS comparison.
table_g_OLS = pd.DataFrame({'Coefficient': results.params.round(4),
                            'Std Error': results.bse.round(4),
                            't-value': results.tvalues.round(4),
                            'p-value': results.pvalues.round(4)})
# Print the table built in this cell (table_g_OLS), not table_D_OLS from the
# log(Nb_doct_D_30km) regression.
print(f'table: \n{table_g_OLS}\n')

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                    sum   R-squared:                       0.055
Model:                            OLS   Adj. R-squared:                  0.053
Method:                 Least Squares   F-statistic:                     23.61
Date:                Thu, 07 Nov 2024   Prob (F-statistic):           1.38e-53
Time:                        09:21:41   Log-Likelihood:                -19097.
No. Observations:                5294   AIC:                         3.822e+04
Df Residuals:                    5280   BIC:                         3.831e+04
Df Model:                          13                                         
Covariance Type:              cluster                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------

In [108]:
# 2SLS estimate: `sum` on the controls, with log(same_gender_30km) treated as
# endogenous and instrumented by log(Deces+1) (the bracketed term).
# Standard errors are clustered on codecommunecoordstructure3.
# `gender` is listed twice in the formula; the reported chi2(13) statistic
# shows the repeated term is only counted once.
reg_iv = iv.IV2SLS.from_formula(
    formula=(
        ' sum ~ 1 + [np.log(same_gender_30km) ~ np.log(Deces+1)] '
        '+gender+share_F+APL+np.log(Standardized_population)+gender'
        '+np.log(MED14)+np.log(Superficie)+np.log(Ménages)'
        '+np.log(Chomeurs)+np.log(Actifs)+np.log(Population_P_actif)'
        '+np.log(Naissances+1)+np.log(Logements)'
    ),
    data=dataFF_red_norm_gauss,
).fit(
    cov_type='clustered',
    clusters=dataFF_red_norm_gauss['codecommunecoordstructure3'],
)

# Print the fitted-model summary.
print(reg_iv)

# Collect the estimates in a table (unrounded) for the OLS vs 2SLS comparison.
table_g_iv = pd.DataFrame(
    {
        'Coefficient': reg_iv.params,
        'Std Error': reg_iv.std_errors,
        't-value': reg_iv.tstats,
        'p-value': reg_iv.pvalues,
    }
)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                    sum   R-squared:                     -0.0381
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0407
No. Observations:                5294   F-statistic:                    179.31
Date:                Thu, Nov 07 2024   P-value (F-stat)                0.0000
Time:                        10:34:07   Distribution:                 chi2(13)
Cov. Estimator:             clustered                                         
                                                                              
                                        Parameter Estimates                                        
                                 Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------------------
Intercept                          -86.568     22.240    -3.8925    

In [109]:
# Put the OLS and 2SLS tables for log(same_gender_30km) side by side, with
# rows aligned on the regressor names.
results_combined = pd.concat((table_g_OLS, table_g_iv), axis='columns')

# Export the combined table to LaTeX (3 decimals) and print it.
latex_table = results_combined.to_latex(
    index=True,
    float_format="%.3f",
    caption="Comparison of OLS and 2SLS results same gender",
    label="tab:comparaison_ols_2sls_same_gender",
)
print(latex_table)

\begin{table}
\caption{Comparison of OLS and 2SLS results same gender}
\label{tab:comparaison_ols_2sls_same_gender}
\begin{tabular}{lrrrrrrrr}
\toprule
 & Coefficient & Std Error & t-value & p-value & Coefficient & Std Error & t-value & p-value \\
\midrule
Intercept & -63.307 & 14.519 & -4.360 & 0.000 & -86.568 & 22.240 & -3.892 & 0.000 \\
np.log(same_gender_30km) & 1.023 & 0.215 & 4.764 & 0.000 & 5.813 & 2.986 & 1.947 & 0.052 \\
gender & -1.115 & 0.404 & -2.759 & 0.006 & -1.499 & 0.490 & -3.058 & 0.002 \\
share_F & -0.304 & 0.774 & -0.393 & 0.695 & -0.321 & 0.896 & -0.358 & 0.721 \\
APL & 0.328 & 0.126 & 2.607 & 0.009 & -0.077 & 0.286 & -0.270 & 0.787 \\
np.log(Standardized_population) & 1.917 & 0.324 & 5.916 & 0.000 & 7.572 & 3.532 & 2.144 & 0.032 \\
np.log(MED14) & 3.636 & 1.692 & 2.150 & 0.032 & -1.327 & 3.571 & -0.372 & 0.710 \\
np.log(Superficie) & -0.263 & 0.192 & -1.369 & 0.171 & 0.969 & 0.785 & 1.234 & 0.217 \\
np.log(Ménages) & -4.468 & 1.287 & -3.473 & 0.001 & -1.861 & 2.324