In [1]:
import operator

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow
import scipy.stats as st
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.iolib.summary2 import summary_col

In [2]:
# Load the full dataset from the parquet file in the working directory.
data = pd.read_parquet(path='final_enem.parquet')

In [3]:
# Work on a reproducible 5% random sample (fixed seed) with a fresh 0..n-1 index.
df = (
    data
    .sample(frac=0.05, random_state=200)
    .reset_index(drop=True)
)

In [4]:
# Collapse the Q006 income codes into five ordinal bands (1..5).
# Income classes these bands are meant to represent (monthly amounts):
#   A: 19960.01 and above
#   B: 9980.01 - 19960 (10 to 20 minimum wages)
#   C: 3992.01 - 9980  (4 to 10 minimum wages)
#   D: 1996.01 - 3992  (2 to 4 minimum wages)
#   E: 0 - 1996        (two minimum wages)
# NOTE(review): band 1 presumably corresponds to class E (lowest income) and
# band 5 to class A (highest) -- confirm against the Q006 codebook.

# Every mask is computed from a snapshot of the raw codes, so no threshold is
# ever evaluated against a value that was already recoded.
raw_income = df['Q006'].copy()
income_bands = [
    (raw_income <= 3, 1),
    ((raw_income > 3) & (raw_income <= 6), 2),
    ((raw_income > 6) & (raw_income <= 12), 3),
    ((raw_income > 12) & (raw_income <= 15), 4),
    (raw_income > 15, 5),
]
for band_mask, band_code in income_bands:
    df.loc[band_mask, 'Q006'] = band_code

## Renomeando as variáveis

In [5]:
# Replace questionnaire codes and upper-case identifiers with descriptive names.
df = df.rename(
    columns={
        'Q001': 'FatherFormation',
        'Q002': 'MotherFormation',
        'Q003': 'FatherProfession',
        'Q004': 'MotherProfession',
        'Q006': 'Income',
        'Q025': 'Internet',
        'TP_COR_RACA': 'ColorRace',
        'REGION': 'Region',
        'HOUSE': 'House',
    }
)

In [6]:
# Remove columns that are not used by any of the models below.
unused_columns = ['CO_UF_RESIDENCIA', 'Q008', 'Q009']
df = df.drop(unused_columns, axis='columns')

In [7]:
# Show the frame after renaming and dropping columns (pandas truncates the rows).
df

Unnamed: 0,NU_NOTA_MT,ColorRace,TP_ESCOLA,FatherFormation,MotherFormation,FatherProfession,MotherProfession,Income,Internet,Region,House
0,549.0,1,2,4,5,2,2,1,1,3,4
1,643.4,3,2,2,4,3,1,1,0,2,3
2,493.9,3,2,5,2,4,5,3,1,5,5
3,390.5,3,2,0,1,0,0,1,0,2,2
4,724.5,1,2,5,4,3,1,1,1,3,4
...,...,...,...,...,...,...,...,...,...,...,...
59661,535.1,3,2,2,4,2,2,1,1,1,4
59662,472.9,1,2,3,4,5,1,3,1,4,4
59663,515.4,1,2,4,2,5,1,2,1,4,5
59664,425.0,1,2,5,4,5,3,3,1,3,4


In [8]:
# Binary indicator: 1 where TP_ESCOLA equals 2, 0 for every other value.
# NOTE(review): code 2 presumably denotes a public school -- confirm against the codebook.
df['Public'] = df['TP_ESCOLA'].eq(2).astype('int64')

In [9]:
# TP_ESCOLA has been replaced by the Public indicator, so drop the original column.
df = df.drop('TP_ESCOLA', axis='columns')

In [10]:
# Show the frame with the new Public column in place of TP_ESCOLA.
df

Unnamed: 0,NU_NOTA_MT,ColorRace,FatherFormation,MotherFormation,FatherProfession,MotherProfession,Income,Internet,Region,House,Public
0,549.0,1,4,5,2,2,1,1,3,4,1
1,643.4,3,2,4,3,1,1,0,2,3,1
2,493.9,3,5,2,4,5,3,1,5,5,1
3,390.5,3,0,1,0,0,1,0,2,2,1
4,724.5,1,5,4,3,1,1,1,3,4,1
...,...,...,...,...,...,...,...,...,...,...,...
59661,535.1,3,2,4,2,2,1,1,1,4,1
59662,472.9,1,3,4,5,1,3,1,4,4,1
59663,515.4,1,4,2,5,1,2,1,4,5,1
59664,425.0,1,5,4,5,3,3,1,3,4,1


## Regressão

In [11]:
# Baseline OLS: NU_NOTA_MT regressed on the income band treated as categorical.
# The first band is the reference level, so each coefficient is a difference
# relative to it. `smf` comes from the import cell at the top of the notebook.
lm1 = smf.ols('NU_NOTA_MT ~ C(Income)', data=df).fit()
lm1.summary()

0,1,2,3
Dep. Variable:,NU_NOTA_MT,R-squared:,0.218
Model:,OLS,Adj. R-squared:,0.217
Method:,Least Squares,F-statistic:,4146.0
Date:,"Wed, 23 Jun 2021",Prob (F-statistic):,0.0
Time:,16:51:16,Log-Likelihood:,-356870.0
No. Observations:,59666,AIC:,713800.0
Df Residuals:,59661,BIC:,713800.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,489.5969,0.503,973.510,0.000,488.611,490.583
C(Income)[T.2],53.1005,0.991,53.599,0.000,51.159,55.042
C(Income)[T.3],106.2699,1.199,88.647,0.000,103.920,108.620
C(Income)[T.4],170.3822,2.091,81.468,0.000,166.283,174.481
C(Income)[T.5],205.6280,3.410,60.309,0.000,198.945,212.311

0,1,2,3
Omnibus:,1932.06,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2124.582
Skew:,0.461,Prob(JB):,0.0
Kurtosis:,3.08,Cond. No.,9.01


In [12]:
# Add the Internet variable (entered as a numeric regressor) to the income baseline.
lm2_spec = smf.ols(formula='NU_NOTA_MT ~ C(Income) + Internet', data=df)
lm2 = lm2_spec.fit()
lm2.summary()

0,1,2,3
Dep. Variable:,NU_NOTA_MT,R-squared:,0.228
Model:,OLS,Adj. R-squared:,0.228
Method:,Least Squares,F-statistic:,3516.0
Date:,"Wed, 23 Jun 2021",Prob (F-statistic):,0.0
Time:,16:51:16,Log-Likelihood:,-356480.0
No. Observations:,59666,AIC:,713000.0
Df Residuals:,59660,BIC:,713000.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,470.3094,0.853,551.341,0.000,468.637,471.981
C(Income)[T.2],45.7628,1.019,44.917,0.000,43.766,47.760
C(Income)[T.3],97.6370,1.231,79.341,0.000,95.225,100.049
C(Income)[T.4],161.4646,2.102,76.802,0.000,157.344,165.585
C(Income)[T.5],196.5800,3.403,57.765,0.000,189.910,203.250
Internet,28.3707,1.017,27.898,0.000,26.377,30.364

0,1,2,3
Omnibus:,1797.446,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1963.416
Skew:,0.442,Prob(JB):,0.0
Kurtosis:,3.085,Cond. No.,11.6


In [13]:
# Income band plus Region and MotherProfession.
# NOTE(review): Region and MotherProfession enter as plain numeric regressors here
# (one slope each), whereas the next model wraps them in C(); confirm that a
# linear effect of the codes is intended.
lm3_spec = smf.ols(formula='NU_NOTA_MT ~ C(Income) + Region + MotherProfession', data=df)
lm3 = lm3_spec.fit()
lm3.summary()

0,1,2,3
Dep. Variable:,NU_NOTA_MT,R-squared:,0.225
Model:,OLS,Adj. R-squared:,0.225
Method:,Least Squares,F-statistic:,2883.0
Date:,"Wed, 23 Jun 2021",Prob (F-statistic):,0.0
Time:,16:51:16,Log-Likelihood:,-356590.0
No. Observations:,59666,AIC:,713200.0
Df Residuals:,59659,BIC:,713300.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,470.2811,1.179,398.991,0.000,467.971,472.591
C(Income)[T.2],46.9230,1.023,45.847,0.000,44.917,48.929
C(Income)[T.3],96.0016,1.270,75.590,0.000,93.512,98.491
C(Income)[T.4],156.8319,2.160,72.612,0.000,152.599,161.065
C(Income)[T.5],189.8632,3.461,54.865,0.000,183.081,196.646
Region,4.0742,0.384,10.618,0.000,3.322,4.826
MotherProfession,6.0324,0.288,20.914,0.000,5.467,6.598

0,1,2,3
Omnibus:,1845.66,Durbin-Watson:,1.996
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2021.115
Skew:,0.449,Prob(JB):,0.0
Kurtosis:,3.075,Cond. No.,33.1


In [14]:
# Fullest specification of this section: income, region and mother's profession
# as categorical factors, plus the Internet variable.
lm4_spec = smf.ols(
    formula='NU_NOTA_MT ~ C(Income) + C(Region) + C(MotherProfession) + Internet',
    data=df,
)
lm4 = lm4_spec.fit()
lm4.summary()

0,1,2,3
Dep. Variable:,NU_NOTA_MT,R-squared:,0.247
Model:,OLS,Adj. R-squared:,0.247
Method:,Least Squares,F-statistic:,1397.0
Date:,"Wed, 23 Jun 2021",Prob (F-statistic):,0.0
Time:,16:51:17,Log-Likelihood:,-355730.0
No. Observations:,59666,AIC:,711500.0
Df Residuals:,59651,BIC:,711600.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,445.4119,1.600,278.354,0.000,442.276,448.548
C(Income)[T.2],35.5009,1.062,33.431,0.000,33.420,37.582
C(Income)[T.3],77.8155,1.350,57.629,0.000,75.169,80.462
C(Income)[T.4],132.5421,2.273,58.313,0.000,128.087,136.997
C(Income)[T.5],159.6971,3.622,44.096,0.000,152.599,166.795
C(Region)[T.2],18.7996,1.442,13.039,0.000,15.974,21.625
C(Region)[T.3],26.5373,1.450,18.308,0.000,23.696,29.378
C(Region)[T.4],26.0043,1.736,14.982,0.000,22.602,29.406
C(Region)[T.5],10.4209,1.850,5.633,0.000,6.795,14.047

0,1,2,3
Omnibus:,1638.731,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1776.047
Skew:,0.42,Prob(JB):,0.0
Kurtosis:,3.094,Cond. No.,15.3


In [15]:
# Side-by-side comparison of lm1..lm4.
# All imports used here (summary_col, statsmodels, sklearn, operator) live in the
# import cell at the top of the notebook.

# Extra footer row: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

# Order the rows by the coefficient order of the richest model.
ordem = list(lm4.params.index)

fitted_models = [lm1, lm2, lm3, lm4]
results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    stars=True,
    # One column label per fitted model.
    model_names=[f'Model {i}' for i in range(1, len(fitted_models) + 1)],
    info_dict=info_dict,
    regressor_order=ordem,
)

results_table.add_title('Table OLS Regressions')

results_table

0,1,2,3,4
,Model 1,Model 2,Model 3,Model 4
Intercept,489.60***,470.31***,470.28***,445.41***
,(0.50),(0.85),(1.18),(1.60)
C(Income)[T.2],53.10***,45.76***,46.92***,35.50***
,(0.99),(1.02),(1.02),(1.06)
C(Income)[T.3],106.27***,97.64***,96.00***,77.82***
,(1.20),(1.23),(1.27),(1.35)
C(Income)[T.4],170.38***,161.46***,156.83***,132.54***
,(2.09),(2.10),(2.16),(2.27)
C(Income)[T.5],205.63***,196.58***,189.86***,159.70***


## Interactions

In [16]:
# Nested specifications building up to the Public x Internet interaction.
l1 = smf.ols('NU_NOTA_MT ~ C(Income)', data = df).fit()
l2 = smf.ols('NU_NOTA_MT ~ C(Income) + C(Public)', data = df).fit()
l3 = smf.ols('NU_NOTA_MT ~ C(Income) + C(Public) + Internet', data = df).fit()
l4 = smf.ols('NU_NOTA_MT ~ C(Income) + C(Public) + Internet + C(Public)*Internet', data = df).fit()

# Order the rows by the coefficient order of the richest model.
ordem = list(l4.params.index)

# Extra footer row: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

fitted_models = [l1, l2, l3, l4]
results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    stars=True,
    # Labels are derived from the list so they always match the number of columns.
    model_names=[f'Model {i}' for i in range(1, len(fitted_models) + 1)],
    info_dict=info_dict,
    regressor_order=ordem,
)

results_table.add_title('Table OLS Regressions')

results_table

0,1,2,3,4
,Model 1,Model 2,Model 3,Model 4
Intercept,489.60***,549.14***,530.31***,527.87***
,(0.50),(1.24),(1.45),(5.23)
C(Income)[T.2],53.10***,45.57***,39.42***,39.42***
,(0.99),(0.98),(1.01),(1.01)
C(Income)[T.3],106.27***,81.18***,74.49***,74.46***
,(1.20),(1.27),(1.29),(1.29)
C(Income)[T.4],170.38***,125.75***,119.47***,119.40***
,(2.09),(2.22),(2.22),(2.22)
C(Income)[T.5],205.63***,151.00***,144.95***,144.85***


In [17]:
# Nested specifications building up to the House x Region interaction.
l1 = smf.ols('NU_NOTA_MT ~ C(Income)', data = df).fit()
l2 = smf.ols('NU_NOTA_MT ~ C(Income) + House', data = df).fit()
l3 = smf.ols('NU_NOTA_MT ~ C(Income) + House + C(Region)', data = df).fit()
l4 = smf.ols('NU_NOTA_MT ~ C(Income) + House + C(Region) + House*C(Region)', data = df).fit()

# Order the rows by the coefficient order of the richest model.
ordem = list(l4.params.index)

# Extra footer row: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

fitted_models = [l1, l2, l3, l4]
results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    stars=True,
    # Labels are derived from the list so they always match the number of columns.
    model_names=[f'Model {i}' for i in range(1, len(fitted_models) + 1)],
    info_dict=info_dict,
    regressor_order=ordem,
)

results_table.add_title('Table OLS Regressions')

results_table

0,1,2,3,4
,Model 1,Model 2,Model 3,Model 4
Intercept,489.60***,456.12***,436.12***,453.78***
,(0.50),(1.28),(1.70),(3.69)
C(Income)[T.2],53.10***,45.66***,41.59***,41.56***
,(0.99),(1.02),(1.05),(1.05)
C(Income)[T.3],106.27***,90.34***,86.20***,86.12***
,(1.20),(1.32),(1.34),(1.34)
C(Income)[T.4],170.38***,144.06***,140.33***,140.14***
,(2.09),(2.27),(2.28),(2.28)
C(Income)[T.5],205.63***,169.23***,165.13***,164.38***


In [18]:
# Income baseline, a House model, then the Public / Internet models and their interaction.
# NOTE(review): l2 adds House but l3 and l4 drop it again, so this sequence is not
# nested; confirm whether l2 was meant to be 'C(Income) + C(Public)'.
l1 = smf.ols('NU_NOTA_MT ~ C(Income)', data = df).fit()
l2 = smf.ols('NU_NOTA_MT ~ C(Income) + House', data = df).fit()
l3 = smf.ols('NU_NOTA_MT ~ C(Income) + C(Public) + Internet', data = df).fit()
l4 = smf.ols('NU_NOTA_MT ~ C(Income) + C(Public) + Internet + C(Public)*Internet', data = df).fit()

# Order the rows by the coefficient order of the last model.
ordem = list(l4.params.index)

# Extra footer row: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

fitted_models = [l1, l2, l3, l4]
results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    stars=True,
    # Labels are derived from the list so they always match the number of columns.
    model_names=[f'Model {i}' for i in range(1, len(fitted_models) + 1)],
    info_dict=info_dict,
    regressor_order=ordem,
)

results_table.add_title('Table OLS Regressions')

results_table

0,1,2,3,4
,Model 1,Model 2,Model 3,Model 4
Intercept,489.60***,456.12***,530.31***,527.87***
,(0.50),(1.28),(1.45),(5.23)
C(Income)[T.2],53.10***,45.66***,39.42***,39.42***
,(0.99),(1.02),(1.01),(1.01)
C(Income)[T.3],106.27***,90.34***,74.49***,74.46***
,(1.20),(1.32),(1.29),(1.29)
C(Income)[T.4],170.38***,144.06***,119.47***,119.40***
,(2.09),(2.27),(2.22),(2.22)
C(Income)[T.5],205.63***,169.23***,144.95***,144.85***


In [19]:
# Nested specifications building up to the ColorRace x Public interaction.
l1 = smf.ols('NU_NOTA_MT ~ C(Income)', data = df).fit()
l2 = smf.ols('NU_NOTA_MT ~ C(Income) + C(ColorRace)', data = df).fit()
l3 = smf.ols('NU_NOTA_MT ~ C(Income) + C(ColorRace) + C(Public)', data = df).fit()
l4 = smf.ols('NU_NOTA_MT ~ C(Income) + C(ColorRace) + C(Public) + C(ColorRace)*C(Public)', data = df).fit()

# Order the rows by the coefficient order of the richest model.
ordem = list(l4.params.index)

# Extra footer row: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

fitted_models = [l1, l2, l3, l4]
results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    stars=True,
    # Labels are derived from the list so they always match the number of columns.
    model_names=[f'Model {i}' for i in range(1, len(fitted_models) + 1)],
    info_dict=info_dict,
    regressor_order=ordem,
)

results_table.add_title('Table OLS Regressions')

results_table

0,1,2,3,4
,Model 1,Model 2,Model 3,Model 4
Intercept,489.60***,508.99***,563.23***,565.55***
,(0.50),(0.78),(1.32),(1.49)
C(Income)[T.2],53.10***,47.03***,40.63***,40.79***
,(0.99),(1.00),(0.99),(0.99)
C(Income)[T.3],106.27***,96.82***,74.13***,73.89***
,(1.20),(1.22),(1.28),(1.28)
C(Income)[T.4],170.38***,157.83***,116.93***,116.12***
,(2.09),(2.11),(2.22),(2.23)
C(Income)[T.5],205.63***,190.39***,140.32***,138.76***


In [20]:
# Nested specifications building up to the ColorRace x Region interaction.
l1 = smf.ols('NU_NOTA_MT ~ C(Income)', data = df).fit()
l2 = smf.ols('NU_NOTA_MT ~ C(Income) + C(ColorRace)', data = df).fit()
l3 = smf.ols('NU_NOTA_MT ~ C(Income) + C(ColorRace) + C(Region)', data = df).fit()
l4 = smf.ols('NU_NOTA_MT ~ C(Income) + C(ColorRace) + C(Region) + C(ColorRace)*C(Region)', data = df).fit()

# Order the rows by the coefficient order of the richest model.
ordem = list(l4.params.index)

# Extra footer row: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

fitted_models = [l1, l2, l3, l4]
results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    stars=True,
    # Labels are derived from the list so they always match the number of columns.
    model_names=[f'Model {i}' for i in range(1, len(fitted_models) + 1)],
    info_dict=info_dict,
    regressor_order=ordem,
)

results_table.add_title('Table OLS Regressions')

results_table

0,1,2,3,4
,Model 1,Model 2,Model 3,Model 4
Intercept,489.60***,508.99***,489.12***,480.14***
,(0.50),(0.78),(1.49),(3.01)
C(Income)[T.2],53.10***,47.03***,45.30***,45.31***
,(0.99),(1.00),(1.02),(1.02)
C(Income)[T.3],106.27***,96.82***,95.17***,95.05***
,(1.20),(1.22),(1.24),(1.24)
C(Income)[T.4],170.38***,157.83***,156.45***,156.27***
,(2.09),(2.11),(2.11),(2.11)
C(Income)[T.5],205.63***,190.39***,188.48***,188.17***


In [21]:
# Nested specifications building up to the House x MotherFormation interaction.
l1 = smf.ols('NU_NOTA_MT ~ C(Income)', data = df).fit()
l2 = smf.ols('NU_NOTA_MT ~ C(Income) + House', data = df).fit()
l3 = smf.ols('NU_NOTA_MT ~ C(Income) + House + C(MotherFormation)', data = df).fit()
l4 = smf.ols('NU_NOTA_MT ~ C(Income) + House + C(MotherFormation) + House*C(MotherFormation)', data = df).fit()

# Order the rows by the coefficient order of the richest model.
ordem = list(l4.params.index)

# Extra footer row: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

fitted_models = [l1, l2, l3, l4]
results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    stars=True,
    # Labels are derived from the list so they always match the number of columns.
    model_names=[f'Model {i}' for i in range(1, len(fitted_models) + 1)],
    info_dict=info_dict,
    regressor_order=ordem,
)

results_table.add_title('Table OLS Regressions')

results_table

0,1,2,3,4
,Model 1,Model 2,Model 3,Model 4
Intercept,489.60***,456.12***,430.20***,452.81***
,(0.50),(1.28),(2.89),(8.14)
C(Income)[T.2],53.10***,45.66***,35.64***,36.21***
,(0.99),(1.02),(1.03),(1.04)
C(Income)[T.3],106.27***,90.34***,72.29***,71.78***
,(1.20),(1.32),(1.38),(1.39)
C(Income)[T.4],170.38***,144.06***,120.78***,117.30***
,(2.09),(2.27),(2.34),(2.38)
C(Income)[T.5],205.63***,169.23***,144.84***,138.01***


In [26]:
# Re-inspect the modelling frame before the category-reduction steps below.
df

Unnamed: 0,NU_NOTA_MT,ColorRace,FatherFormation,MotherFormation,FatherProfession,MotherProfession,Income,Internet,Region,House,Public
0,549.0,1,4,5,2,2,1,1,3,4,1
1,643.4,3,2,4,3,1,1,0,2,3,1
2,493.9,3,5,2,4,5,3,1,5,5,1
3,390.5,3,0,1,0,0,1,0,2,2,1
4,724.5,1,5,4,3,1,1,1,3,4,1
...,...,...,...,...,...,...,...,...,...,...,...
59661,535.1,3,2,4,2,2,1,1,1,4,1
59662,472.9,1,3,4,5,1,3,1,4,4,1
59663,515.4,1,4,2,5,1,2,1,4,5,1
59664,425.0,1,5,4,5,3,3,1,3,4,1


In [None]:
### Reducing variables

In [27]:
# Binary indicator: 0 where ColorRace equals 1, 1 for every other value.
# NOTE(review): any code other than 1 (including a possible "not declared" code)
# is flagged as 1 -- confirm this grouping against the codebook.
df['NotWhite'] = df['ColorRace'].ne(1).astype('int64')

In [31]:
# Collapse FatherFormation into three levels:
#   codes <= 3 -> 1, code 4 -> 2, codes 5-6 -> 3, and code 7 -> 1.
# NOTE(review): code 7 is folded into level 1; presumably it is a
# "does not know" answer -- confirm that grouping is intended.
#
# The recode always starts from an untouched copy of the original codes, so
# re-running this cell yields the same result instead of collapsing the
# already-recoded levels 1-3 into 1.
if 'FatherFormationRaw' not in df.columns:
    df['FatherFormationRaw'] = df['FatherFormation']

father_raw = df['FatherFormationRaw']
father_levels = father_raw.copy()
for level_mask, level_code in [
    (father_raw <= 3, 1),
    ((father_raw > 3) & (father_raw <= 4), 2),
    ((father_raw > 4) & (father_raw <= 6), 3),
    (father_raw == 7, 1),
]:
    father_levels[level_mask] = level_code

df['FatherFormation'] = father_levels

In [39]:
# Nested specifications building up to the Income x NotWhite interaction.
l1 = smf.ols('NU_NOTA_MT ~ C(Income)', data = df).fit()
l2 = smf.ols('NU_NOTA_MT ~ C(Income) + C(NotWhite)', data = df).fit()
l3 = smf.ols('NU_NOTA_MT ~ C(Income) + C(NotWhite) + C(Income)*C(NotWhite)', data = df).fit()

# Order the rows by the coefficient order of the richest model.
ordem = list(l3.params.index)

# Extra footer row: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

fitted_models = [l1, l2, l3]
results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    stars=True,
    # Labels are derived from the list so they always match the number of columns.
    model_names=[f'Model {i}' for i in range(1, len(fitted_models) + 1)],
    info_dict=info_dict,
    regressor_order=ordem,
)

results_table.add_title('Table OLS Regressions')

results_table

0,1,2,3
,Model 1,Model 2,Model 3
Intercept,489.60***,508.90***,506.58***
,(0.50),(0.78),(0.93)
C(Income)[T.2],53.10***,47.11***,49.28***
,(0.99),(1.00),(1.51)
C(Income)[T.3],106.27***,97.02***,103.52***
,(1.20),(1.22),(1.65)
C(Income)[T.4],170.38***,158.12***,163.37***
,(2.09),(2.11),(2.52)
C(Income)[T.5],205.63***,190.72***,193.04***


In [38]:
# Nested specifications building up to the Income x House interaction.
l1 = smf.ols('NU_NOTA_MT ~ C(Income)', data = df).fit()
l2 = smf.ols('NU_NOTA_MT ~ C(Income) + House', data = df).fit()
l3 = smf.ols('NU_NOTA_MT ~ C(Income) + House + C(Income)*House', data = df).fit()

# Order the rows by the coefficient order of the richest model.
ordem = list(l3.params.index)

# Extra footer row: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

fitted_models = [l1, l2, l3]
results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    stars=True,
    # Labels are derived from the list so they always match the number of columns.
    model_names=[f'Model {i}' for i in range(1, len(fitted_models) + 1)],
    info_dict=info_dict,
    regressor_order=ordem,
)

results_table.add_title('Table OLS Regressions')

results_table

0,1,2,3
,Model 1,Model 2,Model 3
Intercept,489.60***,456.12***,459.18***
,(0.50),(1.28),(1.75)
C(Income)[T.2],53.10***,45.66***,39.80***
,(0.99),(1.02),(3.54)
C(Income)[T.3],106.27***,90.34***,76.16***
,(1.20),(1.32),(4.40)
C(Income)[T.4],170.38***,144.06***,145.87***
,(2.09),(2.27),(8.98)
C(Income)[T.5],205.63***,169.23***,209.78***


In [37]:
# Nested specifications building up to the Income x Public interaction.
l1 = smf.ols('NU_NOTA_MT ~ C(Income)', data = df).fit()
l2 = smf.ols('NU_NOTA_MT ~ C(Income) + C(Public)', data = df).fit()
l3 = smf.ols('NU_NOTA_MT ~ C(Income) + C(Public) + C(Income)*C(Public)', data = df).fit()

# Order the rows by the coefficient order of the richest model.
ordem = list(l3.params.index)

# Extra footer row: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

fitted_models = [l1, l2, l3]
results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    stars=True,
    # Labels are derived from the list so they always match the number of columns.
    model_names=[f'Model {i}' for i in range(1, len(fitted_models) + 1)],
    info_dict=info_dict,
    regressor_order=ordem,
)

results_table.add_title('Table OLS Regressions')

results_table

0,1,2,3
,Model 1,Model 2,Model 3
Intercept,489.60***,549.14***,551.14***
,(0.50),(1.24),(2.10)
C(Income)[T.2],53.10***,45.57***,42.68***
,(0.99),(0.98),(2.90)
C(Income)[T.3],106.27***,81.18***,79.59***
,(1.20),(1.27),(2.63)
C(Income)[T.4],170.38***,125.75***,122.53***
,(2.09),(2.22),(3.09)
C(Income)[T.5],205.63***,151.00***,147.09***


In [36]:
# Nested specifications building up to the Income x FatherFormation interaction.
l1 = smf.ols('NU_NOTA_MT ~ C(Income)', data = df).fit()
l2 = smf.ols('NU_NOTA_MT ~ C(Income) + C(FatherFormation)', data = df).fit()
l3 = smf.ols('NU_NOTA_MT ~ C(Income) + C(FatherFormation) + C(Income)*C(FatherFormation)', data = df).fit()

# Order the rows by the coefficient order of the richest model.
ordem = list(l3.params.index)

# Extra footer row: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

fitted_models = [l1, l2, l3]
results_table = summary_col(
    results=fitted_models,
    float_format='%0.2f',
    stars=True,
    # Labels are derived from the list so they always match the number of columns.
    model_names=[f'Model {i}' for i in range(1, len(fitted_models) + 1)],
    info_dict=info_dict,
    regressor_order=ordem,
)

results_table.add_title('Table OLS Regressions')

results_table

0,1,2,3
,Model 1,Model 2,Model 3
Intercept,489.60***,482.03***,481.86***
,(0.50),(0.55),(0.59)
C(Income)[T.2],53.10***,44.24***,46.41***
,(0.99),(1.00),(1.37)
C(Income)[T.3],106.27***,84.28***,81.60***
,(1.20),(1.30),(2.34)
C(Income)[T.4],170.38***,135.40***,129.26***
,(2.09),(2.25),(6.93)
C(Income)[T.5],205.63***,166.24***,143.19***
