# <span style="color:darkblue"> Lecture 11 (Optional): Regression Output </span>

<font size = "5">

This is an optional lecture file

- This is only recommended if you've taken statistics courses 
- This lecture will not be formally evaluated
- Keep this material in mind for future courses


# <span style="color:darkblue"> I. Import Libraries </span>


In [1]:
# The "pandas" library is used for processing datasets
# The "numpy" library is for numeric observations and random numbers
# The "matplotlib.pyplot" library is for creating graphs

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<font size = "5">

Install the "statsmodels" library
- Run "pip3 install statsmodels" in the terminal
- Automatically included in Anaconda

In [2]:
# We will "alias" two sublibraries in "statsmodels"
# "statsmodels.formula.api" contains functions to estimate models
# "statsmodels.api" contains general-use statistical options

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col


# <span style="color:darkblue"> II. Generate Simulated Data </span>

<font size = "5">

Create an empty dataset

In [3]:
# Start from a DataFrame with no rows and no columns; columns are added later
dataset = pd.DataFrame(data = [])

<font size = "5">

Create three random variables of size ($n = 100$)

In [4]:
# Number of simulated observations, shared by every generated column
n = 100

# Draw the columns "x", "z" and "e" (in that order), each one a separate
# sample of size n from a normal distribution with mean 0 and std. dev. 1
for column_name in ["x", "z", "e"]:
    dataset[column_name] = np.random.normal(loc = 0, scale = 1, size = n)


<font size = "5">

Create a discrete random variable ($n = 100$)

In [5]:
# Sample n values from the categories 1, 2 and 3
# The i-th probability belongs to the i-th category (0.2, 0.2 and 0.6)
categories    = [1, 2, 3]
probabilities = [0.2, 0.2, 0.6]

dataset["d"] = np.random.choice(a = categories, size = n, p = probabilities)

<font size = "5">

Create data from the linear model

$ y = 2 + 5 x + x \cdot e$

In [6]:
# Arithmetic on dataset columns is applied to every row at once
# Note: the noise term here is x*e (not e alone), so its size scales with x
systematic_part = 2 + 5 * dataset["x"]
noise_part      = dataset["x"] * dataset["e"]

dataset["y"] = systematic_part + noise_part

# <span style="color:darkblue"> III. Regression Tables </span>


<font size = "5">

Summaries for univariate regression

In [7]:
# Regress "y" on the single regressor "x" (univariate model)
# cov_type = "HC1" requests heteroskedasticity-robust standard errors
model_univariate   = smf.ols(formula = 'y ~ x', data = dataset)
results_univariate = model_univariate.fit(cov_type = "HC1")

# The "summary_col" function produces a compact regression table
# Setting "stars" to True adds the significance stars to the coefficients
table_univariate = summary_col(results_univariate, stars = True)
print(table_univariate)




                   y    
------------------------
Intercept      1.8078***
               (0.1253) 
x              4.8452***
               (0.1975) 
R-squared      0.9455   
R-squared Adj. 0.9449   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


In [8]:
# Print the detailed statsmodels report for the univariate fit
report_univariate = results_univariate.summary()
print(report_univariate)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.945
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     602.1
Date:                Fri, 24 Feb 2023   Prob (F-statistic):           1.24e-43
Time:                        21:11:03   Log-Likelihood:                -161.47
No. Observations:                 100   AIC:                             326.9
Df Residuals:                      98   BIC:                             332.2
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.8078      0.125     14.428      0.0

<font size = "5">

Summaries for multivariate regression

In [9]:
# Add more regressors to the formula by joining them with "+"
# Here "y" is regressed on both "x" and "z", with "HC1" standard errors
model_multivariate   = smf.ols(formula = 'y ~ x + z', data = dataset)
results_multivariate = model_multivariate.fit(cov_type = "HC1")

# Compact table with significance stars
table_multivariate = summary_col(results_multivariate, stars = True)
print(table_multivariate)


                   y    
------------------------
Intercept      1.8078***
               (0.1260) 
x              4.8426***
               (0.1954) 
z              -0.0175  
               (0.1432) 
R-squared      0.9455   
R-squared Adj. 0.9443   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


<font size = "5">

Summaries for multivariate regression + categories

In [10]:
# Regress "y" on "x" plus the categorical variable "d"
# Wrapping "d" in C(...) enters it as indicator variables, one per category
model_category = smf.ols(formula = 'y ~ x + C(d)', data = dataset)
results_multivariate_category = model_category.fit(cov_type = "HC1")

# Category d = 1 is the omitted base category, so the table only shows
# "C(d)[T.2]" and "C(d)[T.3]", which are measured relative to it
table_category = summary_col(results_multivariate_category, stars = True)
print(table_category)


                   y    
------------------------
Intercept      1.5351***
               (0.2352) 
C(d)[T.2]      -0.0509  
               (0.3288) 
C(d)[T.3]      0.5085*  
               (0.2935) 
x              4.8138***
               (0.1944) 
R-squared      0.9480   
R-squared Adj. 0.9464   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


<font size = "5">

Summaries for multivariate regression + interaction

In [11]:
# Regress "y" on "x", "z" and their interaction
# The "z:x" term in the formula adds the interaction between "z" and "x"
model_interaction = smf.ols(formula = 'y ~ x + z + z:x', data = dataset)
results_multivariate_interaction = model_interaction.fit(cov_type = "HC1")

# The interaction coefficient is reported in the row labelled "z:x"
table_interaction = summary_col(results_multivariate_interaction, stars = True)
print(table_interaction)


                   y    
------------------------
Intercept      1.8112***
               (0.1189) 
x              4.8424***
               (0.1954) 
z              -0.0177  
               (0.1450) 
z:x            0.0199   
               (0.2203) 
R-squared      0.9455   
R-squared Adj. 0.9438   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


# <span style="color:darkblue"> IV. Professional Tables </span>


<font size = "5">

Summaries for multiple columns

In [12]:
# Collect the four fitted models; each one becomes a column of the table
list_results = [
    results_univariate,
    results_multivariate,
    results_multivariate_category,
    results_multivariate_interaction,
]

# Passing a list to "summary_col" places the models side by side
table_all_models = summary_col(list_results, stars = True)
print(table_all_models)



                  y I       y II     y III     y IIII 
------------------------------------------------------
C(d)[T.2]                          -0.0509            
                                   (0.3288)           
C(d)[T.3]                          0.5085*            
                                   (0.2935)           
Intercept      1.8078*** 1.8078*** 1.5351*** 1.8112***
               (0.1253)  (0.1260)  (0.2352)  (0.1189) 
R-squared      0.9455    0.9455    0.9480    0.9455   
R-squared Adj. 0.9449    0.9443    0.9464    0.9438   
x              4.8452*** 4.8426*** 4.8138*** 4.8424***
               (0.1975)  (0.1954)  (0.1944)  (0.1954) 
z                        -0.0175             -0.0177  
                         (0.1432)            (0.1450) 
z:x                                          0.0199   
                                             (0.2203) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


<font size = "5">

Summaries for multiple columns (sorted + titled + stats)

In [13]:
# This list contains the column headings of the table
# There is one heading per model, in the same order as "list_results"
list_headings   = ["Univariate",
                   "Multivariate",
                   "Categorical",
                   "Interaction"]

# This is the list of regressor names, in the order the rows should appear
# Regressors that are not listed (e.g. "Intercept") are shown after these
list_regressors = ["x",
                   "z",
                   "z:x",
                   "C(d)[T.2]",
                   "C(d)[T.3]"]

# This dictionary defines extra summary rows at the bottom of the table
# Each key is the row label and each value is a function of a fitted model
# "nobs" is the number of observations, reported here under the label "N"
compute_summary = {'N': lambda model: format(int(model.nobs))}

# Pass the objects defined above instead of re-typing them inside the call,
# so that the headings, row order and extra rows are each defined only once
print(summary_col(list_results,
                  stars = True,
                  model_names = list_headings,
                  info_dict = compute_summary,
                  regressor_order = list_regressors))


               Univariate Multivariate Categorical Interaction
--------------------------------------------------------------
x              4.8452***  4.8426***    4.8138***   4.8424***  
               (0.1975)   (0.1954)     (0.1944)    (0.1954)   
z                         -0.0175                  -0.0177    
                          (0.1432)                 (0.1450)   
z:x                                                0.0199     
                                                   (0.2203)   
C(d)[T.2]                              -0.0509                
                                       (0.3288)               
C(d)[T.3]                              0.5085*                
                                       (0.2935)               
Intercept      1.8078***  1.8078***    1.5351***   1.8112***  
               (0.1253)   (0.1260)     (0.2352)    (0.1189)   
R-squared      0.9455     0.9455       0.9480      0.9455     
R-squared Adj. 0.9449     0.9443       0.9464      0.9

<font size = "5">

Detailed table

In [14]:
# Detailed summary: the full statsmodels report for the univariate model
detailed_report = results_univariate.summary()
print(detailed_report)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.945
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     602.1
Date:                Fri, 24 Feb 2023   Prob (F-statistic):           1.24e-43
Time:                        21:11:18   Log-Likelihood:                -161.47
No. Observations:                 100   AIC:                             326.9
Df Residuals:                      98   BIC:                             332.2
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.8078      0.125     14.428      0.0