# Models with Categorical Variables

We start simulating a dataframe to explain the different functions we will use for fitting a linear model with categorical terms.

In [17]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Set random seed for reproducibility
np.random.seed(2023)

# Create a simulated dataset
n = 1000  # number of samples
early_career_pay = np.random.normal(50000, 10000, n)
public_or_private = np.random.randint(0, 2, n)
reputation_score = np.random.randint(1, 6, n)
noise = np.random.normal(0, 5000, n)
annual_salary = 0.9 * early_career_pay + 5000 * public_or_private + 2000 * reputation_score + noise
data = pd.DataFrame({
    'Annual_Salary': annual_salary,
    'Early_Career_Pay': early_career_pay,
    'Public_or_Private': public_or_private,
    'Reputation_Score': reputation_score
})
data

Unnamed: 0,Annual_Salary,Early_Career_Pay,Public_or_Private,Reputation_Score
0,52330.391168,57116.735303,0,1
1,40520.927758,46755.150439,1,1
2,47734.334911,39981.293613,1,3
3,67768.727483,52362.507940,1,5
4,47954.116941,48978.401587,1,1
...,...,...,...,...
995,30948.897374,37450.379615,0,2
996,65291.467850,59629.237464,0,3
997,55183.744078,51131.116617,1,2
998,59608.572030,57659.203319,1,5


In [18]:
# Create dummy variables for categorical predictors
data = pd.get_dummies(data, columns=['Public_or_Private', 'Reputation_Score'], drop_first=True)
data

Unnamed: 0,Annual_Salary,Early_Career_Pay,Public_or_Private_1,Reputation_Score_2,Reputation_Score_3,Reputation_Score_4,Reputation_Score_5
0,52330.391168,57116.735303,False,False,False,False,False
1,40520.927758,46755.150439,True,False,False,False,False
2,47734.334911,39981.293613,True,False,True,False,False
3,67768.727483,52362.507940,True,False,False,False,True
4,47954.116941,48978.401587,True,False,False,False,False
...,...,...,...,...,...,...,...
995,30948.897374,37450.379615,False,True,False,False,False
996,65291.467850,59629.237464,False,False,True,False,False
997,55183.744078,51131.116617,True,True,False,False,False
998,59608.572030,57659.203319,True,False,False,False,True


We can insert all the dummy variables by hand, or join the name of the columns using '+' as a separator.

In [19]:
# Create the Linear Regression Model Formula
model_formula = "Annual_Salary ~ Early_Career_Pay + " + " + ".join(data.columns[2:3])
model_formula

'Annual_Salary ~ Early_Career_Pay + Public_or_Private_1'

In [20]:
# Estimate the model with OLS
model = smf.ols(formula=model_formula, data=data).fit()
# Print the model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          Annual_Salary   R-squared:                       0.720
Model:                            OLS   Adj. R-squared:                  0.719
Method:                 Least Squares   F-statistic:                     1282.
Date:                Thu, 25 Apr 2024   Prob (F-statistic):          2.37e-276
Time:                        15:17:20   Log-Likelihood:                -10079.
No. Observations:                1000   AIC:                         2.016e+04
Df Residuals:                     997   BIC:                         2.018e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

In [21]:
# Import the Sales dataset
sales_data = pd.read_excel('/Users/luisinfanten/Desktop/IE/Classes/First-Year/Second-Semester/Simulating and Modelling/Models/Notebooks/Residuals/SALESADVMLR2.xlsx')
sales_data.describe(include = "all")

Unnamed: 0,TV,Radio,Newspaper,AdType,Season,Country,Sales
count,200.0,200.0,200.0,200,200,200.0,200.0
unique,,,,4,3,,
top,,,,AdType2,Standard,,
freq,,,,52,86,,
mean,147.0425,23.264,30.554,,,0.495,15.1305
std,85.854236,14.846809,21.778621,,,0.50123,5.283892
min,0.7,0.0,0.3,,,0.0,1.6
25%,74.375,9.975,12.75,,,0.0,11.0
50%,149.75,22.9,25.75,,,0.0,16.0
75%,218.825,36.525,45.1,,,1.0,19.05


In [22]:
# Create dummy variables for categorical predictors
sales_data = pd.get_dummies(sales_data, columns=['AdType', 'Season', 'Country'], drop_first=True)
sales_data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales,AdType_AdType2,AdType_AdType3,AdType_AdType4,Season_Standard,Season_Summer,Country_1
0,230.1,37.8,69.2,22.1,False,True,False,False,False,False
1,44.5,39.3,45.1,10.4,True,False,False,True,False,True
2,17.2,45.9,69.3,12.0,False,False,False,True,False,True
3,151.5,41.3,58.5,16.5,False,False,True,False,True,False
4,180.8,10.8,58.4,17.9,False,False,True,False,True,True


In [23]:
# Create the Linear Regression Model Formula
model_formula = "Sales ~ TV + Radio + Newspaper + AdType_AdType2 + AdType_AdType3 + AdType_AdType4 + Season_Standard + Season_Summer + Country_1"
# model_formula = "Sales ~ TV + Radio + Newspaper + " + " + ".join(sales_data.columns[4:])
model_formula

'Sales ~ TV + Radio + Newspaper + AdType_AdType2 + AdType_AdType3 + AdType_AdType4 + Season_Standard + Season_Summer + Country_1'

In [24]:
# Estimate the model with OLS
model = smf.ols(formula=model_formula, data=sales_data).fit()
# Print the model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.944
Model:                            OLS   Adj. R-squared:                  0.942
Method:                 Least Squares   F-statistic:                     359.0
Date:                Thu, 25 Apr 2024   Prob (F-statistic):          3.49e-114
Time:                        15:17:20   Log-Likelihood:                -327.15
No. Observations:                 200   AIC:                             674.3
Df Residuals:                     190   BIC:                             707.3
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [25]:
# Calculate the residuals (differences between actual and predicted values)
residuals = model.resid
# Compute the mean of squared residuals
mean_squared_residuals = np.mean(residuals**2)
# Calculate the Residual Standard Error (RSE)
rse = np.sqrt(mean_squared_residuals)
print("Residual Standard Error (RSE):", rse)
print("2s:", rse*2)

Residual Standard Error (RSE): 1.242116521310255
2s: 2.48423304262051


In [26]:
# Create new data
new_data = pd.DataFrame({'TV':list([20]), 'Radio':list([30]), 'Newspaper':list([2]),
                         'AdType_AdType2':list([0]), 'AdType_AdType3':list([1]),
                         'AdType_AdType4':list([0]), 'Season_Standard':list([0]),
                         'Season_Summer':list([0]), 'Country_1':list([0])})
new_data = sm.add_constant(new_data)

# Get prediction results with 95% confidence interval for mean predictions
pred_results = model.get_prediction(new_data).summary_frame(alpha=0.05)

# Print the predicted values and confidence intervals
print('Predicted values:\n', pred_results['mean'])
# Print the prediction interval
print('95% prediction interval:\n', pred_results[['obs_ci_lower', 'obs_ci_upper']].iloc[0])

Predicted values:
 0    15.227213
Name: mean, dtype: float64
95% prediction interval:
 obs_ci_lower    12.399002
obs_ci_upper    18.055423
Name: 0, dtype: float64
