# Mini Project 1: Multiple Linear Regression with Interaction Variable
Author: Manuel Serna-Aguilera

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy # builds design matrices from formula strings
import seaborn as sns # data visualization built on top of matplotlib
import sklearn # use sklearn
import statsmodels # estimates different statistical models

import sklearn.linear_model as skl_lm
import statsmodels.api as sm
import statsmodels.formula.api as smf

## 1. Load Advertising Data
Note: The file `Advertising.csv` is located in a folder called `data`. I added this file and directory manually.

In [2]:
# Read the advertising data set, keeping only CSV columns 1-4
# (TV, radio, newspaper, sales); column 0 of the file is skipped.
csv_path = 'data/Advertising.csv'
advert_df = pd.read_csv(csv_path, usecols=[1, 2, 3, 4])

# Print the column names, dtypes and non-null counts.
advert_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   radio      200 non-null    float64
 2   newspaper  200 non-null    float64
 3   sales      200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB


In [3]:
# Preview the first ten rows of the advertising data.
advert_df.head(n=10)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
5,8.7,48.9,75.0,7.2
6,57.5,32.8,23.5,11.8
7,120.2,19.6,11.6,13.2
8,8.6,2.1,1.0,4.8
9,199.8,2.6,21.2,10.6


In [4]:
# Report how many observations are available for each of the two predictors
# used in the regression below.
tv_count = len(advert_df['TV'])
print('Number of samples for TV: {}'.format(tv_count))
radio_count = len(advert_df['radio'])
print('Number of samples for Radio: {}'.format(radio_count))

Number of samples for TV: 200
Number of samples for Radio: 200


## 2. Use 'smf' toolbox to compute Multiple Linear Regression with Interaction variable.
Refer to specific import with alias `smf` in the import statements cell.

The below formula will be used:
```
sales = beta0 + beta1*TV + beta2*radio + beta3*(radio*TV) + e
```
where `e` is irreducible error.

In [5]:
# Fit an ordinary least squares model through the statsmodels formula API.
# NOTE: in formula syntax `radio*TV` is shorthand for
# `radio + TV + radio:TV`, so the main effects are listed redundantly here;
# the interaction term is reported as `radio:TV` in the fit summary.
formula = 'sales ~ TV + radio + (radio*TV)'
ols_spec = smf.ols(formula=formula, data=advert_df)
smf_model = ols_spec.fit()

In [6]:
# Show only the coefficient table (the second table of the OLS summary).
fit_summary = smf_model.summary()
fit_summary.tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.7502,0.248,27.233,0.000,6.261,7.239
TV,0.0191,0.002,12.699,0.000,0.016,0.022
radio,0.0289,0.009,3.241,0.001,0.011,0.046
radio:TV,0.0011,5.24e-05,20.727,0.000,0.001,0.001


## 3. Use sklearn to compute Multiple Linear Regression with Interaction variable.
Goal: Find the coefficients for the features/variables/predictors
1. `Intercept`,
2. `TV`,
3. `radio`, and
4. `radio*TV` (the interaction term),

and compare the scikit-learn results with those from `smf`.

In [7]:
# Build the response vector `y` and the design matrix `x` (including the
# TV:radio interaction column) from a formula.
# `patsy` is imported in the imports cell at the top of the notebook.
# NOTE: patsy adds an 'Intercept' column of ones as the first column of `x`
# unless the formula says otherwise.
data = advert_df[['sales', 'TV', 'radio']]
y, x = patsy.dmatrices('sales ~ TV + radio + TV:radio', data)

In [8]:
# Ordinary least squares estimator with scikit-learn's default settings.
model = skl_lm.LinearRegression()

In [9]:
# Fit the estimator on the design matrix `x` and the response `y`.
model.fit(X=x, y=y)

LinearRegression()

## 4. Compare 'smf' and scikit-learn coefficients

In [10]:
# Collect the smf coefficient estimates as a plain list, in model-term order.
smf_coefs = list(smf_model.params)

In [11]:
# Collect the scikit-learn coefficients in the same order as `smf_coefs`.
# The intercept is taken from `intercept_`; `coef_[0, 0]` is skipped
# (presumably the weight of the design matrix's own Intercept column, which
# is redundant with the fitted intercept -- verify against how `x` was built).
skl_coefs = [model.intercept_[0], *model.coef_[0, 1:]]

In [12]:
# Sanity checks: both libraries must report the same number of coefficients,
# and the estimates themselves must agree up to floating-point tolerance
# (a length check alone would not catch diverging values).
assert len(smf_coefs) == len(skl_coefs)
np.testing.assert_allclose(skl_coefs, smf_coefs, rtol=1e-6)

In [14]:
# Print the smf and scikit-learn estimates side by side, one row per term.
# Each template formats (smf value, sk-learn value) to five decimal places.
row_templates = (
    'Intercepts: smf={:.5f} sk-learn={:.5f}',
    'TV:         smf={:.5f} sk-learn={:.5f}',
    'Radio:      smf={:.5f} sk-learn={:.5f}',
    'TV*Radio:   smf={:.5f} sk-learn={:.5f}',
)
for idx, template in enumerate(row_templates):
    print(template.format(smf_coefs[idx], skl_coefs[idx]))

Intercepts: smf=6.75022 sk-learn=6.75022
TV:         smf=0.01910 sk-learn=0.01910
Radio:      smf=0.02886 sk-learn=0.02886
TV*Radio:   smf=0.00109 sk-learn=0.00109


The coefficients computed with scikit-learn match those computed with `smf` to at least the five decimal places printed above. Any remaining differences would be expected to be at the level of 64-bit (eight-byte) floating-point rounding.