In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv("../Data/Advertising.csv", index_col=0)
df.head()


Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [5]:
number_of_samples, number_of_features = df.shape[0], df.shape[1]-1 # -1 because sales is label, not a feature

number_of_samples, number_of_features




(200, 3)

In [6]:
X, y = df.drop("Sales", axis="columns"), df["Sales"]

In [7]:
X.head()

Unnamed: 0,TV,Radio,Newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [8]:
y.head()

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: Sales, dtype: float64

## Sklearn - Typical steps

1. train|test split, sometimes train|val|test split
2. scaling sometimes required
   - min--max scaling
   - standardization
   - ... (other methods exists)
   - scale the training data
   - scale test data to the training data --> avoiding data leakage
3. Fit algorithm to training data - model training
4. Predict test data
5. Evaluate

## Train|test split

In [10]:
from sklearn.model_selection import train_test_split

# help(train_test_split) -> find order of train|test

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape



((140, 3), (60, 3), (140,), (60,))

## Feature scaling

Normalization (min-max feature scaling)
- $X' = \frac{X-X_{\min}}{X_{max}-X_{min}}$
(elementwise operation)

Feature standardization
- $X' = \frac{X-\mu}{\sigma}$
- 

In [15]:
from sklearn.preprocessing import MinMaxScaler

# instantiate a scaler instance
scaler = MinMaxScaler()
scaler.fit(X_train) # important - use this for training data

scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

print(f"{scaled_X_train.min()=}")
print(f"{scaled_X_train.max()=}")
print(f"{scaled_X_test.min()=}")
print(f"{scaled_X_test.max()=}")
# note scaled_X_test.min !=0, scaled_X_test.max != 1
# scaled_X_train 


scaled_X_train.min()=0.0
scaled_X_train.max()=1.0
scaled_X_test.min()=0.005964214711729622
scaled_X_test.max()=1.1302186878727631


In [16]:
scaled_X_train.shape, scaled_X_test.shape

((140, 3), (60, 3))

## Linear regression

### Ordinary Least squares

In [18]:
from sklearn.linear_model import LinearRegression

model_OLS = LinearRegression()
model_OLS.fit(scaled_X_train, y_train) # fit on training data
print(f"Parameters {model_OLS.coef_}") # beta_1, beta_2, beta_3
print(f"Intercept: {model_OLS.intercept_}") # beta_0

Parameters [13.02832938  9.88465985  0.69237469]
Intercept: 2.7418553248528124


### Stochastic gradient descent

In [22]:
from sklearn.linear_model import SGDRegressor

model_SGD = SGDRegressor(loss="squared_error", max_iter=1000)
model_SGD.fit(scaled_X_train, y_train)
print(f"Parameters {model_SGD.coef_}") # beta_1, beta_2, beta_3
print(f"Intercept: {model_SGD.intercept_}") # beta_0

Parameters [11.99760348  9.02465327  1.32733857]
Intercept: [3.55102937]


## Manual prediction

In [31]:
test_sample_features = scaled_X_test[0].reshape(1, -1)
test_sample_label = y_test.values[0]
test_sample_features, test_sample_label

(array([[0.54988164, 0.63709677, 0.52286282]]), 16.9)

In [32]:
model_OLS.predict(test_sample_features)

array([16.5653963])

In [33]:
model_SGD.predict(test_sample_features)

array([16.59188469])

In [27]:
X_test.iloc[0].to_numpy()

array([163.3,  31.6,  52.9])

In [34]:
scaled_X_test

array([[0.54988164, 0.63709677, 0.52286282],
       [0.65843761, 0.96169355, 0.52286282],
       [0.98816368, 0.57056452, 0.42644135],
       [0.03719986, 0.74395161, 0.44632207],
       [0.74264457, 0.98790323, 0.02882704],
       [0.25160636, 0.70564516, 0.52087475],
       [0.73080825, 0.88508065, 0.26739563],
       [0.16672303, 0.23387097, 0.17992048],
       [0.74974636, 0.06854839, 0.12723658],
       [0.58978695, 0.45362903, 0.31013917],
       [0.10415962, 0.49596774, 0.01888668],
       [0.18769023, 0.11491935, 0.29224652],
       [0.79066622, 0.06854839, 0.83996024],
       [0.01589449, 0.60282258, 0.09045726],
       [0.46939466, 0.04233871, 0.26143141],
       [0.5732161 , 0.15725806, 0.34691849],
       [0.02231992, 0.56653226, 0.40854871],
       [0.66587758, 0.46975806, 0.13817097],
       [0.25228272, 0.40927419, 0.32007952],
       [0.80047345, 0.55443548, 0.10636183],
       [0.77375719, 0.65120968, 0.73459245],
       [0.22691917, 0.73790323, 1.13021869],
       [0.

## Evaluation

In [40]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 1. predict on test data
y_pred_OLS = model_OLS.predict(scaled_X_test)
y_pred_SGD = model_SGD.predict(scaled_X_test)
y_pred_OLS[:5]

array([16.5653963 , 21.18822792, 21.55107058, 10.88923816, 22.20231988])

In [41]:
y_pred_SGD[:5]

array([16.59188469, 20.8236695 , 21.12180435, 11.30366445, 21.41473179])

In [42]:
y_test[:5].values

array([16.9, 22.4, 21.4,  7.3, 24.7])

In [44]:
mae_OLS = mean_absolute_error(y_test, y_pred_OLS)
mae_SGD = mean_absolute_error(y_test, y_pred_SGD)

mse_OLS = mean_squared_error(y_test, y_pred_OLS)
mse_SGD = mean_squared_error(y_test, y_pred_SGD)

rmse_OLS = np.sqrt(mse_OLS)
rmse_SGD = np.sqrt(mse_SGD)

print(f"{mae_OLS=} \t {mse_OLS=} \t {rmse_OLS=}")
print(f"{mae_SGD=} \t {mse_SGD=} \t {rmse_SGD=}")

mae_OLS=1.511669222454909 	 mse_OLS=3.7967972367152223 	 rmse_OLS=1.9485372043446392
mae_SGD=1.5217287901877827 	 mse_SGD=4.0724854130161665 	 rmse_SGD=2.018039992917922
