# Linear Regression

**Scenario (Marketing)**: Predict daily revenue from ad spend, promotions, price index, weather, and weekend effects for a digital storefront.
We will fit a baseline linear regression, evaluate it on a time-based train/test split, and visualize the predictions.

In [None]:
# Setup
import pandas as pd, numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
!wget -q https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Regression_Forecasting/reg_for_utils.py
import reg_for_utils as utils
# Remote CSV holding the daily marketing data used by the rest of this notebook.
csv_path = "https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Regression_Forecasting/marketing_daily.csv"

# Parse `date` as datetimes while reading, then put the rows in chronological order.
df = (
    pd.read_csv(csv_path, parse_dates=["date"])
    .sort_values(by="date")
)

# Preview the first rows as this cell's displayed output.
df.head()

## Train/Test split (time-aware)
We'll reserve the final 90 days as a test set to simulate forward-looking evaluation.

In [None]:
# Time-aware hold-out keyed on the "date" column with test_days=90
# (the helper lives in reg_for_utils; its exact cut-off logic is not shown here).
train, test = utils.time_train_test_split(df, "date", test_days=90)

# Model inputs and the column being predicted.
features = [
    "search_spend",
    "social_spend",
    "display_spend",
    "promo",
    "price_index",
    "temp_F",
    "rain",
    "is_weekend",
]
target = "revenue"

X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]

# Columns routed to each preprocessing branch.
numeric = [
    "search_spend",
    "social_spend",
    "display_spend",
    "price_index",
    "temp_F",
]
categorical = [
    "promo",
    "rain",
    "is_weekend",
]

# Standardize the numeric columns; one-hot encode the others, keeping a single
# indicator column for any feature that has exactly two levels (drop="if_binary").
pre = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(drop="if_binary"), categorical),
    ]
)

# Chain preprocessing and the regressor so both are fitted on the training rows only.
pipe = Pipeline(steps=[("pre", pre), ("lr", LinearRegression())])
pipe.fit(X_train, y_train)

## Evaluation

In [None]:
# Score the fitted pipeline on the held-out rows.
y_pred = pipe.predict(X_test)

# Report each metric from the helper module, one per line, in a fixed order.
metric_table = (
    ("RMSE:", utils.rmse),
    ("MAE :", utils.mae),
    ("R^2 :", utils.r2),
)
for label, metric_fn in metric_table:
    print(label, metric_fn(y_test, y_pred))

# Plot actual and predicted revenue against the test-period dates.
utils.plot_series(
    test["date"],
    y_test,
    y_pred,
    title="Revenue — actual vs. predicted (test)",
)

## Inspect coefficients (plain-language sense check)
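Note: only the numeric features are standardized, so their coefficients reflect the change in revenue per one-standard-deviation change in the feature. The one-hot encoded columns (`promo`, `rain`, `is_weekend`) are not scaled, so their coefficients are revenue shifts per indicator and are not directly comparable to the standardized ones.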

In [None]:
# Fetch the fitted regressor and the post-transform column names from the pipeline.
lr = pipe["lr"]
feat_names = list(pipe["pre"].get_feature_names_out())
coefs = lr.coef_

# Pair each transformed feature with its coefficient, then rank by magnitude
# (largest absolute value first) regardless of sign.
coef_table = pd.DataFrame({"feature": feat_names, "coef": coefs})
coef_df = coef_table.sort_values(by="coef", key=abs, ascending=False)

# Show the ten largest-magnitude coefficients.
coef_df.head(10)