In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import yfinance as yf

import plotly.express as px
import plotly.graph_objects as go

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error

# Model building (Linear Regression)

This notebook trains a scikit-learn Linear Regression model to predict next-day close (t+1) for NVIDIA (NVDA) using the cleaned NASDAQ dataset from the pipeline.

## What this notebook does
1) Load the cleaned daily NASDAQ dataset (`df_nasdaq_daily.csv`)
2) Filter to NVDA and the date window 2015–2025
3) Build time-series features (lags + simple technical stats)
4) Split by time (no shuffling):
   - Train: 2015–2023
   - Validation: 2024 (feature-engineering selection)
   - Test: 2025 (final evaluation)
5) Train a final model and evaluate it with MAPE
6) Produce two forecasting modes for 2026-02-02:
   - **(a) Ex-post** (best case): uses real history up to the last trading day before the target (via yfinance extension)
   - **(b) Autoregressive** (realistic): only uses info available at 2026-01-18, then feeds predictions back in

## Outputs / checks
- **Validation selection**: choose the lookback window with lowest 2024 MAPE
- **Test evaluation** (unseen 2025): report MAPE, plot true vs predicted
- **Model form**: show the linear regression functional form
- **Forward forecast**: compare predicted vs true close on 2026-02-02 and discuss limitations

## Data
Input file: `../data/processed/df_nasdaq_daily.csv` with columns:
- `ticker`, `date`, `close`

In [None]:
# Config: paths, ticker, date boundaries and search space used by the cells below.
DATA_DIR = Path("../data")
NASDAQ_DAILY_FILE = DATA_DIR / "processed" / "df_nasdaq_daily.csv"

# Ticker used both to filter the processed dataset and to query yfinance.
TICKER = "NVDA"

# Inclusive date boundaries: the data window is [START, TEST_END];
# TRAIN_END and VAL_END are the last days of the train and validation splits.
START = pd.Timestamp("2015-01-01")
TRAIN_END = pd.Timestamp("2023-12-31")
VAL_END = pd.Timestamp("2024-12-31")
TEST_END = pd.Timestamp("2025-12-31")

# Feature-engineering search space: candidate lookback windows (validated on 2024).
CANDIDATE_LOOKBACKS = [5, 10, 20, 30]

# TARGET_DAY: the day whose close is forecast in the forward-forecast sections.
# CUTOFF_DAY: the last day whose information the autoregressive forecast may use.
TARGET_DAY = pd.Timestamp("2026-02-02")
CUTOFF_DAY = pd.Timestamp("2026-01-18")

In [None]:
# Load the processed NASDAQ daily file and reduce it to clean NVDA rows.
# Parsing failures are coerced to NaT/NaN and dropped before filtering.
df = (
    pd.read_csv(NASDAQ_DAILY_FILE, keep_default_na=False)
    .assign(
        date=lambda f: pd.to_datetime(f["date"], errors="coerce").dt.tz_localize(None),
        ticker=lambda f: f["ticker"].astype(str).str.strip(),
        close=lambda f: pd.to_numeric(f["close"], errors="coerce"),
    )
    .dropna(subset=["date", "ticker", "close"])
    .loc[lambda f: (f["ticker"] == TICKER) & (f["date"].between(START, TEST_END))]
    .sort_values("date")
    .reset_index(drop=True)
)

df.head()

In [None]:
def make_features_next_close(df_prices: pd.DataFrame, lookback: int) -> tuple[pd.DataFrame, list[str]]:
    """Create a supervised table: features known at day t, target = close at t+1.

    Parameters
    ----------
    df_prices : DataFrame with at least ``date`` and ``close`` columns.
    lookback : number of close lags (``c0`` .. ``c{lookback-1}``) and the window
        length of the ``ma_L`` / ``std_L`` rolling statistics.

    Returns
    -------
    (table, feat_cols) where ``table`` has the columns ``date``, ``target_date``,
    ``y`` followed by the feature columns, with every incomplete row dropped,
    and ``feat_cols`` lists the feature column names in model order.
    """
    prices = df_prices.sort_values("date").reset_index(drop=True).copy()
    close = prices["close"]
    window = close.rolling(lookback)

    lag_names = [f"c{k}" for k in range(lookback)]
    feat_cols = lag_names + ["ret_1", "ret_5", "ma_3", "ma_L", "std_L"]

    # Identification columns plus the target (the next row's date and close)
    columns = {
        "date": prices["date"],
        "target_date": prices["date"].shift(-1),
        "y": close.shift(-1),
    }

    # Lagged closes: c0 is the close at t, c1 the close at t-1, and so on
    for k, name in enumerate(lag_names):
        columns[name] = close.shift(k)

    # Return and rolling statistics, all ending at day t
    columns["ret_1"] = close.pct_change(1, fill_method=None)
    columns["ret_5"] = close.pct_change(5, fill_method=None)
    columns["ma_3"] = close.rolling(3).mean()
    columns["ma_L"] = window.mean()
    columns["std_L"] = window.std()

    table = pd.DataFrame(columns).dropna().reset_index(drop=True)
    return table, feat_cols


def split_by_target_date(d: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Partition a feature table into (train, val, test) copies by ``target_date``.

    Boundaries are inclusive and come from the config constants:
    train = [START, TRAIN_END], val = (TRAIN_END, VAL_END], test = (VAL_END, TEST_END].
    """
    target = d["target_date"]
    one_day = pd.Timedelta(days=1)

    in_train = (target >= START) & (target <= TRAIN_END)
    in_val = (target >= TRAIN_END + one_day) & (target <= VAL_END)
    in_test = (target >= VAL_END + one_day) & (target <= TEST_END)

    return d[in_train].copy(), d[in_val].copy(), d[in_test].copy()

## Feature engineering (validated on 2024)

We create a small family of feature sets by varying the lookback window.
The best lookback is chosen based on the lowest MAPE on the 2024 validation split.

In [None]:
# Score every candidate lookback on the 2024 validation split
val_scores = {}
for L in CANDIDATE_LOOKBACKS:
    d, feat_cols = make_features_next_close(df, lookback=L)
    train, val, _ = split_by_target_date(d)

    model = LinearRegression().fit(train[feat_cols], train["y"])
    val_pred = model.predict(val[feat_cols])
    val_scores[L] = mean_absolute_percentage_error(val["y"], val_pred)

# Keep the first candidate that achieves the strictly lowest validation MAPE
best_L, best_mape_val = None, np.inf
for L, mape in val_scores.items():
    if mape < best_mape_val:
        best_L, best_mape_val = L, mape

best_L, best_mape_val

## Evaluation step 1: test performance (2025)

We train the final linear regression model on train + validation (2015–2024) and evaluate it on the unseen test set (2025).

Metric: Mean Absolute Percentage Error (MAPE).

In [None]:
# Rebuild the feature table with the lookback selected on validation
d, feat_cols = make_features_next_close(df, lookback=best_L)
train, val, test = split_by_target_date(d)

# The final fit uses train + validation; the 2025 split stays unseen
train_val = pd.concat([train, val], ignore_index=True)
model = LinearRegression().fit(
    train_val[feat_cols].to_numpy(),
    train_val["y"].to_numpy(),
)

# Score the held-out test split
y_test = test["y"].to_numpy()
y_test_pred = model.predict(test[feat_cols].to_numpy())
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)

print("Selected lookback:", best_L)
print("Test MAPE (2025):", mape_test)

### Test set visualization
Predicted vs true close prices for the 2025 test split.

In [None]:
# One row per predicted day: the target date with the true and predicted close
plot_df = (
    test[["target_date"]]
    .rename(columns={"target_date": "date"})
    .assign(true_close=y_test, pred_close=y_test_pred)
)

test_plot_title = f"{TICKER} — Test set (2025): true vs predicted close"
test_plot_labels = {"value": "close", "variable": "", "date": "date"}

fig = px.line(
    plot_df,
    x="date",
    y=["true_close", "pred_close"],
    title=test_plot_title,
    labels=test_plot_labels,
)
fig.update_layout(legend_title_text="")
fig.show()

### Functional form of linear regression

Linear regression models a linear relationship between the feature vector $x$ and the prediction $\hat{y}$:

$$
\hat{y} = \beta_0 + \sum_{i=1}^{p} \beta_i x_i
$$

where $\beta_0$ is the intercept and $\beta_i$ are the learned coefficients.

In [None]:
# Show the fitted functional form: the intercept plus one coefficient per feature.
print("y_hat = intercept + Σ coef_i * x_i")
print("intercept:", model.intercept_)
print("n_features:", len(feat_cols))

# The model was fitted on the feat_cols columns in this order, so coef_ lines up
# with feat_cols position by position.
coef_table = pd.Series(model.coef_, index=feat_cols, name="coef")
coef_table

In [None]:
def yf_nvda_daily(start: "str | pd.Timestamp", end: "str | pd.Timestamp") -> pd.DataFrame:
    """Download daily closes for TICKER from yfinance for [start, end].

    Parameters
    ----------
    start, end : anything ``pd.Timestamp`` accepts (date strings or Timestamps).
        One day is added to ``end`` before the request so that ``end`` itself is
        included (assumes yfinance treats ``end`` as exclusive, per its docs).

    Returns
    -------
    DataFrame with columns ``date`` (tz-naive datetime) and ``close`` (float),
    sorted by date. When nothing is downloaded, an empty frame with the same
    column dtypes is returned so it can be concatenated with real data safely.
    """
    start_str = pd.Timestamp(start).strftime("%Y-%m-%d")
    end_str = (pd.Timestamp(end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")

    data = yf.download(
        TICKER,
        start=start_str,
        end=end_str,
        auto_adjust=True,
        progress=False,
        threads=False,
        group_by="column",
        actions=False,
    )

    if data is None or data.empty:
        # Typed empty frame: plain DataFrame(columns=[...]) would yield object columns
        return pd.DataFrame(
            {
                "date": pd.Series(dtype="datetime64[ns]"),
                "close": pd.Series(dtype="float64"),
            }
        )

    # Keep only the first column level (the price-field names) in case the
    # download came back with multi-level columns.
    data.columns = data.columns.get_level_values(0)

    data = data.reset_index().rename(columns={"Date": "date", "Close": "close"})
    data["date"] = pd.to_datetime(data["date"]).dt.tz_localize(None)
    data["close"] = pd.to_numeric(data["close"], errors="coerce")

    return data[["date", "close"]].dropna().sort_values("date").reset_index(drop=True)

# Final model for the step-2 forecasts: refit on the whole 2015–2025 history
d_all, feat_cols_all = make_features_next_close(df, lookback=best_L)
train_all = d_all.loc[d_all["target_date"].between(START, TEST_END)].copy()

model_all = LinearRegression().fit(
    train_all[feat_cols_all].to_numpy(),
    train_all["y"].to_numpy(),
)

## Evaluation step 2(a): ex-post prediction

We predict the close on 2026-02-02 using the real price history up to the last trading day before that date.

To ensure we have the required history after 2025-12-31, we fill the missing period 2026-01-01 ... 2026-02-02 via yfinance.

In [None]:
# Extend history using yfinance (covers the gap between TEST_END and TARGET_DAY)
yf_ext = yf_nvda_daily((pd.Timestamp(TEST_END) + pd.Timedelta(days=1)).strftime("%Y-%m-%d"), TARGET_DAY)

df_hist = df.loc[df["date"].between(START, TEST_END), ["date", "close"]].copy()

# Append only rows that are genuinely newer than the local history. Skipping an
# empty extension keeps df_hist's datetime/float dtypes intact when the download
# returned nothing.
yf_new = yf_ext.loc[pd.to_datetime(yf_ext["date"]) > TEST_END, ["date", "close"]]
df_combo = pd.concat([df_hist, yf_new], ignore_index=True) if not yf_new.empty else df_hist.copy()
df_combo = df_combo.drop_duplicates("date").sort_values("date").reset_index(drop=True)

# Build features + select evaluation day (fallback: last available day <= target)
d_combo, feat_cols_combo = make_features_next_close(df_combo, lookback=best_L)
eval_day = TARGET_DAY if (df_combo["date"] == TARGET_DAY).any() else df_combo.loc[df_combo["date"] <= TARGET_DAY, "date"].max()

row = d_combo.loc[d_combo["target_date"] == eval_day]
if row.empty:
    raise ValueError(f"No feature row available to predict the close on {eval_day}")
pred_expost = float(model_all.predict(row[feat_cols_combo].to_numpy())[0])

# Look the true close up in df_combo rather than yf_ext: when the fallback picks a
# day from the local history, yf_ext does not contain that date.
true_close = float(df_combo.loc[df_combo["date"] == eval_day, "close"].iloc[0])
rel_err = abs(pred_expost - true_close) / true_close

print("Ex-post prediction")
print("requested:", TARGET_DAY.date(), "| used:", eval_day.date())
print("pred:", pred_expost, "| true:", true_close, "| rel_err:", float(rel_err))

## Evaluation step 2(b): autoregressive forward forecast

We predict the close on 2026-02-02 using only information available at 2026-01-18.

From the cutoff date onward, each next-day close is predicted and then fed back into the feature history.
This simulates a real-world setting where future closes are unknown, which leads to error propagation.

In [None]:
def feature_vector_from_closes(closes: np.ndarray, lookback: int) -> np.ndarray:
    """Build one feature vector to predict the next close from history up to time t.

    The layout mirrors ``make_features_next_close``: ``lookback`` lagged closes
    (newest first), then ``ret_1``, ``ret_5``, ``ma_3``, ``ma_L``, ``std_L``.

    Parameters
    ----------
    closes : 1-D sequence of closes in chronological order (oldest first).
    lookback : number of lagged closes / rolling window length.

    Returns
    -------
    Float array of length ``lookback + 5``.

    Raises
    ------
    ValueError
        If fewer than ``max(lookback, 6)`` closes are supplied. A shorter history
        would otherwise produce a silently truncated vector or an opaque
        IndexError when computing the 5-day return.
    """
    history = np.asarray(closes, dtype=float)
    required = max(lookback, 6)  # ret_5 needs the close five steps back
    if history.size < required:
        raise ValueError(
            f"need at least {required} closes for lookback={lookback}, got {history.size}"
        )

    c = history[::-1]  # newest first

    feats = list(c[:lookback])
    ret_1 = c[0] / c[1] - 1.0
    ret_5 = c[0] / c[5] - 1.0
    ma_3 = float(np.mean(c[:3]))
    ma_L = float(np.mean(c[:lookback]))
    std_L = float(np.std(c[:lookback], ddof=1))  # ddof=1 matches pandas rolling().std()

    feats.extend([ret_1, ret_5, ma_3, ma_L, std_L])
    return np.array(feats, dtype=float)

In [None]:
# Forecast from CUTOFF_DAY to TARGET_DAY as set in the config cell. They are
# deliberately not redefined here: model_all was fitted on closes up to TEST_END,
# so a cutoff earlier than TEST_END would let the model see the "future" it is
# asked to forecast.
last_known = df_combo.loc[df_combo["date"] <= CUTOFF_DAY, "date"].max()
if pd.isna(last_known):
    raise ValueError(f"No price history on or before the cutoff {CUTOFF_DAY.date()}")

future_dates = df_combo.loc[(df_combo["date"] > last_known) & (df_combo["date"] <= TARGET_DAY), "date"].tolist()
if not future_dates:
    raise ValueError(
        f"No trading days between {last_known.date()} and {TARGET_DAY.date()} in df_combo"
    )

known_closes = df_combo.loc[df_combo["date"] <= last_known, "close"].to_numpy().astype(float)

# Roll forward one trading day at a time, feeding each prediction back into the history
pred_rows = []
for dt in future_dates:
    x_vec = feature_vector_from_closes(known_closes, lookback=best_L)
    y_pred = float(model_all.predict(x_vec.reshape(1, -1))[0])
    pred_rows.append({"date": dt, "pred_close": y_pred})
    known_closes = np.append(known_closes, y_pred)

pred_df = pd.DataFrame(pred_rows)
true_df = df_combo.loc[df_combo["date"].isin(pred_df["date"]), ["date", "close"]].rename(columns={"close": "true_close"})
cmp = pred_df.merge(true_df, on="date", how="left")

# Last forecast day on or before the requested target (the target itself if it traded)
used_day = cmp.loc[cmp["date"] <= TARGET_DAY, "date"].max()

pred_target = float(cmp.loc[cmp["date"] == used_day, "pred_close"].iloc[0])
true_target = float(cmp.loc[cmp["date"] == used_day, "true_close"].iloc[0])
rel_err = abs(pred_target - true_target) / true_target

print("Autoregressive prediction")
print("cutoff:", CUTOFF_DAY.date(), "| last known trading day:", last_known.date())
print("requested target:", TARGET_DAY.date(), "| used:", used_day.date())
print("pred:", pred_target, "| true:", true_target, "| rel_err:", float(rel_err))

### Autoregressive forecast plot
Predicted vs true close values over the forecast horizon.

In [None]:
# True vs predicted close over the autoregressive forecast horizon
ar_plot_title = f"{TICKER} — Autoregressive forecast (from {last_known.date()} to {used_day.date()})"
ar_plot_labels = {"value": "close", "variable": "", "date": "date"}

fig = px.line(
    cmp,
    x="date",
    y=["true_close", "pred_close"],
    title=ar_plot_title,
    labels=ar_plot_labels,
)
fig.update_layout(legend_title_text="")
fig.show()

## Critical limitations of linear regression (discussion)

- **Linearity constraint:** the model can only represent linear relationships between features and the target.
- **Missing exogenous drivers:** price movements are often driven by news, earnings, macro conditions, and sentiment, which are not included here.
- **Error propagation in autoregressive mode:** multi-step forecasts can both amplify errors and collapse toward a fixed point. Once predictions are fed back in, return/volatility features tend toward ~0 and lag/MA features become self-consistent, so the model quickly produces a smooth, near-constant path that drifts away from real-market variability.
- **Non-stationarity / concept drift:** patterns learned from the 2015–2025 training history may not hold in 2026.