In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Use a safe loader so the notebook can still run in environments missing the precomputed CSVs
DATA_DIR = Path('data/notebooks')

def safe_read_csv(path, **kwargs):
    """Read a CSV into a DataFrame, returning None instead of raising.

    path: file location (str or Path).
    **kwargs: forwarded unchanged to pd.read_csv.

    Returns the loaded DataFrame, or None when the file does not exist or
    pd.read_csv raises; a one-line notice is printed in both failure cases.
    """
    csv_path = Path(path)
    if csv_path.exists():
        try:
            frame = pd.read_csv(csv_path, **kwargs)
        except Exception as exc:
            # Best effort by design: report the failure and let callers test for None.
            print(f'[Error] failed to read {csv_path}: {exc}')
            frame = None
        return frame

    print(f'[Warning] missing file: {csv_path} -- some cells will be skipped or show placeholders')
    return None

# Precomputed diagnostics tables; each name is None when its CSV could not be loaded.
metrics = safe_read_csv(DATA_DIR / 'metrics_all_symbols.csv')
best_models = safe_read_csv(DATA_DIR / 'best_model_per_symbol.csv')
rmse_pivot = safe_read_csv(DATA_DIR / 'rmse_pivot_table.csv')
returns_stats = safe_read_csv(DATA_DIR / 'returns_summary_stats.csv', index_col=0)

# UNIVERSE holds the distinct symbols found in metrics, or stays empty when
# metrics is unavailable or has no 'Symbol' column.
UNIVERSE = (
    metrics['Symbol'].unique().tolist()
    if metrics is not None and 'Symbol' in metrics.columns
    else []
)

# One-line status so a reader can see which inputs are present.
print(
    f'Loaded metrics: {metrics is not None}, '
    f'best_models: {best_models is not None}, '
    f'rmse_pivot: {rmse_pivot is not None}, '
    f'returns_stats: {returns_stats is not None}'
)

In [None]:
from IPython.display import Image, display
from pathlib import Path

# Only display images if they exist to keep notebook runnable in minimal environments
# Figures to show inline; a missing file is reported rather than raising.
image_files = [
    DATA_DIR / name
    for name in (
        'universe_normalized_prices.png',
        'corr_heatmap.png',
        'fig_AAPL_test_models.png',
        'fig_SPY_test_models.png',
        'fig_QQQ_test_models.png',
    )
]

for p in image_files:
    if not p.exists():
        print(f'[Warning] image not found: {p}')
        continue
    display(Image(filename=str(p)))

In [None]:
rmse_cols = ["LSTM", "MA 20 day", "MA 5 day", "Naive lag1"]

# For each symbol pick the column with the smallest RMSE, then count how many
# symbols each model wins.
if rmse_pivot is not None:
    winner_counts = (
        rmse_pivot
        .set_index('Symbol')[rmse_cols]
        .idxmin(axis=1)
        .value_counts()
    )
else:
    print('[Warning] rmse_pivot_table.csv not loaded; skipping winner count computation')
    winner_counts = None

winner_counts

In [None]:
# Order the per-symbol return statistics by the 'std' column, largest first.
if returns_stats is None:
    print('[Warning] returns_summary_stats.csv not loaded; skipping returns table')
    returns_sorted = None
else:
    returns_sorted = returns_stats.sort_values('std', ascending=False)

# Keep the preview as the cell's last top-level expression: Jupyter only
# auto-displays that, so an expression nested inside the if/else above would
# be evaluated and silently discarded.
returns_sorted.head(10) if returns_sorted is not None else None

# Neural Networks for Daily Stock Prediction: An Empirical Study with LSTM and Classical Benchmarks

## Abstract

This project investigates whether a simple recurrent neural network architecture can meaningfully improve daily stock price forecasts relative to classical time series baselines.

I construct a liquid universe of large capitalisation United States equities and major equity indices from 2010 to 2025 using Yahoo Finance adjusted close data. For each symbol I train a univariate Long Short Term Memory (LSTM) network on the historical price series and compare its test set performance with three standard benchmarks: a lag one naive model, a five day moving average, and a twenty day moving average.

Models are evaluated out of sample using mean absolute error and root mean squared error on a recent test window. Even though the LSTM is able to fit the training data, the empirical results show that the naive lag one model consistently achieves the lowest test error across all symbols in the universe. The moving average models perform slightly worse than the naive model, and the LSTM rarely outperforms either baseline.

These findings are consistent with the view that daily prices of liquid assets behave close to a random walk, so that the best forecast of tomorrow is simply today. The study highlights the importance of strong baselines and careful validation when applying deep learning to financial markets. I conclude with a discussion of potential extensions, including directional accuracy, simple trading rules based on model forecasts, and richer feature sets that incorporate volatility, technical indicators and macro factors.


## 2. Introduction and motivation

Machine learning and deep learning have become very popular in quantitative finance. Recurrent neural networks, and in particular Long Short Term Memory (LSTM) networks, are often proposed as a way to exploit temporal patterns in stock prices. Many introductory papers and online tutorials claim that these models can "predict" the stock market more accurately than simple statistical methods.

At the same time, classical financial theory states that liquid equity prices follow a process close to a random walk. In such a setting, the conditional expectation of tomorrow's price given today's information is essentially equal to today's price. Under squared error loss the optimal forecast of \(P_{t+1}\) is simply \(P_t\). This leads to an extremely simple benchmark model, often called the naive or random walk model, that sets

\[
\hat P_{t+1}^{\text{naive}} = P_t.
\]

If this description is accurate, then complex neural networks that see only the same price history should not be able to consistently beat the naive model on purely statistical error metrics.

The main objective of this project is to test this claim empirically on a realistic universe of large liquid stocks and indices. The research question is:

> Given only the past daily prices of a stock, can an LSTM significantly outperform a naive lag one model and simple moving average baselines on out of sample prediction error?

I focus on a univariate setting where each model sees only the history of a single asset. This keeps the methodology transparent and makes it easier to relate the results to classical time series theory. At the same time it provides a useful building block for more advanced multi asset, cross sectional or factor based models in future work.

## 3. Data and universe construction

### 3.1 Universe selection

The universe consists of 40 large capitalisation equities and several major equity indices and exchange traded funds. The list includes technology names such as AAPL, MSFT, NVDA and AMZN, consumer names such as WMT and MCD, financials such as JPM and GS, energy majors XOM and CVX, and broad market indices such as SPY, QQQ and DIA.

These assets were chosen to represent liquid names with long price histories, diverse sector exposures and a range of volatility levels. Many of them are constituents of the S&P 500 or Nasdaq indices and are widely traded by institutional and retail investors.

### 3.2 Data source and frequency

For each symbol I download daily price history from Yahoo Finance using the `yfinance` Python library. I focus on adjusted close prices which incorporate corporate actions such as splits and dividends. The sample runs from 4 January 2010 to early December 2025, yielding roughly 4 000 trading days for the earliest symbols.

From the adjusted close series I construct daily log returns

\[
r_t = \log P_t - \log P_{t-1}.
\]

In the forecasting pipeline the models operate on a scaled version of either prices or returns, but the evaluation metrics are always computed in the original scale so that errors are comparable across models.

### 3.3 Universe behaviour

The figure below shows each symbol normalised to start at 1 at the beginning of the sample:

*(insert `universe_normalized_prices.png` here)*

The plot highlights the very strong growth of several technology names such as NVDA, AVGO and TSLA relative to the rest of the universe. It also shows periods of broad market drawdowns such as the Covid shock in 2020 and the inflation and rate hiking cycle in 2022.

To understand cross sectional dependence, I compute the correlation matrix of daily returns across all symbols. The heatmap is shown below:

*(insert `corr_heatmap.png` here)*

Unsurprisingly, broad market indices such as SPY, QQQ, DIA, and the level indices ^GSPC, ^NDX and ^DJI are highly correlated with each other and with large benchmark constituents. Sector pairs such as XOM and CVX, or V and MA, also display strong positive correlation. This structure will matter in future work when I extend the models to use cross sectional information, but in the current project each asset is modelled independently.


### 3.4 Return summary statistics

To get a sense of risk and reward, I compute the mean and standard deviation of daily returns for each symbol. Volatile names such as NVDA, TSLA and NFLX exhibit daily return standard deviations above 3 percent, while stable consumer names and indices have much lower values around 1 percent.

This confirms that the universe contains a mix of high beta growth stocks and defensive names. It also suggests that forecasting error magnitudes will be larger in absolute terms for the more volatile symbols, which is visible later in the RMSE tables.

## 4. Forecasting models

I consider four forecasting models that all operate on the same univariate price series of each asset.

### 4.1 Forecast target

Let \(y_t\) denote the transformed series that the models see, which can be either log price or scaled return. The goal is to forecast \(y_{t+1}\) one step ahead using information available up to time \(t\).

Given a history \(y_{t-k+1}, \dots, y_t\), each model outputs a prediction \(\hat y_{t+1}\). Performance is evaluated on a held out test set using mean absolute error and root mean squared error.

### 4.2 Naive lag one model

The naive model uses the simplest possible rule:

\[
\hat y_{t+1}^{\text{naive}} = y_t.
\]

This corresponds to assuming that the best guess for tomorrow is simply today, which is optimal for a random walk with independent increments under squared loss.

### 4.3 Moving average benchmarks

The second and third models are simple moving averages of the recent history:

- Five day moving average

\[
\hat y_{t+1}^{\text{MA5}} = \frac{1}{5} \sum_{i=0}^{4} y_{t-i}.
\]

- Twenty day moving average

\[
\hat y_{t+1}^{\text{MA20}} = \frac{1}{20} \sum_{i=0}^{19} y_{t-i}.
\]

These models smooth short term noise and are widely used in technical analysis as trend filters. However, the smoothing introduces lag, so they may react more slowly to turning points.

### 4.4 LSTM neural network

The final model is a univariate Long Short Term Memory network. For each asset I construct overlapping windows of length \(L\), for example 60 trading days, and use these as inputs to an LSTM that outputs a one step ahead forecast.

An LSTM cell maintains an internal hidden state \(h_t\) and a cell state \(c_t\) that are updated at each time step through a series of gating operations. At a high level, the equations are

\[
\begin{aligned}
i_t &= \sigma(W_i x_t + U_i h_{t-1} + b_i) \\
f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f) \\
o_t &= \sigma(W_o x_t + U_o h_{t-1} + b_o) \\
\tilde c_t &= \tanh(W_c x_t + U_c h_{t-1} + b_c) \\
c_t &= f_t \odot c_{t-1} + i_t \odot \tilde c_t \\
h_t &= o_t \odot \tanh(c_t),
\end{aligned}
\]

where \(i_t, f_t, o_t\) are the input, forget and output gates and \(\sigma\) is the logistic function. This architecture allows the network to retain or forget information across long time horizons, which is important in many sequence tasks.

In this project I use a single layer LSTM followed by a fully connected layer that maps the final hidden state to a scalar output \(\hat y_{t+1}\). The network is trained with mean squared error loss and the Adam optimiser on the training set of each symbol. Early stopping based on validation loss is used to avoid overfitting.

## 5. Experimental setup

For each symbol the full history is split into a training period and a test period. The training set covers roughly the first eighty percent of observations, while the last twenty percent is held out for evaluation. This corresponds to training mainly on data from 2010 to mid 2022 and testing on the more recent regime from 2022 onwards.

The baselines and LSTM all receive the same training and test splits for each symbol. For the moving average models the only hyper parameter is the window length. For the LSTM the main hyper parameters are the sequence length, hidden size, learning rate and number of training epochs. I keep these constant across symbols for simplicity and to avoid an unfair advantage from extensive tuning.

Performance is measured with:

- Mean absolute error (MAE)

\[
\text{MAE} = \frac{1}{N} \sum_{t=1}^{N} \lvert y_t - \hat y_t \rvert
\]

- Root mean squared error (RMSE)

\[
\text{RMSE} = \sqrt{ \frac{1}{N} \sum_{t=1}^{N} (y_t - \hat y_t)^2 }.
\]

The RMSE is more sensitive to large errors and therefore highlights whether a model occasionally makes very poor predictions.

All metrics are computed only on the test period, which the models never see during training, so they provide an honest assessment of generalisation.

## 6. Empirical results

### 6.1 Universe behaviour

The normalised price plot and return summary statistics confirm that the universe exhibits the usual equity characteristics: long run upward drift with significant drawdowns and a wide dispersion of volatility across names. High growth technology stocks such as NVDA, TSLA and AVGO achieved very large cumulative gains but also display higher daily volatility, while indices and consumer staples are more stable.

### 6.2 Error metrics across models

The table `metrics_all_symbols.csv` aggregates MAE and RMSE for each symbol and model. For ease of comparison I pivot the RMSE values into `rmse_pivot_table.csv` and compute which model achieves the lowest RMSE for each symbol.

Across all 43 symbols in the universe the winner is always the naive lag one model. The five day moving average is usually the second best model, followed by the LSTM, and the twenty day moving average is the weakest. This pattern holds for both RMSE and MAE.

In other words, in this experiment the sophisticated neural network never manages to beat a forecast that simply copies yesterday's value.

The magnitude of the differences is small in absolute terms because all models are effectively tracking a highly persistent series. For example, for SPY the naive model has RMSE around \(3.6 \times 10^{-5}\) while the LSTM has RMSE around \(6.3 \times 10^{-5}\). However, the ranking is consistent across assets.

### 6.3 Case study: large indices

The figures below illustrate the behaviour for SPY and QQQ on the test set, with AAPL included as a single stock comparison:

*(insert `fig_SPY_test_models.png`, `fig_QQQ_test_models.png`, `fig_AAPL_test_models.png` or similar)*

The actual line and all forecast lines sit almost on top of each other. Lag one and the five day moving average react very quickly to price movements, while the LSTM output is slightly smoother. The twenty day average lags turning points the most, which explains its higher RMSE.

Visually there is no obvious advantage to the LSTM. The predictions are essentially a smoothed version of the recent price history that does not reduce error relative to lag one.

### 6.4 Case study: high volatility names

For high volatility stocks such as NVDA and AVGO the RMSE values are larger in absolute terms, but the ranking of models is the same. Interestingly, the LSTM sometimes exhibits much higher RMSE than the naive model. For example, for NVDA the LSTM RMSE is about 0.024, compared with 0.0028 for the naive model.

This suggests that the neural network occasionally extrapolates too aggressively during sharp swings, while the naive model simply stays anchored to the most recent observation.

Overall, the empirical evidence in this dataset strongly supports the view that, when only past prices of a single asset are used, the random walk model is very difficult to beat in terms of point forecast accuracy.

## 7. Discussion

The main finding of this project is that an LSTM trained on univariate daily price history does not outperform a simple lag one naive model on out of sample prediction error. This is consistent with both classical financial theory and the empirical experience of many practitioners, even though it sometimes contradicts the impression given by online tutorials.

There are several reasons for this outcome.

First, liquid equity prices are highly persistent and approximately follow a random walk with drift. In such a process the conditional expectation of tomorrow's price is essentially equal to today's price. Under squared error loss this makes the naive model asymptotically optimal. Any nonlinear model that tries to exploit subtle patterns in the noise will on average hurt performance.

Second, the LSTM is a relatively complex model with many parameters compared with the effective signal present in the data. With limited training data and strong autocorrelation, the network can easily overfit idiosyncratic fluctuations in the training period and then generalise poorly to the test period.

Third, the LSTM in this setup receives only information about a single asset. It cannot exploit cross sectional signals such as relative strength or sector momentum, nor can it condition on macro variables or risk factors. In contrast, many successful applications of deep learning in finance make use of rich sets of predictors rather than a single raw price series.

Fourth, forecasting daily prices one step ahead is an extremely hard task from a statistical perspective. Many profitable trading strategies focus instead on predicting volatility, tail risk, regime changes, or cross sectional ranking of returns, where persistence and structure are often stronger.

Finally, the results highlight the importance of strong baselines. Without including the naive lag one model and moving average benchmarks, it would be easy to convince oneself that an LSTM with a small RMSE is doing something intelligent. Once the naive model is included, it becomes clear that the neural network mainly replicates the behaviour of a random walk.

## 8. Trading oriented extensions

The present study focuses on statistical forecast accuracy of the level or return series. From a trading perspective, more interesting questions are related to direction, risk adjusted performance and turnover.

Two natural extensions are:

1. **Directional accuracy**

   Instead of evaluating the exact value of \(\hat y_{t+1}\), we can look at the sign. For each model define a directional forecast

   \[
   \hat s_{t+1} = \text{sign}(\hat y_{t+1}).
   \]

   We can then compute the hit rate

   \[
   \text{Hit rate} = \frac{1}{N} \sum_{t=1}^{N} \mathbf 1\{\hat s_{t+1} = \text{sign}(y_{t+1})\}.
   \]

   A hit rate significantly above 50 percent would be interesting even if RMSE differences are small.

2. **Simple trading strategy**

   Given model predictions we can construct a toy trading rule. For example, for each symbol:

   - Go long one unit if the model predicts positive return
   - Go to cash if the model predicts negative return

   With a position of one when long and zero when in cash, the strategy return on day \(t+1\) is

   \[
   R_{t+1}^{\text{strategy}} = \mathbf 1\{\hat s_{t+1} > 0\} \cdot r_{t+1}.
   \]

   We can cumulate these returns over the test period and compare the equity curves, Sharpe ratios and maximum drawdowns across models.

These extensions move the evaluation closer to what a practitioner cares about. However, transaction costs, slippage and position sizing would need to be incorporated before drawing strong conclusions about profitability.

For this project I keep the main focus on forecast error, but the trading extension is an attractive direction for future work and can be implemented inside the same pipeline using the saved predictions.

In [None]:
# example sketch, depends on how you saved predictions
# NOTE(review): the column list below is lowercase, but strategy_pnl filters on
# a "Model" column -- confirm the actual header names in the saved CSV.
preds = safe_read_csv(DATA_DIR / 'predictions_example.csv')  # columns: date, symbol, model, y_true, y_pred

def strategy_pnl(df, model_name):
    """Return the cumulative PnL of a sign-following rule for one model.

    Rows of df whose "Model" equals model_name are kept. The side is +1 where
    "y_pred" is strictly positive and -1 otherwise, and the per-row PnL is the
    side times "y_true". The result is the running sum of that PnL, as a
    Series named "pnl" on the filtered rows' index.
    """
    is_model = df["Model"] == model_name
    rows = df.loc[is_model].copy()
    long_mask = rows["y_pred"] > 0
    rows["side"] = 2 * long_mask.astype(int) - 1   # +1 or -1
    rows["pnl"] = rows["y_true"] * rows["side"]
    return rows["pnl"].cumsum()

# safe_read_csv returned None, so there is nothing to feed into strategy_pnl.
if preds is None:
    print('[Warning] predictions_example.csv not found; skipping example sketch')
# then plot for AAPL or SPY

## 8. Streamlit user interface

To make the models easier to explore I built a lightweight web application using Streamlit. The app wraps the same data and modelling pipeline in a simple graphical interface so that a user can select a symbol, choose a forecasting model and visualise the results without writing code.

### 8.1 Design and architecture

The app is implemented in `app/streamlit_app.py`. It relies on three main components:

1. **Data layer**  
   Historical prices for each symbol are loaded through the same helper functions used in the offline experiments. To avoid repeated downloads and improve responsiveness, the app uses Streamlit caching so that data for a given symbol is fetched only once per session.

2. **Model layer**  
   For the baseline models, the app recomputes naive and moving average forecasts on the fly. For the LSTM, the app loads a pre trained model for each symbol and generates predictions on the selected test window. This keeps the online computation lightweight because the neural network does not need to be re trained when the user changes settings.

3. **Presentation layer**  
   The user selects a symbol, date range and model through the sidebar. The main panel displays:
   * A time series chart of actual prices overlaid with the model forecast  
   * Basic error metrics such as MAE and RMSE for the current selection  
   * Optionally, a zoomed in view of the recent test period to highlight differences between models

The app also exposes a table of summary metrics computed in the offline diagnostics script so that a user can quickly see which model performs best for a given symbol.

### 8.2 Role in the project

The Streamlit interface does not change the underlying research results, but it plays two useful roles:

* It acts as a sanity check that the models behave as expected, because discrepancies and artefacts are easier to spot visually than in a table.  
* It provides an intuitive way for non technical users to interact with the models and see that, in this setting, the simple naive lag one forecast performs at least as well as the LSTM.

This kind of research-to-UI bridge is representative of how quantitative tools are often deployed inside a trading firm, where a back end research pipeline feeds into a front end used by portfolio managers and risk teams.

## 9. Conclusion and limitations

This project set out to test whether a univariate LSTM can outperform simple time series benchmarks for daily stock prediction on a realistic universe of liquid equities and indices.

Using data from 2010 to 2025 and an out of sample evaluation on the most recent years, I find that the naive lag one model, which simply copies the last observed value, uniformly achieves lower RMSE and MAE than the LSTM and moving average models. The five day moving average is usually second best, while the twenty day average and the LSTM perform worse.

The results support the view that daily prices of large liquid assets are very hard to predict from their own history alone. They also illustrate that deep learning models do not automatically generate alpha when applied to financial time series; careful feature design, richer input information and realistic evaluation are essential.

The main limitations of the current work are:

- Only univariate models are considered. Cross sectional and multi asset architectures could exploit additional structure.
- The feature set is restricted to past prices of each asset. No technical indicators, macro variables or volume information are included.
- The evaluation focuses on point forecast accuracy rather than trading performance net of costs.
- Hyper parameter tuning for the LSTM is intentionally simple to keep the project manageable.

These limitations point directly toward future work. Extending the framework to predict volatility, tail risk or cross sectional rankings, and combining price based features with macro and fundamental data, are promising directions for further research.

Despite these limitations, the project achieves its main objective: it builds an end to end pipeline for data collection, model training, evaluation and visualisation, and uses it to demonstrate that a widely promoted deep learning architecture does not outperform a well chosen classical benchmark in this setting. This is a valuable lesson for any aspiring quantitative researcher.


## 10. References

- Chevalier, G. (2018). LARNN: Linear Attention Recurrent Neural Network. arXiv preprint arXiv:1808.05578.
- Deepika, N., and Bhat, M. N. (2021). An efficient stock market prediction method based on Kalman filter. Journal of the Institution of Engineers Series B.
- Dhyani, B. (2020). Stock market forecasting technique using ARIMA model. International Journal of Recent Technology and Engineering.
- Various online resources on fundamental analysis and ARIMA modelling.

In [None]:
# Sketch of the per-symbol prediction loop; the model steps are placeholders.
# NOTE(review): load_price_history is not defined in the visible part of this
# notebook -- confirm where it is imported from before running this cell.
for symbol in UNIVERSE:
    df = load_price_history(symbol)
    # compute predictions for each model
    # y_test, y_pred_lstm, y_pred_naive, ...

In [None]:
import pandas as pd
from pathlib import Path

PRED_PATH = Path('data/notebooks/predictions_all_symbols.csv')
all_preds = []  # one small DataFrame per symbol

# Every notebook-level name the per-symbol frame below reads. Checking only
# y_test would let a missing y_pred_* surface as a NameError inside the loop.
required_names = ('y_test', 'y_pred_lstm', 'y_pred_naive', 'y_pred_ma5', 'y_pred_ma20')
missing_names = [name for name in required_names if name not in globals()]

if not UNIVERSE:
    print('[Warning] UNIVERSE is empty; skipping predictions aggregation')
elif missing_names:
    # Decided once, before any price history is loaded: nothing in the loop
    # body below defines these names, so the answer is the same for every symbol.
    print(f'[Warning] {", ".join(missing_names)} not defined in this notebook; skipping per-symbol prediction assembly')
else:
    for symbol in UNIVERSE:
        # Attempt to load symbol data; skip symbol on failure
        try:
            df = load_price_history(symbol)
        except Exception as e:
            print(f'[Warning] failed to load price history for {symbol}: {e}')
            continue

        # NOTE(review): y_test and the y_pred_* values are not recomputed from
        # df in this cell, so each symbol is assembled from whatever those
        # names currently hold -- confirm the per-symbol model step is run here.
        preds_sym = pd.DataFrame({
            'Date': y_test.index,
            'Symbol': symbol,
            'y_true': y_test.values,
            'LSTM': y_pred_lstm,
            'Naive lag1': y_pred_naive,
            'MA 5 day': y_pred_ma5,
            'MA 20 day': y_pred_ma20,
        })
        all_preds.append(preds_sym)

# Write the combined long-format table only when at least one symbol was assembled.
if not all_preds:
    print('[Warning] no predictions assembled; predictions_all_symbols.csv will not be created')
else:
    preds_all = pd.concat(all_preds, ignore_index=True)
    preds_all.to_csv(PRED_PATH, index=False)
    print(f'[Diagnostics] Saved predictions to {PRED_PATH}')

In [None]:
# Run the diagnostics module in a notebook-friendly way
import runpy, importlib, sys, pathlib
module_name = 'data.notebooks.02_model_diagnostics'
try:
    # prefer running via module import when the package is on sys.path
    importlib.import_module('data')
    runpy.run_module(module_name, run_name='__main__')
except ImportError as e:
    # Fallback: run the script by file path if the package import fails.
    # Only ImportError triggers the fallback; any other exception raised by the
    # diagnostics script itself propagates, so a real failure is neither
    # re-executed a second time nor reported as an import problem.
    script_path = pathlib.Path('data/notebooks/02_model_diagnostics.py')
    if script_path.exists():
        print(f'[Info] falling back to executing {script_path}')
        runpy.run_path(str(script_path), run_name='__main__')
    else:
        print(f'[Warning] could not import module {module_name} and {script_path} not found: {e}')

## 11. Simple directional trading strategy based on forecasts

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Same directory as the DATA_DIR defined in the first code cell.
DATA_DIR = Path("data/notebooks")
# NOTE(review): this uses plain pd.read_csv rather than safe_read_csv, so a
# missing file presumably raises here; the aggregation cell above only writes
# predictions_all_symbols.csv when at least one symbol was assembled.
preds = pd.read_csv(DATA_DIR / "predictions_all_symbols.csv", parse_dates=["Date"])

# Preview the first rows of the loaded table.
preds.head()

In [None]:
def evaluate_directional_strategy(df_symbol, model_col):
    """
    Evaluate a long-or-cash rule driven by the sign of a model's predictions.

    df_symbol: DataFrame for a single symbol, with columns
               Date, y_true, and model_col for predictions
    model_col: column name with predictions, e.g. "Naive lag1" or "LSTM"

    Returns (df, stats): df is a Date-sorted copy of df_symbol with added
    position, strategy_ret and equity columns; stats is a dict with keys
    hit_rate, avg_daily_ret, daily_vol, sharpe and final_equity.

    An empty df_symbol (for example a symbol with no saved predictions)
    yields final_equity 1.0, the starting equity, instead of raising
    IndexError; hit_rate, avg_daily_ret and daily_vol are NaN in that case.
    """
    df = df_symbol.sort_values("Date").copy()

    # position is 1 if predicted return > 0, else 0 (cash)
    df["position"] = (df[model_col] > 0).astype(int)

    # strategy return is position times realised return
    df["strategy_ret"] = df["position"] * df["y_true"]

    # cumulative equity curve (starting at 1)
    df["equity"] = (1 + df["strategy_ret"]).cumprod()

    # simple statistics
    # a hit is a row where prediction and realised value share a non-zero sign
    hit_rate = ((df["y_true"] * df[model_col]) > 0).mean()
    avg_ret = df["strategy_ret"].mean()
    vol = df["strategy_ret"].std()
    # vol is NaN with fewer than two rows; NaN > 0 is False, so sharpe is 0.0 then
    sharpe = avg_ret / vol * (252 ** 0.5) if vol > 0 else 0.0

    # With no rows there is no last equity value; the curve never left its start.
    final_equity = df["equity"].iloc[-1] if not df.empty else 1.0

    stats = {
        "hit_rate": hit_rate,
        "avg_daily_ret": avg_ret,
        "daily_vol": vol,
        "sharpe": sharpe,
        "final_equity": final_equity,
    }
    return df, stats

In [None]:
# Compare the long-only directional rule across models for one symbol.
symbol = "SPY"

# Rows of the combined predictions table that belong to the chosen symbol.
preds_spy = preds[preds["Symbol"] == symbol]

models_to_compare = ["Naive lag1", "LSTM", "MA 5 day"]

# model name -> stats dict returned by evaluate_directional_strategy
results = {}

plt.figure(figsize=(10, 5))

for model in models_to_compare:
    # The model's column is renamed to "pred" so the same column name is passed for every model.
    df_model, stats = evaluate_directional_strategy(preds_spy[["Date", "y_true", model]].rename(columns={model: "pred"}).assign(Symbol=symbol), "pred")
    # store stats
    results[model] = stats
    plt.plot(df_model["Date"], df_model["equity"], label=model)

plt.title(f"{symbol} directional strategy equity curves (long only)")
plt.xlabel("Date")
plt.ylabel("Equity (start = 1)")
plt.legend()
plt.tight_layout()
plt.show()

# One row per model, one column per statistic.
pd.DataFrame(results).T

The table above shows that, even when we convert forecasts into a simple long only trading rule, the performance differences between models remain modest. In this example the naive lag one forecast again performs at least as well as the LSTM in terms of both final equity and Sharpe ratio, once we ignore transaction costs.

This reinforces the earlier conclusion that, given only the past price of a single asset, complex sequence models do not automatically translate into superior trading performance.