In [52]:
!pip install yfinance pandas numpy scikit-learn sentence-transformers tqdm




In [53]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from datetime import datetime, timedelta

# ---- Configuration ----
HEADLINES_CSV = "/content/spy_news.csv"     # your Polygon-style file
TICKER = "SPY"                      # ETF proxy for S&P 500
START_DATE = "2024-01-01"
END_DATE   = "2024-12-31"

SMALL_MODEL = "all-MiniLM-L6-v2"
LARGE_MODEL = "sentence-transformers/all-mpnet-base-v2"
SMALL_PCA_DIM, LARGE_PCA_DIM = 12, 14
RANDOM_SEED = 42
AGG_METHOD = "mean"



In [54]:
price_df = yf.download(TICKER, start=START_DATE, end=END_DATE, progress=False)
price_df = price_df.reset_index()[["Date", "Open", "High", "Low", "Close", "Volume"]]
price_df.rename(columns={"Date": "date"}, inplace=True)
price_df["date"] = pd.to_datetime(price_df["date"]).dt.normalize()
# Drop the 'Ticker' level from the columns
price_df.columns = price_df.columns.droplevel(level=1)
print(f"✅ Downloaded {len(price_df)} trading days of {TICKER}")
display(price_df.head())

✅ Downloaded 251 trading days of SPY


  price_df = yf.download(TICKER, start=START_DATE, end=END_DATE, progress=False)


Price,date,Open,High,Low,Close,Volume
0,2024-01-02,462.1308,463.608735,460.496259,462.610382,123623700
1,2024-01-03,460.437547,461.181413,458.225572,458.832397,103585900
2,2024-01-04,458.352754,460.956257,457.129306,457.354431,84232200
3,2024-01-05,457.560037,460.447388,456.522555,457.980927,86118900
4,2024-01-08,458.479996,464.665759,458.352753,464.518951,74879100


In [55]:
def load_headlines(path):
    df = pd.read_csv(path)
    df["date"] = pd.to_datetime(df["published_utc"]).dt.tz_convert(None).dt.normalize()
    df = df[["date", "title"]].dropna(subset=["title"])
    print(f"Loaded {len(df)} headlines from {df['date'].min().date()} to {df['date'].max().date()}")
    return df

headlines_df = load_headlines(HEADLINES_CSV)
headlines_df.head()


Loaded 1000 headlines from 2024-04-02 to 2025-11-03


Unnamed: 0,date,title
0,2024-04-02,Historic S&P 500 Rally In Q1 Flashes Bullish S...
1,2024-04-02,"S&P 500's Q1 Standouts: Energy, Communication ..."
2,2024-04-02,Could 3 Powerful Fund Managers Take Control Of...
3,2024-04-03,"The $64,000 Question: Can The Stellar Stock Ma..."
4,2024-04-03,Is Invesco Russell 1000 Equal Weight ETF (EQAL...


In [56]:
def get_embeddings(model, texts, batch_size=64):
    all_vecs = []
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Embedding"):
        batch = texts[i:i+batch_size]
        vecs = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        all_vecs.append(vecs)
    return np.vstack(all_vecs)

In [57]:
def make_daily_pca(model_name, n_components, headlines_df):
    model = SentenceTransformer(model_name)
    texts = headlines_df["title"].tolist()
    emb = get_embeddings(model, texts)

    pca = PCA(n_components=n_components, random_state=RANDOM_SEED)
    reduced = pca.fit_transform(emb)
    reduced_df = pd.DataFrame(reduced, columns=[f"pca_{i+1}" for i in range(n_components)])
    reduced_df["date"] = headlines_df["date"].values

    daily = reduced_df.groupby("date").agg(AGG_METHOD).reset_index()
    return daily, pca


In [58]:
def align_with_prices(pca_df, price_df):
    full_dates = pd.date_range(price_df["date"].min(), price_df["date"].max(), freq="D")
    pca_df = pca_df.set_index("date").reindex(full_dates)
    pca_df.index.name = "date"
    pca_df = pca_df.fillna(0.0)
    pca_df["has_news"] = (pca_df.sum(axis=1) != 0).astype(int)
    return pca_df.reset_index()


In [59]:
# --- Small model ---
small_daily, pca_small = make_daily_pca(SMALL_MODEL, SMALL_PCA_DIM, headlines_df)
small_full = align_with_prices(small_daily, price_df)
small_full.head()


Embedding: 100%|██████████| 16/16 [00:38<00:00,  2.42s/it]


Unnamed: 0,date,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,has_news
0,2024-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2024-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2024-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2024-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2024-01-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [60]:
# --- Large model ---
large_daily, pca_large = make_daily_pca(LARGE_MODEL, LARGE_PCA_DIM, headlines_df)
large_full = align_with_prices(large_daily, price_df)
large_full.head()


Embedding: 100%|██████████| 16/16 [01:45<00:00,  6.61s/it]


Unnamed: 0,date,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,pca_13,pca_14,has_news
0,2024-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2024-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2024-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2024-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2024-01-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [61]:
merged_df = price_df.merge(small_full, on="date", how="left", suffixes=("", "_small"))
merged_df = merged_df.merge(large_full, on="date", how="left", suffixes=("", "_large"))
merged_df = merged_df.fillna(0.0)
print(f"Merged dataset shape: {merged_df.shape}")
merged_df.head()


Merged dataset shape: (251, 34)


Unnamed: 0,date,Open,High,Low,Close,Volume,pca_1,pca_2,pca_3,pca_4,...,pca_6_large,pca_7_large,pca_8_large,pca_9_large,pca_10_large,pca_11_large,pca_12_large,pca_13,pca_14,has_news_large
0,2024-01-02,462.1308,463.608735,460.496259,462.610382,123623700,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2024-01-03,460.437547,461.181413,458.225572,458.832397,103585900,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2024-01-04,458.352754,460.956257,457.129306,457.354431,84232200,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2024-01-05,457.560037,460.447388,456.522555,457.980927,86118900,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2024-01-08,458.479996,464.665759,458.352753,464.518951,74879100,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [62]:
merged_df.to_csv("features_1yr_SPY.csv", index=False)
print("✅ Saved one-year combined feature file: features_1yr_SPY.csv")


✅ Saved one-year combined feature file: features_1yr_SPY.csv


# Task
Train an Echo State Network (ESN) model on the merged time series data, predict values for 1-day and 7-day horizons, and evaluate the model's performance.

## Prepare data

### Subtask:
Split the merged data into training and testing sets.


**Reasoning**:
Define features and target variables, then split the data chronologically into training and testing sets.



In [63]:
features = [col for col in merged_df.columns if col not in ["date", "Close"]]
target = "Close"

split_date = merged_df["date"].iloc[int(len(merged_df) * 0.8)] # 80% for training

train_df = merged_df[merged_df["date"] < split_date]
test_df = merged_df[merged_df["date"] >= split_date]

X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]

print(f"Training set shape: {train_df.shape}")
print(f"Testing set shape: {test_df.shape}")

Training set shape: (200, 34)
Testing set shape: (51, 34)
