# Notebook initialization

Run this cell to create all helper files and download data

In [6]:
%pip install -U pandas numpy matplotlib seaborn requests scikit-learn tqdm pyarrow --break-system-packages

Note: you may need to restart the kernel to use updated packages.


c:\Users\Michael\Code\ML-PortfolioManagement\.venv\Scripts\python.exe: No module named pip


In [7]:
%%writefile data_processing.py
"""
data_processing.py

Utility functions for:
- Loading and processing Fed-related CSV files into a single ML-ready DataFrame
- Optionally downloading Fed CSV files from GitHub into ./data/fed_csv
- Loading and processing S&P 500 OHLCV data

This module is generated dynamically inside the notebook
to comply with the "no external .py files" requirement.
"""

import os
import glob
import re
from pathlib import Path

import pandas as pd
import numpy as np
import requests
import yfinance as yf

# ---------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------

# Local directory where Fed CSV files are stored
DATA_DIR = "data/fed_csv"

# GitHub repo configuration (optional, for reproducibility)
REPO_OWNER = "MichaelCarloH"
REPO_NAME = "ML-PortfolioManagement"
FOLDER_PATH = "data/fed_csv"  # path inside the repo (no leading/trailing slash)


# ---------------------------------------------------------------------
# Utilities
# ---------------------------------------------------------------------

def _log(msg: str, verbose: bool) -> None:
    if verbose:
        print(msg)


def ensure_data_dir(path: str = DATA_DIR) -> None:
    """
    Ensure that the local data directory exists.
    """
    os.makedirs(path, exist_ok=True)


def fetch_csv_from_github_folder(
    repo_owner: str = REPO_OWNER,
    repo_name: str = REPO_NAME,
    folder_path: str = FOLDER_PATH,
    local_dir: str = DATA_DIR,
    verbose: bool = False,
) -> bool:
    """
    Try to fetch all .csv files from a GitHub folder via the GitHub API
    and save them into `local_dir`.

    Returns
    -------
    bool
        True if at least one CSV was downloaded, False otherwise.
    """
    ensure_data_dir(local_dir)

    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{folder_path}"
    response = requests.get(api_url)

    if response.status_code != 200:
        _log(
            f"[INFO] Could not access GitHub folder ({response.status_code}). "
            "Proceeding without remote download.",
            verbose,
        )
        return False

    items = response.json()
    csv_files = [item for item in items if item["name"].endswith(".csv")]

    if not csv_files:
        _log("[INFO] No CSV files found in the specified GitHub folder.", verbose)
        return False

    downloaded_any = False
    for item in csv_files:
        download_url = item["download_url"]
        filename = item["name"]
        local_path = os.path.join(local_dir, filename)

        file_response = requests.get(download_url)
        if file_response.status_code == 200:
            with open(local_path, "wb") as f:
                f.write(file_response.content)
            downloaded_any = True
            _log(f"Downloaded: {filename}", verbose)
        else:
            _log(
                f"[WARN] Failed to download {filename}: {file_response.status_code}",
                verbose,
            )

    return downloaded_any


# ---------------------------------------------------------------------
# Fed CSV processing
# ---------------------------------------------------------------------

def load_all_csvs(
    local_dir: str = DATA_DIR,
    pattern: str = "*.csv",
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Load, merge, and process all Fed CSVs into an ML-ready, forward-filled dataframe.

    Steps:
    - Reads all CSVs in `local_dir` matching `pattern`.
    - Extracts date and outcome columns.
    - Merges all files into one unified dataframe.
    - Converts to ML-wide format with clean feature names.
    - Saves:
        - data/fed_events_merged.csv
        - data/fed_events_ml_ready.csv
        - data/fed_events_ml_ready_ffill.csv
    - Returns:
        - Forward-filled ML-ready DataFrame (fed_events_ml_ready_ffill.csv).
    """

    def log(msg: str):
        if verbose:
            print(msg)

    ensure_data_dir(local_dir)

    # --- Step 1: find CSV files ---
    fed_csv_path = Path(local_dir)
    csv_files = list(fed_csv_path.glob(pattern))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {fed_csv_path} matching {pattern}")
    log(f"Found {len(csv_files)} CSV files")

    # --- Step 2: process each file ---
    all_dataframes = []
    for csv_file in csv_files:
        title = csv_file.stem
        log(f"Processing: {title}")

        try:
            df = pd.read_csv(csv_file)

            # Identify date column
            date_col = next(
                (c for c in df.columns if "date" in c.lower() and "utc" in c.lower()),
                None,
            )
            if date_col is None:
                log(f"  ⚠ No date column found, skipping {title}")
                continue

            # Identify outcome columns
            outcome_cols = [
                c for c in df.columns
                if c != date_col and "timestamp" not in c.lower()
            ]
            if not outcome_cols:
                log(f"  ⚠ No outcome columns found, skipping {title}")
                continue

            # Convert data types
            df[date_col] = pd.to_datetime(
                df[date_col],
                format="%m-%d-%Y %H:%M",
                errors="coerce",
            )
            for c in outcome_cols:
                df[c] = pd.to_numeric(df[c], errors="coerce")

            # Merge same-date rows: first non-null per column
            merged_df = df.groupby(date_col).agg({
                c: (lambda x: x.dropna().iloc[0] if len(x.dropna()) > 0 else None)
                for c in outcome_cols
            }).reset_index()

            merged_df["Title"] = title
            merged_df = merged_df.rename(columns={date_col: "Date"})
            merged_df = merged_df[["Date", "Title"] + outcome_cols]
            all_dataframes.append(merged_df)
            log(f"  ✓ {len(merged_df)} unique dates")

        except Exception as e:
            log(f"  ✗ Error processing {title}: {e}")

    if not all_dataframes:
        raise RuntimeError("No valid CSVs processed.")

    # --- Step 3: merge all dataframes ---
    log("Merging all dataframes...")
    all_outcome_cols = sorted(list({
        c
        for df in all_dataframes
        for c in df.columns
        if c not in ["Date", "Title"]
    }))

    final_dfs = []
    for df in all_dataframes:
        for c in all_outcome_cols:
            if c not in df.columns:
                df[c] = None
        final_dfs.append(df[["Date", "Title"] + all_outcome_cols])

    final_df = (
        pd.concat(final_dfs, ignore_index=True)
        .sort_values(["Date", "Title"])
        .reset_index(drop=True)
    )

    os.makedirs("data", exist_ok=True)
    merged_path = Path("data/fed_events_merged.csv")
    final_df.to_csv(merged_path, index=False)
    log(f"Saved merged dataframe to: {merged_path}")

    # --- Step 4: long format ---
    id_vars = ["Date", "Title"]
    value_cols = [c for c in final_df.columns if c not in id_vars]

    df_long = pd.melt(
        final_df,
        id_vars=id_vars,
        value_vars=value_cols,
        var_name="Outcome",
        value_name="Probability",
    ).dropna(subset=["Probability"])

    # --- Step 5: clean feature names ---
    def sanitize_feature_name(title: str, outcome: str) -> str:
        feature = f"{title}_{outcome}"
        feature = re.sub(r"[^a-zA-Z0-9_\s]", "", feature)
        feature = re.sub(r"\s+", "_", feature)
        feature = re.sub(r"_+", "_", feature).strip("_")
        return feature

    df_long["Feature"] = df_long.apply(
        lambda r: sanitize_feature_name(r["Title"], r["Outcome"]),
        axis=1,
    )

    # --- Step 6: pivot to ML-wide ---
    df_ml = df_long.pivot_table(
        index="Date",
        columns="Feature",
        values="Probability",
        aggfunc="first",
    ).reset_index()

    cols = ["Date"] + sorted([c for c in df_ml.columns if c != "Date"])
    df_ml = df_ml[cols]

    ml_ready_path = Path("data/fed_events_ml_ready.csv")
    df_ml.to_csv(ml_ready_path, index=False)
    log(f"Saved ML-ready dataframe to: {ml_ready_path}")

    # --- Step 7: forward-fill version ---
    df_ml_ffill = df_ml.sort_values("Date").copy()
    df_ml_ffill.iloc[:, 1:] = df_ml_ffill.iloc[:, 1:].ffill().fillna(0)

    ffill_path = Path("data/fed_events_ml_ready_ffill.csv")
    df_ml_ffill.to_csv(ffill_path, index=False)
    log(f"Saved forward-filled dataframe to: {ffill_path}")

    # --- Step 8: summary (optional) ---
    log("=== FED DATA SUMMARY ===")
    log(f"Rows (dates): {len(df_ml_ffill)}")
    log(f"Features: {len(df_ml_ffill.columns) - 1}")
    log(
        f"Date range: {df_ml_ffill['Date'].min()} → {df_ml_ffill['Date'].max()}"
    )

    return df_ml_ffill


def get_fed_data(verbose: bool = False) -> pd.DataFrame:
    """
    High-level convenience function for the notebook.

    - Tries to load processed data from local ./data/fed_csv.
    - If no raw CSVs exist, optionally tries to download them from GitHub.
    - Returns the final forward-filled ML-ready dataframe.
    """
    ensure_data_dir(DATA_DIR)

    # 1) Check for existing local CSVs
    csv_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))

    # 2) If none, try GitHub (non-fatal if it fails)
    if not csv_files:
        _log(
            "No local Fed CSV files found in ./data/fed_csv. "
            "Attempting to download from GitHub...",
            verbose,
        )
        ok = fetch_csv_from_github_folder(verbose=verbose)
        if not ok:
            raise FileNotFoundError(
                "No local Fed CSVs and GitHub download failed. "
                "Ensure ./data/fed_csv contains the required files "
                "in the submitted project."
            )

    # 3) Build and return processed dataframe
    return load_all_csvs(local_dir=DATA_DIR, verbose=verbose)


# ---------------------------------------------------------------------
# S&P 500 loader
# ---------------------------------------------------------------------

def load_sp500_data(
    ticker: str = "^GSPC",
    start: str = "2023-01-01",
    end: str = None,
    output_path: str = "data/sp500_ohlcv_returns.csv",
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Download, process, and save S&P 500 OHLCV and derived metrics.

    Returns a dataframe indexed by Date (UTC) with:
    - OHLCV
    - Daily_Return, Log_Return
    - High/Low and Open/Close ranges
    - Volume_MA_20, Volume_Ratio
    - Price_MA_20, Price_MA_50
    - Volatility_20
    """

    def log(msg: str):
        if verbose:
            print(msg)

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    log(f"Downloading {ticker} from Yahoo Finance...")
    spx = yf.download(
        ticker,
        start=start,
        end=end,
        progress=False,
        auto_adjust=True,
    )

    if spx.empty:
        raise RuntimeError(f"No data returned for {ticker} from Yahoo Finance.")

    # Flatten multi-index columns if present
    if isinstance(spx.columns, pd.MultiIndex):
        spx.columns = spx.columns.get_level_values(0)

    # Metrics
    spx["Daily_Return"] = spx["Close"].pct_change()
    spx["Log_Return"] = np.log(spx["Close"] / spx["Close"].shift(1))
    spx["High_Low_Range"] = (spx["High"] - spx["Low"]) / spx["Close"]
    spx["Open_Close_Range"] = (spx["Open"] - spx["Close"]).abs() / spx["Close"]
    spx["Volume_MA_20"] = spx["Volume"].rolling(20).mean()
    spx["Volume_Ratio"] = spx["Volume"] / spx["Volume_MA_20"]
    spx["Price_MA_20"] = spx["Close"].rolling(20).mean()
    spx["Price_MA_50"] = spx["Close"].rolling(50).mean()
    spx["Volatility_20"] = spx["Daily_Return"].rolling(20).std()

    cols_to_keep = [
        "Open", "High", "Low", "Close", "Volume",
        "Daily_Return", "Log_Return",
        "High_Low_Range", "Open_Close_Range",
        "Volume_MA_20", "Volume_Ratio",
        "Price_MA_20", "Price_MA_50", "Volatility_20",
    ]
    available = [c for c in cols_to_keep if c in spx.columns]

    daily = spx[available].dropna().copy()
    daily.index.name = "Date (UTC)"

    daily.to_csv(output_path)
    log(f"Saved S&P 500 data to: {output_path.resolve()}")

    if verbose:
        log(f"Rows: {len(daily)}")
        log(f"Date range: {daily.index.min()} → {daily.index.max()}")

    return daily


Overwriting data_processing.py


# Guidelines

We outline a structured approach for presenting research findings. The framework is divided into several key segments:

1. Introduction
1. Dataset overview
1. Analytics and learning strategies
1. Empirical resuts: baseline and robustness 
1. Conclusion

The opening segment encompasses four essential elements:

- Contextual Background: What is the larger setting of the study? What makes this area of inquiry compelling? What are the existing gaps or limitations within the current body of research? What are some unanswered yet noteworthy questions?

- Project Contributions: What are the specific advancements made by this study, such as in data acquisition, algorithmic development, parameter adjustments, etc.?

- Summary of the main empirical results: What is the main statistical statement? is it significant (e.g. statistically or economically)? 

- Literature and Resource Citations: What are related academic papers? What are the github repositories, expert blogs, or software packages that used in this project? 

In the dataset profile, one should consider:

- The origin and composition of data utilized in the study. If the dataset is original, then provide the source code to ensure reproducibility.

- The chronological accuracy of the data points, verifying that the dates reflect the actual availability of information.

- A detailed analysis of descriptive statistics, with an emphasis on discussing the importance of the chosen graphs or metrics.

The analytics and machine learning methodologies section accounts for:

- A detailed explanation of the foundational algorithm.

- A description of the data partitioning strategy for training, validation and test.

- An overview of the parameter selection and optimization process.

To effectively convey the empirical findings, separate the baseline results from the additional robustness tests. Within the primary empirical outcomes portion, include:

- Key statistical evaluations (for instance, if presenting a backtest – provide a pnl graph alongside the Sharpe ratio).

- Insights into what primarily influences the results, such as specific characteristics or assets that significantly impact performance.

The robustness of empirical tests section should detail:

- Evaluation of the stability of the principal finding against variations in hyperparameters or algorithmic modifications.

Finally, the conclusive synthesis should recapitulate the primary findings, consider external elements that may influence the results, and hint at potential directions for further investigative work.

# Introduction

In this project of Machine Learning for Portfolio Managment and Trading, we are tasked to loosely investigate the effect of central banks policies on the market. In our proposed approach we look at betting data, specifically on FED decisions and seek to develop a tradable strategy. The data is sourced from Polymarket which is a decentralized prediction market platform that uses blockchain technology to let users bet on the outcomes of real-world events. 


## Contextual Background
- **Macro-financial setting:** The project operates at the intersection of monetary-policy expectations and equity-market positioning, focusing on how Federal Open Market Committee (FOMC) decisions ripple through the S&P 500. Polymarket prediction markets provide high-frequency probability updates on rate cuts and leadership scenarios,furthermore they operate perpetually not being restricted to traditional marker hours.
- **Motivation and gaps:** Prior academic and practitioner research documents that option-implied probabilities and macro-news sentiment foreshadow FOMC-day volatility, yet public prediction-market data remains underexplored. Existing studies either rely on proprietary dealer quotes or low-frequency survey data, leaving an opportunity to test whether crowd-sourced probability surfaces encode tradable signals.
- **Open questions:** Key open questions include (i) how quickly prediction-market probabilities anticipate rate moves, (ii) whether the implied probabilities add incremental signal once traditional macro data is controlled for, and (iii) the robustness of any trading edge once transaction costs and alternate market regimes are considered.

## Project Contributions
- **Integrated data acquisition:** Consolidated raw Polymarket CSV exports and manually curated Federal Reserve event datasets into harmonised, analysis-ready tables (e.g., `polymarket_fed_events_and_markets.csv`, `fed_events_ml_ready_ffill.csv`).
- **Feature engineering pipeline:** Produced merged S&P 500 and Polymarket features with forward-filled probability fields, calendar event encodings, and target variables capturing post-announcement returns (`sp500_fed_ml_ready_with_targets.csv`).
- **Model experimentation:** Implemented data loaders (`dataloaders/s&p_loader.py`) and notebooks (`notebooks/fed_probability_prediction.ipynb`) to evaluate supervised models that translate probability movements into trading rules, including Sharpe-ratio oriented strategy evaluations in `rate_cut_strategy_sharpe_analysis.csv` outputs.
- **Backtesting utilities:** Generated out-of-sample trade logs (`notebooks/outputs/trades_oos.csv`) that benchmark polymarket-informed strategies against baseline macro strategies.

## Summary of Empirical Results
- **Performance snapshot:** The `notebooks/outputs/trades_all_strategies.csv` output indicates that probability-driven strategies deliver positive average returns with improved risk-adjusted performance relative to naive benchmarks, as evidenced by Sharpe ratio calculations stored in `data/s&p_data/rate_cut_strategy_sharpe_analysis.csv`.
- **Economic significance:** Strategy evaluations demonstrate economically meaningful improvements in Sharpe ratios when prediction-market features are incorporated, highlighting the potential of crowd-sourced expectations to complement conventional macro indicators.

## Literature and Resource Citations
- **Related research:**
  - Andersen, T. G., Bollerslev, T., Diebold, F. X., & Vega, C. (2007). *Real-Time Price Discovery in Stock, Bond and Foreign Exchange Markets*.
  - Lucca, D. O., & Moench, E. (2015). *The Pre-FOMC Announcement Drift*.
  - Wolfers, J., & Zitzewitz, E. (2006). *Prediction Markets in Theory and Practice*.
- **Software and repositories:**
  - [Polymarket API & CSV exports](https://polymarket.com/)
  - [Pandas](https://pandas.pydata.org/), [scikit-learn](https://scikit-learn.org/), and related Python data-science libraries (dependencies managed via `uv` as specified in `uv.toml`).

## Dataset Profile
- **Source composition:**
  - **Polymarket market histories:** Stored under `data/s&p_data/`, including `polymarket-price-data-24-07-2025-24-10-2025-1761316761596.csv` and the merged `polymarket_fed_events_and_markets.csv` file.
  - **Federal Reserve event catalogues:** CSV files in `data/fed_csv/` and curated aggregations such as `fed_events_merged.csv`, providing policy-meeting metadata and outcome annotations.
  - **Market benchmarks:** S&P 500 price and returns series within `data/s&p_data/sp500_ohlcv_returns.csv` and derived merges (`sp500_fed_merged_ml_ready.csv`).
- **Processing workflow:** The `notebooks/data_exploration.ipynb` notebook surveys raw distributions, while `notebooks/data_processing.ipynb` applies cleaning, forward-filling, and target construction routines. Reproducible processing steps are scripted through reusable loaders in `dataloaders/s&p_loader.py`, ensuring downstream model notebooks operate on consistent feature matrices.
- **Reproducibility notes:** Raw Polymarket CSVs and Fed event files are preserved in the repository. Transformation logic is encapsulated in notebooks and loaders, enabling regeneration of the merged dataset. When extending the dataset, run the processing notebook or invoke the loader module to re-create `sp500_fed_ml_ready_with_targets.csv` from the raw components.

# Data

The data used is Polymarket betting quotes on several Fed related topics, csv data can be downloaded at: https://polymarket.com/search?_q=fed

In this notebook we point to the Github repository where static data obtained at the time of the analysis is located and can be used to reproduce the results of this project


In [8]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from data_processing import get_fed_data

fed_df = get_fed_data()
fed_df.head()

  pd.concat(final_dfs, ignore_index=True)


Feature,Date,Fed_Interest_Rates_November_2024_25_bps_decrease,Fed_Interest_Rates_November_2024_25_bps_increase,Fed_Interest_Rates_November_2024_50_bps_decrease,Fed_Interest_Rates_November_2024_75_bps_decrease,Fed_Interest_Rates_November_2024_No_Change,Fed_Interest_Rates_November_2024_Other,Fed_abolished_in_2025_Price,Fed_decision_in_December_25_bps_decrease,Fed_decision_in_December_25_bps_increase,...,Who_will_Trump_nominate_as_Fed_Chair_Larry_Lindsey,Who_will_Trump_nominate_as_Fed_Chair_Lorie_K_Logan,Who_will_Trump_nominate_as_Fed_Chair_Marc_Sumerlin,Who_will_Trump_nominate_as_Fed_Chair_Michelle_Bowman,Who_will_Trump_nominate_as_Fed_Chair_No_one_nominated_before_2027,Who_will_Trump_nominate_as_Fed_Chair_Philip_Jefferson,Who_will_Trump_nominate_as_Fed_Chair_Rick_Rieder,Who_will_Trump_nominate_as_Fed_Chair_Ron_Paul,Who_will_Trump_nominate_as_Fed_Chair_Scott_Bessent,Who_will_Trump_nominate_as_Fed_Chair_Stephen_Miran
0,2024-08-03,0.315,0.025,0.27,0.125,0.215,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2024-08-04,0.37,0.0225,0.265,0.125,0.215,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2024-08-05,0.355,0.0225,0.275,0.125,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2024-08-06,0.345,0.011,0.325,0.11,0.17,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2024-08-07,0.38,0.0125,0.275,0.105,0.13,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Merging Fed time series with S&P500 time series

In [9]:
from data_processing import load_sp500_data
spx = load_sp500_data(verbose=False)


In [11]:
# Standardize S&P 500 date column
spx = spx.reset_index().rename(columns={"Date (UTC)": "Date"})
fed_df["Date"] = pd.to_datetime(fed_df["Date"])
spx["Date"] = pd.to_datetime(spx["Date"])

# --------------------------------------------------
# 2. Full outer merge on Date (keep all dates)
# --------------------------------------------------
merged_df = pd.merge(
    fed_df,
    spx,
    on="Date",
    how="outer",    # full join
    sort=True
)

# Fill all non-date missing values with zeros
feature_cols = [c for c in merged_df.columns if c != "Date"]
merged_df[feature_cols] = merged_df[feature_cols].fillna(0)

# Sort chronologically
merged_df = merged_df.sort_values("Date").reset_index(drop=True)

merged_df

Unnamed: 0,Date,Fed_Interest_Rates_November_2024_25_bps_decrease,Fed_Interest_Rates_November_2024_25_bps_increase,Fed_Interest_Rates_November_2024_50_bps_decrease,Fed_Interest_Rates_November_2024_75_bps_decrease,Fed_Interest_Rates_November_2024_No_Change,Fed_Interest_Rates_November_2024_Other,Fed_abolished_in_2025_Price,Fed_decision_in_December_25_bps_decrease,Fed_decision_in_December_25_bps_increase,...,Volume,Daily_Return,Log_Return,High_Low_Range,Open_Close_Range,Volume_MA_20,Volume_Ratio,Price_MA_20,Price_MA_50,Volatility_20
0,2023-03-15 00:00:00,0.0000,0.000,0.000,0.000,0.000,0.0000,0.00,0.000,0.0000,...,6.594010e+09,-0.006981,-0.007005,0.014394,0.003903,4.499821e+09,1.465394,3984.615515,4003.584209,0.010362
1,2023-03-16 00:00:00,0.0000,0.000,0.000,0.000,0.000,0.0000,0.00,0.000,0.0000,...,5.695790e+09,0.017562,0.017410,0.025339,0.020542,4.580812e+09,1.243402,3975.249512,4006.307012,0.011282
2,2023-03-17 00:00:00,0.0000,0.000,0.000,0.000,0.000,0.0000,0.00,0.000,0.0000,...,9.354280e+09,-0.011019,-0.011081,0.014717,0.010736,4.841342e+09,1.932167,3966.561011,4007.580410,0.011149
3,2023-03-20 00:00:00,0.0000,0.000,0.000,0.000,0.000,0.0000,0.00,0.000,0.0000,...,5.347140e+09,0.008918,0.008879,0.010054,0.008630,4.906426e+09,1.089824,3960.185010,4010.449810,0.011415
4,2023-03-21 00:00:00,0.0000,0.000,0.000,0.000,0.000,0.0000,0.00,0.000,0.0000,...,4.920240e+09,0.012982,0.012899,0.009466,0.006740,4.946358e+09,0.994720,3960.461511,4012.605610,0.010976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
801,2025-11-06 17:47:00,0.9535,0.003,0.021,0.003,0.025,0.0015,0.01,0.715,0.0125,...,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000
802,2025-11-06 17:48:00,0.9535,0.003,0.021,0.003,0.025,0.0015,0.01,0.715,0.0125,...,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000
803,2025-11-06 17:49:00,0.9535,0.003,0.021,0.003,0.025,0.0015,0.01,0.715,0.0125,...,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000
804,2025-11-06 17:50:00,0.9535,0.003,0.021,0.003,0.025,0.0015,0.01,0.715,0.0125,...,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000


# Indicator fo fed market sentiment shift

In this section we aim to develop an indicator which can gain insights from the fed dataset we obtained. The dataset is composed of multiple bets which are vaailable at different times, since polymarket has been growing so has the volume and number of bets vaailable. We look at the variation in probability for all events. Example at one time we might have 25bps decrease in november, december, january and 50 bs decrease etc. Now we ocunt how many deltas were positive at each time instance and we follwo this coint for each time step then we apply moving averages to compare short term change and long term pricing of probabilioties and we develop a trading startegy based on corssing of such moving avergaes

## 4. Fed Market Sentiment Indicator — Construction, Logic & Interpretation

This section introduces a **market-implied sentiment indicator** based on Polymarket probabilities for Federal Reserve interest rate outcomes.  
The goal is to capture shifts in expectations — *how dovish or hawkish the market is becoming* — and study how these shifts relate to the S&P 500.

---

### 4.1. Background and Motivation

Polymarket lists multiple prediction markets for each Fed meeting — for example:
- `..._25_bps_decrease`
- `..._50_bps_decrease`
- `..._No_Change`
- `..._25_bps_increase`

Each column represents the **probability** that a given rate outcome will occur.  
As macroeconomic data, speeches, or risk events unfold, these probabilities move — revealing the market’s evolving beliefs about monetary policy.

Our goal is to summarize these changes into one interpretable metric — the **Net BPS Sentiment Indicator** — which measures whether the overall expectation is shifting *dovish* (toward cuts) or *hawkish* (toward hikes).

---

### 4.2. Step-by-Step Construction

At each time step \( t \) and for each outcome \( j \):

1. **Compute the change in probability**  
   \[
   \Delta p_{t,j} = p_{t,j} - p_{t-1,j}
   \]
   A positive \(\Delta p_{t,j}\) means the market is assigning more likelihood to that outcome today than yesterday.

2. **Assign a directional sign (economic interpretation)**  
   Each contract is labeled as either **dovish** or **hawkish**:
   \[
   w_j =
   \begin{cases}
   +1, & \text{if the outcome is dovish (rate cut)} \\
   -1, & \text{if the outcome is hawkish (rate hike or no change)}
   \end{cases}
   \]

   - A rise in the probability of a **rate cut** → dovish (+).  
   - A rise in the probability of a **rate hike or no change** → hawkish (–).

3. **Compute the Net Sentiment Count**
   \[
   S_t = \sum_j w_j \cdot \text{sign}(\Delta p_{t,j})
   \]

   - \( S_t > 0 \): the market shifted dovish (pricing more cuts).  
   - \( S_t < 0 \): the market shifted hawkish (pricing fewer cuts or hikes).

4. **Apply Smoothing (Moving Averages)**  
   Since \(S_t\) fluctuates rapidly, we apply:
   - **5-day MA:** captures short-term sentiment shifts.
   - **10-day MA:** captures the medium-term trend.

   The crossover between these smoothed signals helps identify persistent turning points in monetary sentiment.

---

### 4.3. Why the Indicator Oscillates

The raw \( S_t \) often swings sharply. This is expected because:

- **Contracts are interdependent.**  
  If the “25 bps cut” probability rises, the “no change” and “25 bps hike” probabilities must fall.
- **Market depth evolves.**  
  As Polymarket grows, more events and meetings appear, increasing the count of active contracts.
- **Data updates asynchronously.**  
  Not all markets update simultaneously, introducing short-term noise.

To filter this, we smooth \( S_t \) with moving averages, allowing focus on *persistent directional shifts* rather than micro movements.

---

### 4.4. Interpreting the Chart

Below, we plot the **S&P 500** (left axis, blue) against the **Net BPS Sentiment Indicator** (right axis, red):

- **Blue line:** S&P 500 close price.  
- **Dashed blue line:** 20-day moving average of S&P 500.  
- **Gray line:** Raw sentiment signal \( S_t \).  
- **Red lines:** 5-day and 10-day moving averages of \( S_t \).

**Interpretation:**
- Rising red lines → *Dovish momentum* — markets pricing more cuts or looser policy.  
- Falling red lines → *Hawkish momentum* — markets pricing hikes or fewer cuts.  
- Dovish phases often align with **rising equities**, while hawkish phases coincide with **equity pullbacks**.

---

### 4.5. Economic Intuition

The indicator captures the **direction of probability flow** among rate outcomes — essentially, how the market is repricing the expected *path* of policy rates.  

- When \( S_t > 0 \), traders are collectively shifting toward **easier monetary policy expectations**.  
- When \( S_t < 0 \), expectations are tightening, signaling **higher rates or delayed cuts**.

This makes the indicator a **forward-looking macro sentiment index** — derived not from opinions or news, but from *real-money pricing of monetary policy expectations*.
