In [22]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import display

# Locate the repository root: the nearest directory, starting at the current
# working directory and walking upward, that contains a `src` entry.
start_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / "src").exists()
    ),
    None,
)
if project_root is None:
    # Fail fast with an actionable message. Otherwise the search would end at
    # the filesystem root, and the import and config paths below would break
    # in far less obvious ways.
    raise FileNotFoundError(
        f"Could not find a 'src' directory in {start_dir} or any of its parents; "
        "run this notebook from inside the repository checkout."
    )

# Make the project's packages importable without requiring an installed copy.
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

from pairs_trading_etf.data.universe import load_configured_universe
from pairs_trading_etf.pipelines.pair_scan import PairScanConfig, run_pair_scan

# Use seaborn's "whitegrid" theme for every figure drawn in this notebook.
sns.set_theme(style="whitegrid")

# Input file locations, all built from the discovered project root.
CONFIG_PATH = project_root.joinpath("configs", "data.yaml")
PRICE_PATH = project_root.joinpath("data", "raw", "etf_prices.csv")
METADATA_PATH = project_root.joinpath("configs", "etf_metadata.yaml")

# Echo the resolved paths as the cell output for a quick visual check.
(CONFIG_PATH, PRICE_PATH, METADATA_PATH)

(WindowsPath('i:/Winter-Break-Research/configs/data.yaml'),
 WindowsPath('i:/Winter-Break-Research/data/raw/etf_prices.csv'),
 WindowsPath('i:/Winter-Break-Research/configs/etf_metadata.yaml'))

In [23]:
# Load the configured universe, passing the metadata file alongside the config.
universe = load_configured_universe(CONFIG_PATH, metadata_path=METADATA_PATH)
print(f"Universe '{universe.name}' with {len(universe.tickers)} tickers")

# Flatten the metadata mapping into one record per ticker. A missing or empty
# mapping simply produces no records.
metadata_records = [
    {
        "ticker": symbol,
        "name": info.name,
        "sector": info.sector,
        "issuer": info.issuer,
        "expense_ratio": info.expense_ratio,
    }
    for symbol, info in (universe.metadata or {}).items()
]

if metadata_records:
    metadata_df = pd.DataFrame(metadata_records).set_index("ticker").sort_index()
else:
    # With no records there is no "ticker" column to index on, so fall back
    # to an empty frame.
    metadata_df = pd.DataFrame()

metadata_df.head(10)

Universe 'core_sectors' with 6 tickers


Unnamed: 0_level_0,name,sector,issuer,expense_ratio
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IYW,iShares U.S. Technology ETF,Technology,BlackRock,0.39
VFH,Vanguard Financials ETF,Financials,Vanguard,0.1
XLE,Energy Select Sector SPDR Fund,Energy,State Street Global Advisors,0.1
XLF,Financial Select Sector SPDR Fund,Financials,State Street Global Advisors,0.1
XLK,Technology Select Sector SPDR Fund,Technology,State Street Global Advisors,0.1
XLY,Consumer Discretionary Select Sector SPDR Fund,Consumer Discretionary,State Street Global Advisors,0.1


In [24]:
# Gather the scan settings in one mapping so the tunable thresholds are easy
# to find. The per-parameter notes are assumptions drawn from the names.
# NOTE(review): confirm them against PairScanConfig.
scan_settings = dict(
    config_path=CONFIG_PATH,
    price_path=PRICE_PATH,
    output_path=None,  # presumably: do not write results to disk
    list_name=None,  # presumably: use the default ticker list
    metadata_path=METADATA_PATH,
    lookback_days=252,
    min_obs=150,
    min_corr=0.85,
    max_pairs=50,
    engle_granger_maxlag=1,
)
pair_scan_cfg = PairScanConfig(**scan_settings)

# Run the scan and preview the first scored pairs.
pair_scores = run_pair_scan(pair_scan_cfg)
print(f"Scored {len(pair_scores)} pairs.")
pair_scores.head()

Scored 2 pairs.


Unnamed: 0,universe,leg_x,leg_y,correlation,n_obs,spread_mean,spread_std,hedge_ratio,coint_statistic,coint_pvalue,half_life
0,core_sectors,XLK,IYW,0.982243,251,47.708427,1.884293,1.165181,-3.076174,0.093189,3867.770476
1,core_sectors,XLF,VFH,0.985795,251,2.13944,0.169182,0.391577,-2.909433,0.133332,


In [None]:
# Tag each scored pair with the sectors of its two legs, bucket the pairs by
# whether the sectors match, and summarise correlation and cointegration
# p-values per bucket.
if pair_scores.empty:
    print("No qualifying pairs found; adjust thresholds and rerun.")
else:
    metadata_map = universe.metadata or {}

    def lookup_sector(ticker: str) -> str:
        """Return the sector recorded for ``ticker``, or "Unknown" if unavailable.

        "Unknown" is returned both when the ticker has no metadata entry and
        when the entry exists but its sector is empty.
        """
        entry = metadata_map.get(ticker)
        return entry.sector if entry and entry.sector else "Unknown"

    # Work on a copy so the frame returned by the scan is not modified.
    pair_scores = pair_scores.copy()
    pair_scores["sector_x"] = pair_scores["leg_x"].map(lookup_sector)
    pair_scores["sector_y"] = pair_scores["leg_y"].map(lookup_sector)

    # Vectorized bucketing. A pair with at least one unresolved sector cannot
    # be called same- or cross-sector, so it gets its own bucket. Without this,
    # two "Unknown" legs would compare equal and count as "Same Sector".
    sectors_match = pair_scores["sector_x"] == pair_scores["sector_y"]
    sector_missing = pair_scores[["sector_x", "sector_y"]].eq("Unknown").any(axis=1)
    pair_scores["pair_bucket"] = sectors_match.map(
        {True: "Same Sector", False: "Cross Sector"}
    ).mask(sector_missing, "Unknown Sector")

    corr_summary = (
        pair_scores.groupby("pair_bucket")["correlation"]
        .agg(["count", "mean", "min", "max"])
        .rename(columns={"count": "n_pairs"})
        .round(3)
    )
    display(corr_summary)

    pvalue_summary = (
        pair_scores.groupby("pair_bucket")["coint_pvalue"]
        .agg(["mean", "median"])
        .round(4)
    )
    display(pvalue_summary)

SyntaxError: invalid syntax (3423215424.py, line 15)

In [None]:
# Bar chart of how many scored pairs fall into each bucket.
if pair_scores.empty:
    print("Skipping count plot because no pairs were scored.")
else:
    count_df = (
        pair_scores["pair_bucket"]
        .value_counts()
        .rename_axis("pair_bucket")
        .reset_index(name="pairs")
    )
    fig, ax = plt.subplots(figsize=(6, 4))
    # Passing `palette` without `hue` is deprecated in recent seaborn, so the
    # x variable is also assigned to `hue`. `dodge=False` keeps the bars at
    # full width on versions that would otherwise dodge them.
    sns.barplot(
        data=count_df,
        x="pair_bucket",
        y="pairs",
        hue="pair_bucket",
        palette="viridis",
        dodge=False,
        ax=ax,
    )
    # The hue legend only repeats the x tick labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title("Pair counts by bucket")
    ax.set_xlabel("")
    ax.set_ylabel("# of pairs")
    plt.show()

In [None]:
# Box plot comparing the correlation distribution across pair buckets.
if pair_scores.empty:
    print("Skipping boxplot because no pairs were scored.")
else:
    fig, ax = plt.subplots(figsize=(7, 4))
    # Passing `palette` without `hue` is deprecated in recent seaborn, so the
    # x variable is also assigned to `hue`. `dodge=False` keeps each box
    # centred on its tick.
    sns.boxplot(
        data=pair_scores,
        x="pair_bucket",
        y="correlation",
        hue="pair_bucket",
        palette="pastel",
        dodge=False,
        ax=ax,
    )
    # The hue legend only repeats the x tick labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title("Correlation distribution by bucket")
    ax.set_xlabel("")
    ax.set_ylabel("Correlation")
    plt.show()

In [None]:
# Scatter of correlation against Engle–Granger p-value, coloured by bucket,
# with a dashed reference line at p = 0.05.
scatter_df = None
if pair_scores.empty:
    print("Skipping scatter plot because no pairs were scored.")
else:
    # Pairs without a p-value cannot be placed on the y-axis.
    scatter_df = pair_scores.dropna(subset=["coint_pvalue"])
    if scatter_df.empty:
        print("No valid Engle–Granger p-values available for scatter plot.")

if scatter_df is not None and not scatter_df.empty:
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.scatterplot(
        data=scatter_df,
        x="correlation",
        y="coint_pvalue",
        hue="pair_bucket",
        style="pair_bucket",
        s=80,
        ax=ax,
    )
    ax.axhline(0.05, color="red", linestyle="--", label="0.05 threshold")
    ax.set_title("Correlation vs Engle–Granger p-value")
    ax.set_xlabel("Correlation")
    ax.set_ylabel("Engle–Granger p-value")
    ax.set_xlim(0.75, 1.0)
    ax.legend(loc="upper right")
    plt.show()