In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Project layout: the notebook sits one level below the project root, and the
# cleaned per-ETF holdings files are read from data/processed.
PROJECT_ROOT = Path("..")
PROC_DIR = PROJECT_ROOT.joinpath("data", "processed")

# Let wide holdings tables render without being truncated or wrapped early.
pd.set_option("display.width", 140)
pd.set_option("display.max_columns", 200)

PROC_DIR


WindowsPath('../data/processed')

In [2]:
def load_clean(etf: str) -> pd.DataFrame:
    """Load the cleaned holdings table for one ETF from ``PROC_DIR``.

    The file name is ``<etf lower-cased>_clean`` and the parquet version is
    preferred; the CSV version is used only when no parquet file exists.

    Parameters
    ----------
    etf : str
        ETF symbol, in any letter case.

    Returns
    -------
    pd.DataFrame
        Whatever ``pd.read_parquet`` / ``pd.read_csv`` returns for the file.

    Raises
    ------
    FileNotFoundError
        If neither the parquet nor the CSV file exists.
    """
    stem = f"{etf.lower()}_clean"
    # Candidates in priority order, each paired with the reader for its format.
    candidates = (
        (PROC_DIR / f"{stem}.parquet", pd.read_parquet),
        (PROC_DIR / f"{stem}.csv", pd.read_csv),
    )

    for path, reader in candidates:
        if path.exists():
            return reader(path)

    raise FileNotFoundError(f"Missing clean file for {etf}")

# Load each fund's cleaned holdings (same order as before), keeping one frame
# per ETF because later cells use them individually.
spy, voo, qqq, schd = (
    load_clean(symbol) for symbol in ("SPY", "VOO", "QQQ", "SCHD")
)

# Stack the per-ETF tables into one long table with a fresh 0..n-1 index.
holdings = pd.concat([spy, voo, qqq, schd], ignore_index=True)
holdings.head()


Unnamed: 0,etf,holding_name,ticker,weight
0,SPY,NVIDIA CORP,NVDA,0.077702
1,SPY,APPLE INC,AAPL,0.066152
2,SPY,MICROSOFT CORP,MSFT,0.051086
3,SPY,AMAZON.COM INC,AMZN,0.033212
4,SPY,ALPHABET INC CL A,GOOGL,0.03075


In [3]:
# Holding-by-ETF weight matrix: one row per holding name, one column per ETF,
# with 0 where an ETF does not hold the name.
# NOTE(review): rows are keyed on `holding_name`, so the same issuer spelled
# differently by two providers would land on separate rows - confirm whether
# `ticker` would be the better key.
W = pd.pivot_table(
    holdings,
    index="holding_name",
    columns="etf",
    values="weight",
    aggfunc="sum",
    fill_value=0,
)

# Share of the portfolio allocated to each ETF.
portfolio_weights = {
    "SPY": 0.25,
    "VOO": 0.25,
    "QQQ": 0.25,
    "SCHD": 0.25,
}

# Look-through weight of every holding: allocation-weighted sum across ETFs.
port = pd.Series(0.0, index=W.index)
for etf, w in portfolio_weights.items():
    port = port.add(w * W[etf])

# Largest positions first; later cells rely on this ordering for top-N shares.
port = port.sort_values(ascending=False)

port.head(15)


holding_name
NVIDIA CORP                   0.060876
APPLE INC                     0.051288
MICROSOFT CORP                0.040921
AMAZON.COM INC                0.028553
BROADCOM INC                  0.020837
TESLA INC                     0.020361
META PLATFORMS INC            0.015825
CISCO SYSTEMS INC             0.015239
PEPSICO INC                   0.014927
ALPHABET INC                  0.014900
AMGEN INC                     0.013737
WALMART INC                   0.013606
CHEVRON CORP                  0.013526
LOCKHEED MARTIN CORP          0.012906
VERIZON COMMUNICATIONS INC    0.012467
dtype: float64

In [4]:
def hhi(weights: pd.Series) -> float:
    """Return the Herfindahl-Hirschman index: the sum of the squared weights."""
    squared = weights.pow(2)
    return float(squared.sum())

# Herfindahl-Hirschman index of the blended look-through portfolio in `port`.
hhi_value = hhi(port)

hhi_value


0.014165992686819026

In [5]:
# Reciprocal of the HHI: the number of equally weighted positions that would
# give the same HHI (N equal weights of 1/N have an HHI of exactly 1/N).
effective_holdings = 1 / hhi_value

effective_holdings


70.59159369258083

In [7]:
def concentration_metrics(df: pd.DataFrame, label: str) -> None:
    """Print concentration statistics for one holdings table.

    Prints a separator line, ``label``, the HHI of the weights (via ``hhi``),
    the effective number of holdings (``1 / HHI``) and the combined weight of
    the ten largest positions expressed as a percentage.

    Parameters
    ----------
    df : pd.DataFrame
        Holdings table; only its ``weight`` column is read.
    label : str
        Heading printed above the statistics.

    Returns
    -------
    None
        The statistics are printed, not returned.
    """
    w = df["weight"]

    h = hhi(w)
    # An empty or all-zero table has an HHI of 0. Report NaN for the effective
    # holdings in that case instead of raising ZeroDivisionError, matching the
    # guard used by summarize_concentration.
    eff_n = 1 / h if h > 0 else float("nan")

    top10_share = w.sort_values(ascending=False).head(10).sum()

    print("="*80)
    print(label)
    print("HHI:", round(h, 6))
    print("Effective Holdings:", round(eff_n, 2))
    print("Top 10 Weight Share:", round(top10_share*100, 2), "%")

# Print the same concentration report for each ETF, in a fixed order.
for label, frame in (("SPY", spy), ("VOO", voo), ("QQQ", qqq), ("SCHD", schd)):
    concentration_metrics(frame, label)



SPY
HHI: 0.019939
Effective Holdings: 50.15
Top 10 Weight Share: 37.02 %
VOO
HHI: 0.020744
Effective Holdings: 48.21
Top 10 Weight Share: 38.35 %
QQQ
HHI: 0.031281
Effective Holdings: 31.97
Top 10 Weight Share: 47.38 %
SCHD
HHI: 0.029259
Effective Holdings: 34.18
Top 10 Weight Share: 42.2 %


In [8]:
# `port` was sorted from largest to smallest weight when it was built, so
# head(n) is the n largest positions.
top10, top20, top50 = (port.head(n).sum() for n in (10, 20, 50))

# Headline concentration figures for the blended portfolio; shares are shown
# as percentages.
{
    "Portfolio HHI": round(hhi_value, 6),
    "Effective Holdings": round(effective_holdings, 2),
    "Top 10 Holdings Share (%)": round(100 * top10, 2),
    "Top 20 Holdings Share (%)": round(100 * top20, 2),
    "Top 50 Holdings Share (%)": round(100 * top50, 2),
}


{'Portfolio HHI': 0.014166,
 'Effective Holdings': 70.59,
 'Top 10 Holdings Share (%)': np.float64(28.37),
 'Top 20 Holdings Share (%)': np.float64(40.78),
 'Top 50 Holdings Share (%)': np.float64(60.61)}

In [11]:
import numpy as np
import pandas as pd
from pathlib import Path

def build_weight_matrix(holdings: pd.DataFrame) -> pd.DataFrame:
    """Pivot long-format holdings into a holding-by-ETF weight matrix.

    ``holding_name`` and ``etf`` are cast to string, stripped of surrounding
    whitespace and upper-cased before pivoting, so labels differing only in
    case or padding are merged. Weights sharing a (holding, ETF) pair are
    summed; missing pairs are filled with 0.0. The input frame is not
    modified.

    Parameters
    ----------
    holdings : pd.DataFrame
        Must contain ``holding_name``, ``etf`` and ``weight`` columns.

    Returns
    -------
    pd.DataFrame
        Rows indexed by normalised holding name, one column per normalised ETF.
    """
    def _normalise(column: pd.Series) -> pd.Series:
        return column.astype(str).str.strip().str.upper()

    # assign() works on a copy, leaving the caller's frame untouched.
    normalised = holdings.assign(
        holding_name=_normalise(holdings["holding_name"]),
        etf=_normalise(holdings["etf"]),
    )
    return pd.pivot_table(
        normalised,
        index="holding_name",
        columns="etf",
        values="weight",
        aggfunc="sum",
        fill_value=0.0,
    )

def to_long_matrix(mat: pd.DataFrame, value_name: str) -> pd.DataFrame:
    """Unpivot a square ETF-by-ETF matrix into long format.

    Parameters
    ----------
    mat : pd.DataFrame
        Matrix whose index labels become ``etf_a`` and whose column labels
        become ``etf_b``. It is not modified.
    value_name : str
        Name of the column holding the matrix cell values.

    Returns
    -------
    pd.DataFrame
        Columns ``etf_a``, ``etf_b`` and ``value_name``, one row per cell.
    """
    # Name the index first so reset_index() emits it as the "etf_a" column.
    with_row_label = mat.rename_axis(index="etf_a").reset_index()
    return with_row_label.melt(
        id_vars="etf_a",
        var_name="etf_b",
        value_name=value_name,
    )

def hhi(weights: pd.Series) -> float:
    """Return the Herfindahl-Hirschman index: the sum of the squared weights.

    Values that cannot be parsed as numbers, and missing values, count as 0.
    This definition replaces the simpler ``hhi`` defined earlier in the
    notebook once this cell has run.
    """
    numeric = pd.to_numeric(weights, errors="coerce")
    squares = numeric.fillna(0.0).pow(2)
    return float(squares.sum())

def summarize_concentration(weights: pd.Series) -> dict:
    """Summarise how concentrated a set of weights is.

    Non-numeric and missing entries are dropped before anything is computed.

    Parameters
    ----------
    weights : pd.Series
        Position weights.

    Returns
    -------
    dict
        ``hhi`` (sum of squared weights), ``effective_holdings`` (``1 / hhi``,
        or NaN when the HHI is not positive) and ``top10_share`` /
        ``top20_share`` / ``top50_share`` (sum of the 10 / 20 / 50 largest
        weights), all as Python floats.
    """
    numeric = pd.to_numeric(weights, errors="coerce").dropna()
    index_value = hhi(numeric)

    if index_value > 0:
        effective = 1 / index_value
    else:
        effective = np.nan

    # Sort once; every top-N share is a prefix sum of this ranking.
    ranked = numeric.sort_values(ascending=False)

    def head_share(n: int) -> float:
        return float(ranked.head(n).sum())

    return {
        "hhi": float(index_value),
        "effective_holdings": float(effective),
        "top10_share": head_share(10),
        "top20_share": head_share(20),
        "top50_share": head_share(50),
    }

def export_tableau_extracts(out_dir: Path, holdings_clean: pd.DataFrame, portfolio: dict | None = None) -> None:
    """Write holdings, overlap and concentration CSV extracts into ``out_dir``.

    Files written (``out_dir`` is created if missing):

    * ``holdings_clean_long.csv`` - ``holdings_clean`` exactly as passed in.
    * ``overlap_count_long.csv`` - for each ETF pair, the number of holdings
      with a positive weight in both.
    * ``overlap_pct_long.csv`` - that count divided by the number of
      positive-weight holdings in ``etf_a``.
    * ``overlap_weighted_long.csv`` - for each ETF pair, the sum over holdings
      of the smaller of the two weights.
    * ``concentration_metrics.csv`` - one ``summarize_concentration`` row per
      ETF found in ``holdings_clean`` (in order of first appearance), plus a
      ``CUSTOM_PORTFOLIO`` row when ``portfolio`` is given.
    * ``portfolio_holdings_top200.csv`` - only when ``portfolio`` is given:
      the 200 largest blended holding weights.

    Parameters
    ----------
    out_dir : Path
        Destination folder.
    holdings_clean : pd.DataFrame
        Long-format holdings with ``etf``, ``holding_name`` and ``weight``
        columns.
    portfolio : dict | None
        Optional mapping of ETF symbol to its allocation in the portfolio.
        Symbols are matched after stripping whitespace and upper-casing.

    Raises
    ------
    KeyError
        If ``portfolio`` names an ETF that has no column in the weight matrix
        built from ``holdings_clean``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # base holdings (mkdir above already guarantees the folder exists, so no
    # placeholder file has to be written first)
    holdings_clean.to_csv(out_dir / "holdings_clean_long.csv", index=False)

    # overlap
    W = build_weight_matrix(holdings_clean)
    present = (W > 0).astype(int)
    overlap_count = present.T @ present
    holding_counts = present.sum(axis=0)
    # Row `a` is divided by the number of positive-weight holdings in ETF `a`;
    # axis=0 aligns the counts with the row labels.
    overlap_pct = overlap_count.astype(float).div(holding_counts.astype(float), axis=0)

    weighted_overlap = pd.DataFrame(index=W.columns, columns=W.columns, dtype=float)
    for a in W.columns:
        for b in W.columns:
            weighted_overlap.loc[a, b] = np.minimum(W[a], W[b]).sum()

    to_long_matrix(weighted_overlap, "overlap_weighted").to_csv(out_dir / "overlap_weighted_long.csv", index=False)
    to_long_matrix(overlap_count, "overlap_count").to_csv(out_dir / "overlap_count_long.csv", index=False)
    to_long_matrix(overlap_pct, "overlap_pct_of_a").to_csv(out_dir / "overlap_pct_long.csv", index=False)

    # concentration metrics
    # Normalise labels exactly as build_weight_matrix does, and take the ETF
    # list from the data so every ETF in the overlap files also gets a row
    # here (and absent ETFs do not produce empty rows).
    etf_labels = holdings_clean["etf"].astype(str).str.strip().str.upper()
    rows = []
    for etf in etf_labels.drop_duplicates().tolist():
        w = holdings_clean.loc[etf_labels == etf, "weight"]
        s = summarize_concentration(w)
        rows.append({"entity_type": "ETF", "entity_name": etf, **s})

    # portfolio extracts
    if portfolio is not None:
        port = pd.Series(0.0, index=W.index)
        for etf, wt in portfolio.items():
            key = str(etf).strip().upper()
            if key not in W.columns:
                raise KeyError(
                    f"Portfolio ETF {etf!r} has no holdings in holdings_clean; "
                    f"available ETFs: {list(W.columns)}"
                )
            port += float(wt) * W[key]
        port = port.sort_values(ascending=False)

        s = summarize_concentration(port)
        rows.append({"entity_type": "PORTFOLIO", "entity_name": "CUSTOM_PORTFOLIO", **s})

        port_df = port.head(200).reset_index()
        port_df.columns = ["holding_name", "portfolio_weight"]
        port_df.to_csv(out_dir / "portfolio_holdings_top200.csv", index=False)

    pd.DataFrame(rows).to_csv(out_dir / "concentration_metrics.csv", index=False)


In [12]:
# Paths: PROJECT_ROOT and PROC_DIR repeat the values set in the first cell;
# OUT_DIR is where the extract CSVs are written.
PROJECT_ROOT = Path("..")
PROC_DIR = PROJECT_ROOT / "data" / "processed"
OUT_DIR = PROJECT_ROOT / "data" / "tableau_extracts"

# Re-read the cleaned parquet files, one frame per ETF, in a fixed order.
spy, voo, qqq, schd = (
    pd.read_parquet(PROC_DIR / filename)
    for filename in (
        "spy_clean.parquet",
        "voo_clean.parquet",
        "qqq_clean.parquet",
        "schd_clean.parquet",
    )
)

holdings_clean = pd.concat([spy, voo, qqq, schd], ignore_index=True)

# Equal-weight blend of the four ETFs.
export_tableau_extracts(
    OUT_DIR,
    holdings_clean,
    portfolio={"SPY": 0.25, "VOO": 0.25, "QQQ": 0.25, "SCHD": 0.25},
)

# List the CSV files now present in the output folder.
sorted(p.name for p in OUT_DIR.glob("*.csv"))


['concentration_metrics.csv',
 'holdings_clean_long.csv',
 'overlap_count_long.csv',
 'overlap_pct_long.csv',
 'overlap_weighted_long.csv',
 'portfolio_holdings_top200.csv']