In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import re

# Widen pandas' display limits: up to 200 columns, 140 characters per line.
for option_name, option_value in {
    "display.max_columns": 200,
    "display.width": 140,
}.items():
    pd.set_option(option_name, option_value)

# Project layout, expressed relative to the current working directory.
PROJECT_ROOT = Path("..")
RAW_DIR = PROJECT_ROOT.joinpath("data", "raw")
PROC_DIR = PROJECT_ROOT.joinpath("data", "processed")

# Create the output folder up front; nothing happens if it already exists.
PROC_DIR.mkdir(parents=True, exist_ok=True)

RAW_DIR, PROC_DIR


(WindowsPath('../data/raw'), WindowsPath('../data/processed'))

In [2]:
def read_csv_safe(path: Path) -> pd.DataFrame:
    """Read a CSV file, trying several text encodings in turn.

    Encodings are attempted in the order utf-8, utf-8-sig, cp1252, latin1 and
    the first successful parse is returned.

    Parameters
    ----------
    path : Path
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed file, exactly as ``pd.read_csv`` returns it.

    Raises
    ------
    ValueError
        If the file cannot be read. The underlying error is attached as
        ``__cause__`` so the real reason (missing file, parse error, ...)
        shows up in the traceback instead of being discarded.
    """
    last_error = None
    for enc in ("utf-8", "utf-8-sig", "cp1252", "latin1"):
        try:
            return pd.read_csv(path, encoding=enc)
        except OSError as exc:
            # Missing or unreadable file: no other encoding can fix that,
            # so stop retrying and report it right away.
            raise ValueError(f"Could not read CSV: {path}") from exc
        except Exception as exc:
            # Typically a decode failure for this encoding; remember it and
            # move on to the next candidate.
            last_error = exc
    raise ValueError(f"Could not read CSV: {path}") from last_error

def read_excel_safe(path: Path) -> pd.DataFrame:
    """Load an Excel workbook through ``pd.read_excel`` with pandas' default options."""
    workbook_frame = pd.read_excel(path)
    return workbook_frame

# Raw holdings export for each fund, keyed by the fund's ticker.
paths = dict(
    VOO=RAW_DIR / "Holdings_details_S&P_500_ETF.csv",
    SPY=RAW_DIR / "holdings-daily-us-en-spy.xlsx",
    QQQ=RAW_DIR / "QQQ Daily.xlsx",
    SCHD=RAW_DIR / "SCHD Daily.xlsx",
)

# Stop early, listing what the raw folder actually contains, if any export is absent.
missing = [etf for etf, raw_path in paths.items() if not raw_path.exists()]
if missing:
    raise FileNotFoundError(f"Missing raw files for: {missing}\nFound: {sorted([p.name for p in RAW_DIR.glob('*')])}")

# VOO ships as CSV; the other three are Excel workbooks.
voo_raw = read_csv_safe(paths["VOO"])
spy_raw, qqq_raw, schd_raw = (
    read_excel_safe(paths[etf]) for etf in ("SPY", "QQQ", "SCHD")
)

(voo_raw.shape, spy_raw.shape, qqq_raw.shape, schd_raw.shape)



((530, 10), (511, 8), (105, 2), (101, 6))

In [3]:
def clean_text(x):
    """Normalise the whitespace of a single cell value.

    Missing values (anything ``pd.isna`` flags) come back as ``np.nan``.
    Everything else is converted to ``str``, trimmed at both ends, and each
    run of whitespace inside it is collapsed to one space.
    """
    if pd.isna(x):
        return np.nan
    # Splitting on whitespace and re-joining trims and collapses in one pass.
    return " ".join(str(x).split())

def to_decimal_weight(series: pd.Series) -> pd.Series:
    """
    Converts weights to decimal fractions.
    Handles:
      - "7.83%" -> 0.0783
      - "0.0783" -> 0.0783
      - 7.83 (percent units) -> 0.0783 (if values look like percent scale)
      - 0.0472 -> 0.0472

    Parameters
    ----------
    series : pd.Series
        Weights as numbers and/or strings; strings may carry a percent sign
        and thousands separators.

    Returns
    -------
    pd.Series
        float64 fractions on the same index. Values that cannot be parsed
        become NaN.

    Notes
    -----
    Values without a percent sign are treated as percent units when the
    largest of them exceeds 1, since a single holding cannot weigh more than
    100% of a fund. The median is not a usable signal here: in a broad ETF
    most holdings are far below 1% even when the column is in percent units.
    A percent-unit column whose every value is <= 1 cannot be told apart from
    fractions and is returned unchanged.
    """
    text = series.astype(str).str.strip()
    has_pct = text.str.contains("%", regex=False, na=False)

    # Strip the decorations, then parse. Forcing float64 keeps the division
    # below valid even when every value parsed as a whole number.
    numeric = pd.to_numeric(
        text.str.replace("%", "", regex=False)
            .str.replace(",", "", regex=False)
            .str.strip(),
        errors="coerce",
    ).astype("float64")

    # Rows with a percent sign are unambiguous, so the scale heuristic looks
    # only at the remaining rows. Including rows that are already going to be
    # divided could lead to dividing them twice.
    bare = numeric[~has_pct]
    # max() of an empty or all-NaN selection is NaN, and NaN > 1 is False.
    percent_scale = bool(bare.max() > 1)

    needs_division = has_pct | percent_scale
    return numeric.where(~needs_division, numeric / 100.0)

def standard_schema(df: pd.DataFrame, etf: str) -> pd.DataFrame:
    """Tag a holdings table with its fund and reduce it to the shared columns.

    Returns a new frame with exactly the columns ``etf``, ``holding_name``,
    ``ticker`` and ``weight``, in that order. The caller's frame is not
    modified.
    """
    output_columns = ["etf", "holding_name", "ticker", "weight"]
    tagged = df.assign(etf=etf)
    return tagged[output_columns]


In [4]:
# The SPY workbook opens with a few preamble rows; row index 3 contains the
# real header: Name, Ticker, Identifier, SEDOL, Weight...
SPY_HEADER_ROW = 3

# Take the header as a plain list. Assigning the row Series itself would leave
# its label (3) behind as the name of the columns axis, and that stray "3"
# then appears in the corner of every displayed table.
spy_header = spy_raw.iloc[SPY_HEADER_ROW].astype(str).tolist()
spy = spy_raw.iloc[SPY_HEADER_ROW + 1:].reset_index(drop=True)
spy.columns = spy_header

# Fail with a readable message if the export layout ever changes.
spy_missing_cols = {"Name", "Ticker", "Weight"} - set(spy.columns)
if spy_missing_cols:
    raise KeyError(
        f"SPY header row {SPY_HEADER_ROW} is missing expected columns: {sorted(spy_missing_cols)}"
    )

spy = spy.rename(columns={
    "Name": "holding_name",
    "Ticker": "ticker",
    "Weight": "weight"
})

# Keep only the 3 fields we care about. The explicit copy makes the column
# assignments below operate on an independent frame.
spy = spy[["holding_name", "ticker", "weight"]].copy()

# Clean text + weight (the weight column is divided by 100 to get fractions)
spy["holding_name"] = spy["holding_name"].map(clean_text).str.upper()
spy["ticker"] = spy["ticker"].map(clean_text).str.upper()
spy["weight"] = pd.to_numeric(spy["weight"], errors="coerce") / 100


# Drop non-holding rows
spy = spy.dropna(subset=["holding_name", "ticker", "weight"])
spy = spy[spy["weight"] > 0]

spy_clean = standard_schema(spy, "SPY")

spy_clean.head(), spy_clean["weight"].sum()


(3  etf       holding_name ticker    weight
 0  SPY        NVIDIA CORP   NVDA  0.077702
 1  SPY          APPLE INC   AAPL  0.066152
 2  SPY     MICROSOFT CORP   MSFT  0.051086
 3  SPY     AMAZON.COM INC   AMZN  0.033212
 4  SPY  ALPHABET INC CL A  GOOGL  0.030750,
 np.float64(0.99965068))

In [5]:
# Map the VOO export's headers onto the shared column names and keep just those three.
voo = voo_raw.rename(
    columns={
        "HOLDINGS": "holding_name",
        "TICKER": "ticker",
        "% OF FUNDS*": "weight",
    }
).loc[:, ["holding_name", "ticker", "weight"]]

# Upper-case, whitespace-normalised text; weights converted to fractions.
voo = voo.assign(
    holding_name=voo["holding_name"].map(clean_text).str.upper(),
    ticker=voo["ticker"].map(clean_text).str.upper(),
    weight=to_decimal_weight(voo["weight"]),
)

# Drop footer/blank rows, then anything without a positive weight.
voo = voo.dropna(subset=["holding_name", "ticker", "weight"])
voo = voo.loc[voo["weight"] > 0]

voo_clean = standard_schema(voo, "VOO")

voo_clean.head(), voo_clean["weight"].sum()


(   etf    holding_name ticker  weight
 0  VOO     NVIDIA CORP   NVDA  0.0783
 1  VOO       APPLE INC   AAPL  0.0646
 2  VOO  MICROSOFT CORP   MSFT  0.0539
 3  VOO  AMAZON.COM INC   AMZN  0.0392
 4  VOO    ALPHABET INC  GOOGL  0.0331,
 np.float64(0.9973))

In [6]:
# Map the QQQ export's headers onto the shared column names.
# QQQ file may not include ticker, so we keep ticker as NaN for now.
qqq = (
    qqq_raw
    .rename(columns={"Company": "holding_name", "Allocation": "weight"})
    .assign(ticker=np.nan)
    .loc[:, ["holding_name", "ticker", "weight"]]
)

# Upper-case, whitespace-normalised names; weights converted to fractions.
qqq = qqq.assign(
    holding_name=qqq["holding_name"].map(clean_text).str.upper(),
    weight=to_decimal_weight(qqq["weight"]),
)

# Ticker is deliberately left out of the null check: it is NaN on every row.
qqq = qqq.dropna(subset=["holding_name", "weight"])
qqq = qqq.loc[qqq["weight"] > 0]

qqq_clean = standard_schema(qqq, "QQQ")

qqq_clean.head(), qqq_clean["weight"].sum()


(   etf    holding_name  ticker  weight
 0  QQQ     NVIDIA CORP     NaN  0.0875
 1  QQQ       APPLE INC     NaN  0.0744
 2  QQQ  MICROSOFT CORP     NaN  0.0587
 3  QQQ  AMAZON.COM INC     NaN  0.0418
 4  QQQ       TESLA INC     NaN  0.0410,
 np.float64(0.9996999999999999))

In [7]:
# Map the SCHD export's headers onto the shared column names and keep just those three.
schd = schd_raw.rename(
    columns={
        "Fund Name": "holding_name",
        "Symbol": "ticker",
        "% of Assets": "weight",
    }
).loc[:, ["holding_name", "ticker", "weight"]]

# Upper-case, whitespace-normalised text; weights parsed as numbers without rescaling.
schd = schd.assign(
    holding_name=schd["holding_name"].map(clean_text).str.upper(),
    ticker=schd["ticker"].map(clean_text).str.upper(),
    weight=pd.to_numeric(schd["weight"], errors="coerce"),
)

# Remove incomplete rows, then anything without a positive weight.
schd = schd.dropna(subset=["holding_name", "ticker", "weight"])
schd = schd.loc[schd["weight"] > 0]

# SCHD weights already sum ~1.0; keep as-is
schd_clean = standard_schema(schd, "SCHD")

schd_clean.head(), schd_clean["weight"].sum()


(    etf                holding_name ticker  weight
 0  SCHD        LOCKHEED MARTIN CORP    LMT  0.0472
 1  SCHD        TEXAS INSTRUMENT INC    TXN  0.0433
 2  SCHD  VERIZON COMMUNICATIONS INC     VZ  0.0431
 3  SCHD              CONOCOPHILLIPS    COP  0.0428
 4  SCHD                CHEVRON CORP    CVX  0.0426,
 np.float64(0.9990999999999999))

In [8]:
def validate(df, label):
    """Print a short sanity report for one holdings table and show its first rows.

    The report lists the row count, the total, smallest and largest ``weight``,
    and the percentage of rows whose ``ticker`` is missing; ``df.head(5)`` is
    then passed to ``display``.
    """
    print("=" * 80)
    print(f"{label} | rows: {len(df)}")
    print(f"weight sum: {float(df['weight'].sum())}")
    print(f"min/max weight: {float(df['weight'].min())} {float(df['weight'].max())}")
    print(f"missing ticker %: {float(df['ticker'].isna().mean()) * 100}")
    display(df.head(5))

# Run the sanity report on each cleaned table.
for report_frame, report_label in (
    (spy_clean, "SPY CLEAN"),
    (voo_clean, "VOO CLEAN"),
    (qqq_clean, "QQQ CLEAN"),
    (schd_clean, "SCHD CLEAN"),
):
    validate(report_frame, report_label)


SPY CLEAN | rows: 504
weight sum: 0.99965068
min/max weight: 5.3299999999999995e-05 0.07770234999999999
missing ticker %: 0.0


3,etf,holding_name,ticker,weight
0,SPY,NVIDIA CORP,NVDA,0.077702
1,SPY,APPLE INC,AAPL,0.066152
2,SPY,MICROSOFT CORP,MSFT,0.051086
3,SPY,AMAZON.COM INC,AMZN,0.033212
4,SPY,ALPHABET INC CL A,GOOGL,0.03075


VOO CLEAN | rows: 498
weight sum: 0.9973
min/max weight: 0.0001 0.0783
missing ticker %: 0.0


Unnamed: 0,etf,holding_name,ticker,weight
0,VOO,NVIDIA CORP,NVDA,0.0783
1,VOO,APPLE INC,AAPL,0.0646
2,VOO,MICROSOFT CORP,MSFT,0.0539
3,VOO,AMAZON.COM INC,AMZN,0.0392
4,VOO,ALPHABET INC,GOOGL,0.0331


QQQ CLEAN | rows: 103
weight sum: 0.9996999999999999
min/max weight: 0.0003 0.0875
missing ticker %: 100.0


Unnamed: 0,etf,holding_name,ticker,weight
0,QQQ,NVIDIA CORP,,0.0875
1,QQQ,APPLE INC,,0.0744
2,QQQ,MICROSOFT CORP,,0.0587
3,QQQ,AMAZON.COM INC,,0.0418
4,QQQ,TESLA INC,,0.041


SCHD CLEAN | rows: 99
weight sum: 0.9990999999999999
min/max weight: 0.0001 0.0472
missing ticker %: 0.0


Unnamed: 0,etf,holding_name,ticker,weight
0,SCHD,LOCKHEED MARTIN CORP,LMT,0.0472
1,SCHD,TEXAS INSTRUMENT INC,TXN,0.0433
2,SCHD,VERIZON COMMUNICATIONS INC,VZ,0.0431
3,SCHD,CONOCOPHILLIPS,COP,0.0428
4,SCHD,CHEVRON CORP,CVX,0.0426


In [9]:
import sys, subprocess, importlib

def ensure(pkg: str, import_name: "str | None" = None):
    """Make sure a package is importable, installing it with pip if it is not.

    Parameters
    ----------
    pkg : str
        Name handed to ``pip install``.
    import_name : str, optional
        Module name to probe with ``importlib.import_module``. Defaults to
        ``pkg``; pass it when the distribution and module names differ.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pip install`` command exits with a non-zero status.
    """
    module_name = pkg if import_name is None else import_name
    try:
        importlib.import_module(module_name)
        print(f"âœ… {pkg} already installed")
    except ImportError:
        print(f"ðŸ“¦ Installing {pkg}...")
        # Run pip through the current interpreter so the package lands in the
        # environment this process is using.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        # The import system caches directory contents; refresh it so a package
        # installed a moment ago can be found by this running process.
        importlib.invalidate_caches()
        print(f"âœ… Installed {pkg}")

# Check that pyarrow can be imported (installing it if not) before the
# to_parquet calls later in this notebook.
ensure("pyarrow")




âœ… pyarrow already installed


In [10]:
# Each cleaned table with the parquet and CSV file names it is saved under.
clean_exports = (
    (spy_clean, "spy_clean.parquet", "spy_clean.csv"),
    (voo_clean, "voo_clean.parquet", "voo_clean.csv"),
    (qqq_clean, "qqq_clean.parquet", "qqq_clean.csv"),
    (schd_clean, "schd_clean.parquet", "schd_clean.csv"),
)

for cleaned, parquet_name, _ in clean_exports:
    cleaned.to_parquet(PROC_DIR / parquet_name, index=False)

# Also save CSV for easy viewing
for cleaned, _, csv_name in clean_exports:
    cleaned.to_csv(PROC_DIR / csv_name, index=False)

sorted([p.name for p in PROC_DIR.glob("*clean*")])

['qqq_clean.csv',
 'qqq_clean.parquet',
 'schd_clean.csv',
 'schd_clean.parquet',
 'spy_clean.csv',
 'spy_clean.parquet',
 'voo_clean.csv',
 'voo_clean.parquet']