In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

PROJECT_ROOT = Path("..")  # notebooks/ is one level down
RAW_DIR = PROJECT_ROOT / "data" / "raw"
PROC_DIR = PROJECT_ROOT / "data" / "processed"
PROC_DIR.mkdir(parents=True, exist_ok=True)

RAW_DIR, PROC_DIR


(WindowsPath('../data/raw'), WindowsPath('../data/processed'))

In [2]:
raw_files = sorted([p.name for p in RAW_DIR.glob("*")])
raw_files


['Holdings_details_S&P_500_ETF.csv',
 'QQQ Daily.xlsx',
 'SCHD Daily.xlsx',
 'holdings-daily-us-en-spy.xlsx']

In [3]:
def read_csv_safe(path: Path) -> pd.DataFrame:
    # Try common encodings; fall back gracefully
    for enc in ("utf-8", "utf-8-sig", "cp1252"):
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
    # last resort
    return pd.read_csv(path, encoding="latin1")

def read_excel_safe(path: Path) -> pd.DataFrame:
    # Read first sheet by default
    return pd.read_excel(path)

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = (
        df.columns.astype(str)
        .str.strip()
        .str.replace("\n", " ", regex=False)
        .str.replace("\r", " ", regex=False)
    )
    return df

def quick_profile(df: pd.DataFrame, name: str, n=8):
    print("="*90)
    print(f"{name} | shape={df.shape}")
    print("- Columns (first 40) -")
    cols = list(df.columns)
    print(cols[:40], "..." if len(cols) > 40 else "")
    print("- Head -")
    display(df.head(n))
    print("- Dtypes -")
    display(df.dtypes.value_counts().to_frame("count"))
    print("- Missing (top 15) -")
    miss = (df.isna().mean().sort_values(ascending=False).head(15) * 100).round(2)
    display(miss.to_frame("% missing"))


In [9]:
!pip install openpyxl
import sys, subprocess, importlib

def ensure(pkg: str):
    try:
        importlib.import_module(pkg)
        print(f"âœ… {pkg} already installed in kernel: {sys.executable}")
    except ImportError:
        print(f"ðŸ“¦ Installing {pkg} into kernel: {sys.executable}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        importlib.invalidate_caches()
        importlib.import_module(pkg)
        print(f"âœ… Installed {pkg} successfully.")

ensure("openpyxl")






paths = {
    "VOO": RAW_DIR / "Holdings_details_S&P_500_ETF.csv",
    "SPY": RAW_DIR / "holdings-daily-us-en-spy.xlsx",
    "QQQ": RAW_DIR / "QQQ Daily.xlsx",
    "SCHD": RAW_DIR / "SCHD Daily.xlsx",
}

missing = [k for k,v in paths.items() if not v.exists()]
if missing:
    raise FileNotFoundError(f"Missing raw files for: {missing}\nFound: {sorted([p.name for p in RAW_DIR.glob('*')])}")

voo_raw = standardize_columns(read_csv_safe(paths["VOO"]))
spy_raw = standardize_columns(read_excel_safe(paths["SPY"]))
qqq_raw = standardize_columns(read_excel_safe(paths["QQQ"]))
schd_raw = standardize_columns(read_excel_safe(paths["SCHD"]))

quick_profile(voo_raw, "VOO (S&P 500 CSV)")
quick_profile(spy_raw, "SPY (Excel)")
quick_profile(qqq_raw, "QQQ (Excel)")
quick_profile(schd_raw, "SCHD (Excel)")


'pip' is not recognized as an internal or external command,
operable program or batch file.


ðŸ“¦ Installing openpyxl into kernel: c:\Users\Kyle\AppData\Local\Python\pythoncore-3.14-64\python.exe
âœ… Installed openpyxl successfully.
VOO (S&P 500 CSV) | shape=(530, 10)
- Columns (first 40) -
['Unnamed: 0', 'SEDOL', 'HOLDINGS', 'TICKER', '% OF FUNDS*', 'SUB-INDUSTRY', 'COUNTRY', 'SECURITYDEPOSITORYRECEIPTTYPE', 'MARKET VALUE*', 'SHARES'] 
- Head -


Unnamed: 0.1,Unnamed: 0,SEDOL,HOLDINGS,TICKER,% OF FUNDS*,SUB-INDUSTRY,COUNTRY,SECURITYDEPOSITORYRECEIPTTYPE,MARKET VALUE*,SHARES
0,,2379504,NVIDIA Corp,NVDA,7.83%,Semiconductors,US,---,"$118,321,417,727.43",619062511
1,,2046251,Apple Inc,AAPL,6.46%,"Technology Hardware, Storage & Peripherals",US,---,"$97,679,279,141.60",376442420
2,,2588173,Microsoft Corp,MSFT,5.39%,Systems Software,US,---,"$81,473,550,495.75",189345675
3,,2000019,Amazon.com Inc,AMZN,3.92%,Broadline Retail,US,---,"$59,306,487,392.30",247833211
4,,BYVY8G0,Alphabet Inc,GOOGL,3.31%,Interactive Media & Services,US,---,"$50,098,813,934.00",148221343
5,,BYY88Y7,Alphabet Inc,GOOG,2.65%,Interactive Media & Services,US,---,"$40,102,912,084.95",118461915
6,,BDZ78H9,Broadcom Inc,AVGO,2.64%,Semiconductors,US,---,"$39,857,650,459.90",120306823
7,,B7TL820,Meta Platforms Inc,META,2.63%,Interactive Media & Services,US,---,"$39,754,045,972.50",55483665


- Dtypes -


Unnamed: 0,count
str,10


- Missing (top 15) -


Unnamed: 0,% missing
Unnamed: 0,99.81
SHARES,4.91
SEDOL,4.72
HOLDINGS,4.72
% OF FUNDS*,4.72
TICKER,4.72
SUB-INDUSTRY,4.72
COUNTRY,4.72
SECURITYDEPOSITORYRECEIPTTYPE,4.72
MARKET VALUE*,4.72


SPY (Excel) | shape=(511, 8)
- Columns (first 40) -
['Fund Name:', 'State StreetÂ® SPDRÂ® S&P 500Â® ETF Trust', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7'] 
- Head -


Unnamed: 0,Fund Name:,State StreetÂ® SPDRÂ® S&P 500Â® ETF Trust,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,Ticker Symbol:,SPY,,,,,,
1,Holdings:,As of 12-Feb-2026,,,,,,
2,,,,,,,,
3,Name,Ticker,Identifier,SEDOL,Weight,Sector,Shares Held,Local Currency
4,NVIDIA CORP,NVDA,67066G104,2379504,7.770235,-,291810271,USD
5,APPLE INC,AAPL,037833100,2046251,6.615229,-,177443477,USD
6,MICROSOFT CORP,MSFT,594918104,2588173,5.108598,-,89251854,USD
7,AMAZON.COM INC,AMZN,023135106,2000019,3.321238,-,116817563,USD


- Dtypes -


Unnamed: 0,count
str,6
object,2


- Missing (top 15) -


Unnamed: 0,% missing
Unnamed: 3,1.17
Unnamed: 2,1.17
Unnamed: 5,1.17
Unnamed: 4,1.17
Unnamed: 6,1.17
Unnamed: 7,1.17
State StreetÂ® SPDRÂ® S&P 500Â® ETF Trust,0.78
Fund Name:,0.59


QQQ (Excel) | shape=(105, 2)
- Columns (first 40) -
['Company', 'Allocation'] 
- Head -


Unnamed: 0,Company,Allocation
0,NVIDIA Corp,8.75%
1,Apple Inc,7.44%
2,Microsoft Corp,5.87%
3,Amazon.com Inc,4.18%
4,Tesla Inc,4.10%
5,Meta Platforms Inc,3.70%
6,Walmart Inc,3.56%
7,Alphabet Inc Class A,3.50%


- Dtypes -


Unnamed: 0,count
str,2


- Missing (top 15) -


Unnamed: 0,% missing
Company,0.0
Allocation,0.0


SCHD (Excel) | shape=(101, 6)
- Columns (first 40) -
['Fund Name', 'CUSIP', 'Symbol', 'Quantity', '% of Assets', 'Market Value'] 
- Head -


Unnamed: 0,Fund Name,CUSIP,Symbol,Quantity,% of Assets,Market Value
0,LOCKHEED MARTIN CORP,539830109,LMT,6206924,0.0472,$4.0B
1,TEXAS INSTRUMENT INC,882508104,TXN,16252085,0.0433,$3.6B
2,VERIZON COMMUNICATIONS INC,92343V104,VZ,72968893,0.0431,$3.6B
3,CONOCOPHILLIPS,20825C104,COP,32323745,0.0428,$3.6B
4,CHEVRON CORP,166764100,CVX,19576790,0.0426,$3.6B
5,BRISTOL MYERS SQUIBB,110122108,BMY,58234735,0.0416,$3.5B
6,MERCK & CO INC,58933Y105,MRK,29017706,0.0413,$3.5B
7,ALTRIA GROUP INC,02209S103,MO,50807066,0.0406,$3.4B


- Dtypes -


Unnamed: 0,count
str,2
object,2
int64,1
float64,1


- Missing (top 15) -


Unnamed: 0,% missing
Fund Name,0.0
CUSIP,0.0
Symbol,0.0
Quantity,0.0
% of Assets,0.0
Market Value,0.0


In [10]:
NAME_HINTS = ["name", "security", "holding", "company", "issuer", "description", "ticker", "symbol"]
WEIGHT_HINTS = ["weight", "%", "percent", "portfolio", "market value", "notional"]

def find_candidates(df: pd.DataFrame, hints):
    cols = df.columns.astype(str)
    hits = []
    for c in cols:
        cl = c.lower()
        if any(h in cl for h in hints):
            hits.append(c)
    return hits

for label, df in [("VOO", voo_raw), ("SPY", spy_raw), ("QQQ", qqq_raw), ("SCHD", schd_raw)]:
    print("\n" + "="*90)
    print(label)
    print("Name-like candidates:", find_candidates(df, NAME_HINTS)[:25])
    print("Weight-like candidates:", find_candidates(df, WEIGHT_HINTS)[:25])



VOO
Name-like candidates: ['Unnamed: 0', 'HOLDINGS', 'TICKER', 'SECURITYDEPOSITORYRECEIPTTYPE']
Weight-like candidates: ['% OF FUNDS*', 'MARKET VALUE*']

SPY
Name-like candidates: ['Fund Name:', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7']
Weight-like candidates: []

QQQ
Name-like candidates: ['Company']
Weight-like candidates: []

SCHD
Name-like candidates: ['Fund Name', 'Symbol']
Weight-like candidates: ['% of Assets', 'Market Value']


In [11]:
def inspect_text_field(df: pd.DataFrame, col: str, label: str):
    s = df[col].astype(str)
    print(f"\n{label} | {col}")
    print("Sample values:")
    display(s.dropna().head(12).to_frame(col))
    print("Unique count:", s.nunique(dropna=True))

# Pick the best candidate column(s) after looking at Cell 5 output.
# Start with the most obvious ones; change these if needed.
guess = {
    "VOO": None,
    "SPY": None,
    "QQQ": None,
    "SCHD": None,
}

# Auto-pick first "good" candidate (you can override manually)
def auto_pick_name(df):
    cands = find_candidates(df, NAME_HINTS)
    # prefer ones that look like a holding name/description over ticker
    preferred = [c for c in cands if "ticker" not in c.lower() and "symbol" not in c.lower()]
    return preferred[0] if preferred else (cands[0] if cands else None)

guess["VOO"] = auto_pick_name(voo_raw)
guess["SPY"] = auto_pick_name(spy_raw)
guess["QQQ"] = auto_pick_name(qqq_raw)
guess["SCHD"] = auto_pick_name(schd_raw)

guess


{'VOO': 'Unnamed: 0',
 'SPY': 'Fund Name:',
 'QQQ': 'Company',
 'SCHD': 'Fund Name'}

In [12]:
def auto_pick_weight(df):
    cands = find_candidates(df, WEIGHT_HINTS)
    # prefer explicit weight columns
    preferred = [c for c in cands if "weight" in c.lower()]
    return preferred[0] if preferred else (cands[0] if cands else None)

w_guess = {
    "VOO": auto_pick_weight(voo_raw),
    "SPY": auto_pick_weight(spy_raw),
    "QQQ": auto_pick_weight(qqq_raw),
    "SCHD": auto_pick_weight(schd_raw),
}
w_guess


{'VOO': '% OF FUNDS*', 'SPY': None, 'QQQ': None, 'SCHD': '% of Assets'}

In [13]:
def inspect_weight(df: pd.DataFrame, col: str, label: str):
    x = pd.to_numeric(df[col], errors="coerce")
    print(f"\n{label} | {col}")
    print("count:", x.notna().sum(), "missing:", x.isna().sum())
    print("min/median/max:", np.nanmin(x), np.nanmedian(x), np.nanmax(x))
    print("sum (raw):", np.nansum(x))
    display(x.describe().to_frame().T)

for label, df in [("VOO", voo_raw), ("SPY", spy_raw), ("QQQ", qqq_raw), ("SCHD", schd_raw)]:
    col = w_guess[label]
    if col:
        inspect_weight(df, col, label)
    else:
        print(f"\n{label}: No obvious weight column found.")



VOO | % OF FUNDS*
count: 0 missing: 530
min/median/max: nan nan nan
sum (raw): 0.0


  print("min/median/max:", np.nanmin(x), np.nanmedian(x), np.nanmax(x))
  print("min/median/max:", np.nanmin(x), np.nanmedian(x), np.nanmax(x))


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
% OF FUNDS*,0.0,,,,,,,



SPY: No obvious weight column found.

QQQ: No obvious weight column found.

SCHD | % of Assets
count: 101 missing: 0
min/median/max: 0.0 0.0032 0.0472
sum (raw): 0.9990999999999999


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
% of Assets,101.0,0.009892,0.01392,0.0,0.0007,0.0032,0.0122,0.0472
