In [1]:
import re
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup


# ============================================================
# CONFIG (Required)
# ============================================================
# Sent as the User-Agent header on every request; SECClient rejects any value
# that does not contain an "@".
USER_AGENT = "your.email@gmail.com"  # <-- set this
# Minimum spacing (seconds) that SECClient enforces between consecutive requests.
MIN_SECONDS_BETWEEN_REQUESTS = 0.22  # conservative (SEC guideline is <= 10 req/sec)
# Root folder for exported workbooks; interactive() creates one sub-folder per ticker.
BASE_OUTPUT_DIR = Path(r"C:\Users\reigh\Desktop\Fin Statements")


# ============================================================
# DATA STRUCTURES
# ============================================================
@dataclass
class FilingRef:
    """Identifies a single EDGAR filing and derives its archive URLs."""

    cik: int
    company: str
    ticker: str
    form: str
    filing_date: str
    accession_number: str
    primary_document: Optional[str]
    report_date: Optional[str]

    @property
    def cik_padded(self) -> str:
        """CIK rendered as a zero-padded, 10-character string."""
        digits = str(self.cik)
        return digits.zfill(10)

    @property
    def accession_no_dashes(self) -> str:
        """Accession number with every '-' removed (used as the archive folder name)."""
        return "".join(self.accession_number.split("-"))

    @property
    def archive_dir(self) -> str:
        """URL of the filing's archive folder, with a trailing slash."""
        root = "https://www.sec.gov/Archives/edgar/data"
        folder = self.accession_no_dashes
        return f"{root}/{self.cik}/{folder}/"

    @property
    def index_json_url(self) -> str:
        """URL of the folder's index.json listing."""
        return f"{self.archive_dir}index.json"

    @property
    def full_submission_txt_url(self) -> str:
        """URL of the '<accession-number>.txt' file inside the archive folder."""
        return f"{self.archive_dir}{self.accession_number}.txt"

    def primary_doc_url(self) -> Optional[str]:
        """Return the primary document URL, or None when no primary document is set."""
        document = self.primary_document
        return (self.archive_dir + document) if document else None


# ============================================================
# SEC CLIENT (rate-limited + retries)
# ============================================================
class SECClient:
    """Rate-limited HTTP client for SEC endpoints with retry and backoff.

    Every request is spaced at least ``min_interval_s`` seconds after the
    previous one. Responses with a transient status (429/5xx) and transient
    network failures (connection errors, timeouts) are retried with
    exponential backoff capped at 16 seconds.
    """

    # HTTP statuses treated as transient and therefore retried.
    _RETRYABLE_STATUS = (429, 500, 502, 503, 504)

    def __init__(self, user_agent: str, min_interval_s: float = 0.22):
        """Create the session.

        Raises:
            ValueError: if ``user_agent`` is empty or contains no "@".
        """
        if not user_agent or "@" not in user_agent:
            raise ValueError("Set USER_AGENT to something like 'Name (email@domain.com) - purpose'.")
        self.s = requests.Session()
        self.s.headers.update({
            "User-Agent": user_agent,
            "Accept-Encoding": "gzip, deflate",
        })
        self.min_interval_s = min_interval_s
        self._last_request_ts = 0.0
        self._ticker_map_cache = None

    def _sleep_if_needed(self):
        """Block until at least ``min_interval_s`` has passed since the last request."""
        # Monotonic clock: a wall-clock adjustment must not produce a huge or negative wait.
        dt = time.monotonic() - self._last_request_ts
        if dt < self.min_interval_s:
            time.sleep(self.min_interval_s - dt)

    def _get(self, url: str, timeout: float, max_retries: int):
        """Shared retry loop; return the 200 response, or None once retries are exhausted.

        Non-retryable HTTP errors propagate via ``raise_for_status()``; a
        network error on the final attempt is re-raised.
        """
        for attempt in range(max_retries):
            is_last = attempt == max_retries - 1
            self._sleep_if_needed()
            try:
                r = self.s.get(url, timeout=timeout)
            except (requests.ConnectionError, requests.Timeout):
                if is_last:
                    raise
                time.sleep(min(2 ** attempt, 16))
                continue
            finally:
                # Count failed attempts too, so pacing holds during retries.
                self._last_request_ts = time.monotonic()

            if r.status_code == 200:
                return r

            if r.status_code in self._RETRYABLE_STATUS:
                # No point backing off when there is no further attempt.
                if not is_last:
                    time.sleep(min(2 ** attempt, 16))
                continue

            r.raise_for_status()

        return None

    def get_json(self, url: str, max_retries: int = 6) -> dict:
        """GET ``url`` and return the decoded JSON body.

        Raises:
            RuntimeError: if no attempt returned HTTP 200.
        """
        r = self._get(url, timeout=30, max_retries=max_retries)
        if r is None:
            raise RuntimeError(f"Failed GET JSON: {url}")
        return r.json()

    def get_text(self, url: str, max_retries: int = 6) -> str:
        """GET ``url`` and return the response body as text.

        Raises:
            RuntimeError: if no attempt returned HTTP 200.
        """
        r = self._get(url, timeout=60, max_retries=max_retries)
        if r is None:
            raise RuntimeError(f"Failed GET TEXT: {url}")
        return r.text

    # -------------------------
    # Ticker -> CIK
    # -------------------------
    def load_ticker_map(self) -> Dict[str, Dict]:
        """Return ``{TICKER: {"cik": int, "title": str}}``, downloaded once and cached."""
        if self._ticker_map_cache is not None:
            return self._ticker_map_cache

        url = "https://www.sec.gov/files/company_tickers.json"
        raw = self.get_json(url)

        m = {}
        for row in raw.values():
            # `or ""` tolerates an explicit null ticker instead of crashing on None.upper().
            t = (row.get("ticker") or "").upper()
            if t:
                m[t] = {"cik": int(row["cik_str"]), "title": row.get("title", "")}

        self._ticker_map_cache = m
        return m

    def cik_from_ticker(self, ticker: str) -> Tuple[int, str]:
        """Look up ``(cik, company_title)`` for a ticker (case/whitespace-insensitive).

        Raises:
            ValueError: if the ticker is not present in the SEC ticker map.
        """
        tm = self.load_ticker_map()
        key = ticker.upper().strip()
        if key not in tm:
            raise ValueError(f"Ticker not found in SEC ticker map: {ticker}")
        return tm[key]["cik"], tm[key]["title"]

    # -------------------------
    # Submissions
    # -------------------------
    def get_submissions(self, cik: int) -> dict:
        """Fetch the submissions JSON for ``cik`` (zero-padded to 10 digits in the URL)."""
        cik_padded = str(cik).zfill(10)
        url = f"https://data.sec.gov/submissions/CIK{cik_padded}.json"
        return self.get_json(url)

    def get_submissions_file(self, filename: str) -> dict:
        """Fetch an additional submissions file named in ``filings.files``."""
        # filenames in submissions['filings']['files'] are relative to:
        # https://data.sec.gov/submissions/<filename>
        url = f"https://data.sec.gov/submissions/{filename}"
        return self.get_json(url)


# ============================================================
# FILINGS: build full list (recent + older files)
# ============================================================
def collect_filings_for_form(
    sec: SECClient,
    ticker: str,
    form_base: str,
    mode: str = "latest",               # "latest" or "all"
    include_amendments: bool = False,   # include /A
    max_filings: int = 10
) -> Tuple[str, int, str, List[FilingRef]]:
    """Collect filings of one form type for a ticker, newest first.

    Args:
        sec: client used for the ticker lookup and submissions downloads.
        ticker: ticker symbol (case-insensitive).
        form_base: form type to keep, e.g. "10-K".
        mode: "latest" keeps only the newest filing; "all" also reads the
            older submission files and keeps up to ``max_filings``.
        include_amendments: also keep ``form_base + "/A"``.
        max_filings: cap applied when ``mode`` is not "latest".

    Returns:
        ``(ticker_upper, cik, title, filings)`` with ``filings`` sorted by
        filing date descending.
    """
    cik, title = sec.cik_from_ticker(ticker)
    sub = sec.get_submissions(cik)
    mode_norm = mode.lower()
    ticker_u = ticker.upper()
    company = title or sub.get("name", "")

    wanted_forms = {form_base}
    if include_amendments:
        wanted_forms.add(form_base + "/A")

    filings = []

    def ingest(payload: dict):
        # The main submissions JSON nests the columnar arrays under
        # filings.recent, while the additional files listed in filings.files
        # carry the same arrays at the top level. Fall back to the payload
        # itself so older filings are not silently dropped.
        table = payload.get("filings", {}).get("recent") or payload
        forms = table.get("form", [])
        acc = table.get("accessionNumber", [])
        filed = table.get("filingDate", [])
        report = table.get("reportDate", [])
        prim = table.get("primaryDocument", [])

        for i, f in enumerate(forms):
            if f not in wanted_forms:
                continue
            filings.append(FilingRef(
                cik=cik,
                company=company,
                ticker=ticker_u,
                form=f,
                filing_date=filed[i],
                accession_number=acc[i],
                primary_document=prim[i] if i < len(prim) else None,
                report_date=report[i] if i < len(report) else None
            ))

    # Start with the "recent" block of the main submissions document.
    ingest(sub)

    # If mode is "all", also ingest older filings referenced in "filings.files"
    if mode_norm == "all":
        for fobj in sub.get("filings", {}).get("files", []):
            fname = fobj.get("name")
            if not fname:
                continue
            ingest(sec.get_submissions_file(fname))

    # De-duplicate by form + accession number (later entries win).
    uniq = {(fr.form, fr.accession_number): fr for fr in filings}
    filings = list(uniq.values())

    # Filing dates are ISO strings, so a plain string sort orders them chronologically.
    filings.sort(key=lambda x: x.filing_date, reverse=True)

    limit = 1 if mode_norm == "latest" else max_filings
    filings = filings[:limit]

    return ticker_u, cik, title, filings


# ============================================================
# STATEMENT EXTRACTION (FilingSummary.xml route)
# ============================================================
def list_filing_files(sec: SECClient, filing: FilingRef) -> List[str]:
    """Return the file names listed in the filing folder's index.json."""
    listing = sec.get_json(filing.index_json_url)
    entries = listing.get("directory", {}).get("item", [])
    names = []
    for entry in entries:
        # Entries without a "name" key are skipped.
        if "name" in entry:
            names.append(entry["name"])
    return names

def parse_filing_summary_xml(xml_text: str) -> List[Dict]:
    """Extract ``{"short", "long", "html"}`` for every <Report> that names an HTML file."""

    def child_text(node, tag_name: str) -> str:
        # Stripped text of the named child element, or "" when it is absent.
        child = getattr(node, tag_name)
        return (child.text or "").strip() if child else ""

    document = BeautifulSoup(xml_text, "xml")
    found = []
    for report in document.find_all("Report"):
        short_name = child_text(report, "ShortName")
        long_name = child_text(report, "LongName")
        html_file = child_text(report, "HtmlFileName")
        if not html_file:
            continue
        found.append({"short": short_name, "long": long_name, "html": html_file})
    return found

def score_report(rep: Dict, kind: str) -> int:
    """Heuristically score how likely a report entry is the statement of ``kind``.

    ``kind`` is "BS", "IS" or "CFS"; any other value only receives the
    non-statement penalty. Matching is done on the lower-cased short and
    long report names.
    """
    text = " ".join((rep["short"], rep["long"])).lower()

    def mentions(*needles: str) -> bool:
        return any(needle in text for needle in needles)

    # Penalize obvious non-statements
    total = -5 if mentions(
        "notes", "note", "exhibit", "schedule", "quarterly data", "controls and procedures"
    ) else 0

    if kind == "BS":
        if mentions("balance sheet", "financial position"):
            total += 10
        if mentions("consolidated"):
            total += 2
    elif kind == "IS":
        if mentions("income statement", "statement of operations", "operations", "earnings"):
            total += 10
        if mentions("comprehensive income"):
            total += 1  # not always the main IS
    elif kind == "CFS":
        if mentions("cash flow", "cash flows"):
            total += 10
        if mentions("supplemental"):
            total -= 2

    return total

def pick_best_report(reports: List[Dict], kind: str) -> Optional[Dict]:
    """Return the highest-scoring report for ``kind``, or None if none scores above zero."""
    candidates = [(score_report(rep, kind), rep) for rep in reports]
    if not candidates:
        return None
    # max() keeps the first of equally scored reports, like a stable descending sort.
    top_score, top_report = max(candidates, key=lambda pair: pair[0])
    if top_score > 0:
        return top_report
    return None

def parse_number(x):
    """Convert a statement cell to a number, or None when it holds no number.

    Ints and floats are returned unchanged. Strings have footnote markers
    such as ``[1]``, thousands separators and ``$`` removed; a value wrapped
    in parentheses is negative, as is one written with a Unicode minus sign
    (U+2212). Blank cells, dash placeholders and unparseable text give None.
    """
    if x is None:
        return None
    if isinstance(x, (int, float)):
        return x
    s = str(x).strip()
    if s in {"", "-", "–", "—", "−"}:
        return None

    # Remove common footnote markers
    s = re.sub(r"\[\d+\]", "", s)
    s = s.replace(",", "").replace("$", "").strip()

    neg = False
    if s.startswith("(") and s.endswith(")"):
        neg = True
        s = s[1:-1].strip()

    # float() rejects the typographic minus sign, so map it to ASCII "-".
    s = s.replace("−", "-")

    try:
        v = float(s)
    except ValueError:
        # Not numeric (label text, stray symbols, ...).
        return None
    return -v if neg else v

def clean_statement_table(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy with stripped string column labels and numeric value columns.

    The first column is left untouched; every other column is passed
    through ``parse_number`` cell by cell.
    """
    cleaned = df.copy()
    cleaned.columns = [str(label).strip() for label in cleaned.columns]

    # Nothing to convert when there is only a label column (or none at all).
    if cleaned.shape[1] < 2:
        return cleaned

    for label in cleaned.columns[1:]:
        cleaned[label] = cleaned[label].map(parse_number)
    return cleaned

def table_score(kind: str, df: pd.DataFrame) -> int:
    """Score how statement-like a table is for ``kind`` ("BS", "IS" or "CFS").

    Tables with fewer than 2 columns or 6 rows score -999. Otherwise each
    anchor phrase found in the lower-cased first column adds 3, plus small
    bonuses for row count (capped at 60 rows) and column count (capped at 10).
    """
    n_rows, n_cols = df.shape[0], df.shape[1]
    if n_cols < 2 or n_rows < 6:
        return -999

    labels = " ".join(df.iloc[:, 0].astype(str).str.lower().tolist())

    anchors_by_kind = {
        "BS": ["total assets", "total liabilities", "stockholders", "shareholders", "equity"],
        "IS": ["net income", "revenue", "gross profit", "operating income", "earnings per share"],
        "CFS": ["operating activities", "investing activities", "financing activities", "net cash"],
    }
    hits = sum(1 for anchor in anchors_by_kind.get(kind, ()) if anchor in labels)

    # Prefer larger/denser statement tables
    size_bonus = min(n_rows, 60) // 10 + min(n_cols, 10)
    return 3 * hits + size_bonus

def extract_statement_tables(sec: SECClient, filing: FilingRef, report_html_filename: str, kind: str) -> Dict:
    """Download one report page and pick its most statement-like table.

    Args:
        sec: client used to download the page.
        filing: filing whose archive folder contains the report page.
        report_html_filename: file name of the report page inside that folder.
        kind: "BS", "IS" or "CFS"; forwarded to ``table_score``.

    Returns:
        Dict with ``url``, ``all_tables`` (cleaned tables with at least 2
        columns and 6 rows), ``best_table_index`` and ``best_table``. The
        last two are None when the page yields no usable table.
    """
    from io import StringIO

    url = filing.archive_dir + report_html_filename
    html = sec.get_text(url)

    try:
        # Wrap in StringIO: passing literal HTML to read_html is deprecated.
        tables = pd.read_html(StringIO(html))
    except ValueError:
        # read_html raises ValueError when the page has no tables; report
        # "nothing found" for this statement instead of failing the filing.
        tables = []

    # keep statement-like tables
    cleaned = [
        clean_statement_table(t)
        for t in tables
        if t.shape[1] >= 2 and t.shape[0] >= 6
    ]

    # max() returns the first index among equal scores (earliest table wins ties).
    best_idx = max(
        range(len(cleaned)),
        key=lambda i: table_score(kind, cleaned[i]),
        default=None,
    )

    return {
        "url": url,
        "all_tables": cleaned,
        "best_table_index": best_idx,
        "best_table": cleaned[best_idx] if best_idx is not None else None
    }

def get_3_statements(sec: SECClient, filing: FilingRef) -> Dict[str, Optional[Dict]]:
    """Locate and extract the balance sheet, income statement and cash-flow tables.

    Returns a dict keyed "BS", "IS", "CFS"; each value is None when no
    matching report was found, otherwise the extraction result plus a
    ``report_name`` entry.

    Raises:
        RuntimeError: if the filing folder has no FilingSummary.xml.
    """
    if "FilingSummary.xml" not in list_filing_files(sec, filing):
        # Some filings lack the summary; a fallback would have to parse the
        # primary document. Fail loudly so the cause is visible.
        raise RuntimeError("FilingSummary.xml not found in filing folder. Fallback parsing not implemented in this version.")

    summary_xml = sec.get_text(filing.archive_dir + "FilingSummary.xml")
    reports = parse_filing_summary_xml(summary_xml)

    kinds = ("BS", "IS", "CFS")
    # Choose all three reports before downloading any report page.
    chosen = {k: pick_best_report(reports, k) for k in kinds}

    result: Dict[str, Optional[Dict]] = {}
    for k in kinds:
        rep = chosen[k]
        if not rep:
            result[k] = None
            continue
        label = rep["short"] or rep["long"]
        extracted = extract_statement_tables(sec, filing, rep["html"], k)
        result[k] = {"report_name": label, **extracted}

    return result


# ============================================================
# EXPORT
# ============================================================
def safe_sheet(name: str) -> str:
    """Replace characters Excel forbids in sheet names with "_" and cap at 31 chars."""
    forbidden = "[]:*?/\\"
    table = str.maketrans({ch: "_" for ch in forbidden})
    sanitized = name.translate(table)
    # Excel tab limit: 31 chars
    return sanitized[:31]

def write_filing_to_excel(stmts: Dict[str, Optional[Dict]], out_path: Path):
    """Write the extracted statements of one filing to an .xlsx workbook.

    For each of "BS", "IS", "CFS" that has a best table, the best table goes
    to a sheet named after the kind and every other table from the same
    report page goes to ``<kind>_<n>``. If no statement has a best table, a
    single "NO_DATA" sheet is written, because a workbook without any sheet
    cannot be saved.

    Args:
        stmts: mapping as returned by ``get_3_statements``.
        out_path: destination file; parent folders are created as needed.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
        wrote_any = False
        for kind in ["BS", "IS", "CFS"]:
            obj = stmts.get(kind)
            if not obj or obj.get("best_table") is None:
                continue

            # best table to main sheet
            obj["best_table"].to_excel(writer, sheet_name=kind, index=False)
            wrote_any = True

            # extras (sheet numbers are 1-based positions within all_tables)
            best_idx = obj.get("best_table_index")
            for i, df in enumerate(obj.get("all_tables", []), start=1):
                if i - 1 == best_idx:
                    continue
                df.to_excel(writer, sheet_name=safe_sheet(f"{kind}_{i}"), index=False)

        if not wrote_any:
            # Without at least one sheet the writer fails when the file is saved.
            placeholder = pd.DataFrame(
                {"note": ["No statement tables were extracted for this filing."]}
            )
            placeholder.to_excel(writer, sheet_name="NO_DATA", index=False)


# ============================================================
# INTERACTIVE RUNNER
# ============================================================
def interactive():
    """Prompt for ticker/form/mode, then download and export statements per filing.

    Raises ValueError for an unsupported filing type or mode. A failure on
    one filing is printed and does not stop the remaining filings.
    """
    print("SEC 10-Q/10-K -> 3-statement extractor (Balance Sheet, Income Statement, Cash Flow)")

    ticker = input("Ticker (e.g., AAPL): ").strip().upper()

    form = input("Filing type (10-K or 10-Q): ").strip().upper()
    if form not in ("10-K", "10-Q"):
        raise ValueError("Filing type must be '10-K' or '10-Q'.")

    mode = input("Mode (latest/all): ").strip().lower()
    if mode not in ("latest", "all"):
        raise ValueError("Mode must be 'latest' or 'all'.")

    amend_answer = input("Include amendments (/A)? (y/n): ").strip().lower()
    include_amend = amend_answer == "y"

    # The cap is only asked for in "all" mode; "latest" always pulls one filing.
    if mode == "all":
        max_filings = int(input("Max number of filings to pull (e.g., 5, 10, 20): ").strip())
    else:
        max_filings = 1

    sec = SECClient(USER_AGENT, MIN_SECONDS_BETWEEN_REQUESTS)

    ticker_u, cik, company, filings = collect_filings_for_form(
        sec=sec,
        ticker=ticker,
        form_base=form,
        mode=mode,
        include_amendments=include_amend,
        max_filings=max_filings,
    )
    total = len(filings)

    print(f"\nCompany: {company} | Ticker: {ticker_u} | CIK: {cik}")
    print(f"Found {total} filing(s) for {form} (mode={mode}).")

    # All workbooks for this ticker go into a single folder named after it.
    ticker_dir = BASE_OUTPUT_DIR / ticker_u
    ticker_dir.mkdir(parents=True, exist_ok=True)

    for position, filing in enumerate(filings, start=1):
        print(f"\n[{position}/{total}] {filing.form} filed {filing.filing_date} | accession {filing.accession_number}")
        print(f"Filing folder: {filing.archive_dir}")

        try:
            stmts = get_3_statements(sec, filing)

            workbook_name = f"{ticker_u}_{filing.form}_{filing.filing_date}_{filing.accession_number}.xlsx"
            out_file = ticker_dir / workbook_name
            write_filing_to_excel(stmts, out_file)
            print(f"Saved: {out_file}")

            for kind in ("BS", "IS", "CFS"):
                entry = stmts.get(kind)
                if not entry or entry.get("best_table") is None:
                    print(f"  {kind}: NOT FOUND")
                    continue
                table_count = len(entry.get("all_tables", []))
                print(f"  {kind}: {entry['report_name']} | {table_count} table(s) | {entry['url']}")

        except Exception as e:
            print(f"FAILED for {filing.accession_number}: {e}")

    print("\nDone.")



# Run interactively when executed as a script or notebook cell (where
# __name__ is "__main__"); importing this module no longer starts prompting.
if __name__ == "__main__":
    interactive()


SEC 10-Q/10-K -> 3-statement extractor (Balance Sheet, Income Statement, Cash Flow)


ValueError: Filing type must be '10-K' or '10-Q'.

In [12]:
import re
import time
import zipfile
from pathlib import Path
from typing import Dict, List, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup

# =========================
# CONFIG
# =========================
# Sent as the User-Agent header on every request made through SECClient.
USER_AGENT = "Name (your.email@domain.com) - 13F overlap research"  # must include '@'
# Minimum spacing (seconds) SECClient enforces between consecutive requests.
MIN_SECONDS_BETWEEN_REQUESTS = 0.22

# Output is written under BASE_OUTPUT_DIR / OUT_FOLDER_NAME.
BASE_OUTPUT_DIR = Path(r"C:\Users\reigh\Desktop\Fin Statements")
OUT_FOLDER_NAME = "13F_Bulk_Overlap"

# Overlap behavior
DISTINGUISH_OPTIONS = False          # if True: overlap key = CUSIP|PUTCALL (usually not recommended)
EXCLUDE_OPTIONS_FROM_OVERLAP = True  # if True: ignore rows with PUTCALL for overlap + weights/totals

# Overlap definition (per period)
OVERLAP_MIN_MANAGERS = 2

# Auto-change analysis
AUTO_USE_LAST_N_PERIODS_PER_MANAGER = 2   # <-- key: last 2 reporting periods per manager

# Idea-generation thresholds (tuned for 5–10 CIKs)
# NOTE(review): these three are not used in the visible part of the file;
# confirm their exact meaning against the code that consumes them.
HIGH_CONVICTION_MAX_WEIGHT_PCT = 3.0
CROWDED_MIN_ABS = 3
CROWDED_MIN_FRAC = 0.50

# Change classification thresholds (to avoid noise)
# build_fund_changes labels a position UNCHANGED when |DeltaWeightPct| is below this.
WEIGHT_CHANGE_PCT_THRESHOLD = 0.05  # 0.05% weight move treated as "unchanged" band

# Page that list_dataset_links scrapes for "*_form13f.zip" links.
SEC_13F_DATASETS_PAGE = "https://www.sec.gov/data-research/sec-markets-data/form-13f-data-sets"

# Excel limits
# Default rows per sheet in write_df_split before a frame is split across sheets.
EXCEL_MAX_ROWS = 1_000_000


# =========================
# SEC CLIENT
# =========================
class SECClient:
    """Rate-limited HTTP client for SEC endpoints with retry and backoff.

    Requests are spaced at least ``min_interval_s`` seconds apart. Transient
    HTTP statuses (429/5xx) and transient network failures (connection
    errors, timeouts) are retried with exponential backoff capped at 16 s.
    """

    # HTTP statuses treated as transient and therefore retried.
    _RETRYABLE_STATUS = (429, 500, 502, 503, 504)

    def __init__(self, user_agent: str, min_interval_s: float = 0.22):
        """Create the session.

        Raises:
            ValueError: if ``user_agent`` is empty or contains no "@".
        """
        if not user_agent or "@" not in user_agent:
            raise ValueError("Set USER_AGENT like 'Name (your@email.com) - purpose'.")
        self.s = requests.Session()
        self.s.headers.update({
            "User-Agent": user_agent,
            "Accept-Encoding": "gzip, deflate",
        })
        self.min_interval_s = min_interval_s
        self._last_request_ts = 0.0

    def _sleep_if_needed(self):
        """Block until at least ``min_interval_s`` has passed since the last request."""
        # Monotonic clock: a wall-clock adjustment must not produce a huge or negative wait.
        dt = time.monotonic() - self._last_request_ts
        if dt < self.min_interval_s:
            time.sleep(self.min_interval_s - dt)

    def get(self, url: str, stream: bool = False, max_retries: int = 6) -> requests.Response:
        """GET ``url`` and return the HTTP 200 response.

        Args:
            url: absolute URL to fetch.
            stream: forwarded to ``requests``; when True the caller must
                consume or close the returned response.
            max_retries: total number of attempts.

        Raises:
            requests.HTTPError: for a non-retryable error status.
            requests.ConnectionError, requests.Timeout: if the final attempt
                fails at the network level.
            RuntimeError: if no attempt returned HTTP 200.
        """
        for attempt in range(max_retries):
            is_last = attempt == max_retries - 1
            self._sleep_if_needed()
            try:
                r = self.s.get(url, timeout=60, stream=stream)
            except (requests.ConnectionError, requests.Timeout):
                if is_last:
                    raise
                time.sleep(min(2 ** attempt, 16))
                continue
            finally:
                # Count failed attempts too, so pacing holds during retries.
                self._last_request_ts = time.monotonic()

            if r.status_code == 200:
                return r
            if r.status_code in self._RETRYABLE_STATUS:
                # The response is discarded: release its connection, which a
                # streamed response would otherwise keep checked out.
                r.close()
                # No point backing off when there is no further attempt.
                if not is_last:
                    time.sleep(min(2 ** attempt, 16))
                continue
            r.raise_for_status()
        raise RuntimeError(f"Failed GET: {url}")


# =========================
# HELPERS
# =========================
def normalize_cik(x: str) -> str:
    """Keep only the digits of ``x`` and return them as a zero-padded 10-char CIK.

    Raises:
        ValueError: if ``x`` contains no digits.
    """
    digits = "".join(re.findall(r"\d", x.strip()))
    if digits == "":
        raise ValueError(f"Bad CIK: {x}")
    # int() drops any leading zeros before the value is re-padded.
    return f"{int(digits)}".zfill(10)

def parse_dd_mon_yyyy(s: str) -> pd.Timestamp:
    """Parse a date such as "31-Dec-2023"; unparseable input yields NaT."""
    date_format = "%d-%b-%Y"
    parsed = pd.to_datetime(s, errors="coerce", format=date_format)
    return parsed

def holding_key(df: pd.DataFrame) -> pd.Series:
    """Build the per-row holding key: CUSIP, or "CUSIP|PUTCALL" when DISTINGUISH_OPTIONS is set."""
    cusip = df["CUSIP"].astype(str)
    if not DISTINGUISH_OPTIONS:
        return cusip
    # A missing PUTCALL becomes an empty suffix after the separator.
    put_call = df["PUTCALL"].fillna("").astype(str)
    return cusip + "|" + put_call

def find_file_by_token(folder: Path, token: str) -> Path:
    """Return the first .tsv/.txt file under ``folder`` whose name contains ``token``.

    The match is case-insensitive and the search is recursive.

    Raises:
        FileNotFoundError: if no such file exists.
    """
    needle = token.upper()
    accepted_suffixes = {".tsv", ".txt"}
    for candidate in folder.rglob("*"):
        if not candidate.is_file():
            continue
        if needle not in candidate.name.upper():
            continue
        if candidate.suffix.lower() in accepted_suffixes:
            return candidate
    raise FileNotFoundError(f"Could not find a TSV/TXT file containing '{token}' under {folder}")

def compute_pairwise(keys_by_mgr: Dict[str, set]) -> pd.DataFrame:
    """Compute overlap, union and Jaccard similarity for every pair of managers.

    Pairs are formed from the sorted manager names and the result is sorted
    by overlap count, then Jaccard, both descending. With fewer than two
    managers an empty frame with the same columns is returned.
    """
    columns = ["ManagerA", "ManagerB", "OverlapCount", "UnionCount", "Jaccard"]
    names = sorted(keys_by_mgr.keys())

    records = []
    for position, left in enumerate(names):
        left_keys = keys_by_mgr[left]
        for right in names[position + 1:]:
            right_keys = keys_by_mgr[right]
            shared = len(left_keys & right_keys)
            combined = len(left_keys | right_keys)
            # Two empty key sets have no union; report 0.0 rather than divide by zero.
            similarity = (shared / combined) if combined else 0.0
            records.append({
                "ManagerA": left,
                "ManagerB": right,
                "OverlapCount": shared,
                "UnionCount": combined,
                "Jaccard": similarity
            })

    if not records:
        return pd.DataFrame(columns=columns)
    table = pd.DataFrame(records)
    return table.sort_values(["OverlapCount", "Jaccard"], ascending=False)

def safe_excel_writer_path(out_dir: Path, base_name: str) -> Path:
    """
    Return a writable .xlsx path inside ``out_dir`` (created if missing).

    The preferred path is ``<base_name>.xlsx``. If it cannot be opened for
    appending (e.g. the file is open in Excel), a timestamped filename is
    returned instead.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    preferred = out_dir / f"{base_name}.xlsx"
    try:
        # Opening in append mode probes writability without truncating the file.
        probe = open(preferred, "ab")
    except PermissionError:
        stamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
        return out_dir / f"{base_name}_{stamp}.xlsx"
    probe.close()
    return preferred

def clean_sheet_name(s: str) -> str:
    """Reduce ``s`` to letters, digits, space, "_" and "-", capped at 31 chars.

    Falls back to "sheet" when nothing remains.
    """
    kept = re.sub(r"[^A-Za-z0-9 _-]", "", str(s)).strip()
    shortened = kept[:31]
    if shortened:
        return shortened
    return "sheet"

def write_df_split(writer: pd.ExcelWriter, df: pd.DataFrame, sheet_base: str, max_rows: int = EXCEL_MAX_ROWS) -> None:
    """Write ``df`` to one sheet, or to numbered sheets of at most ``max_rows`` rows.

    A None or empty frame produces a single empty sheet. When splitting,
    sheets are named ``<base>_1``, ``<base>_2``, ... with the base shortened
    so that each name stays within 31 characters.
    """
    base = clean_sheet_name(sheet_base)
    single_name = base[:31]

    if df is None or df.empty:
        pd.DataFrame().to_excel(writer, sheet_name=single_name, index=False)
        return

    total_rows = len(df)
    if total_rows <= max_rows:
        df.to_excel(writer, sheet_name=single_name, index=False)
        return

    part = 0
    offset = 0
    while offset < total_rows:
        part += 1
        tag = f"_{part}"
        # Reserve room for the suffix so the full name fits the 31-char limit.
        room = 31 - len(tag)
        part_name = (base[:room] + tag)[:31]
        piece = df.iloc[offset:offset + max_rows].copy()
        piece.to_excel(writer, sheet_name=part_name, index=False)
        offset += max_rows


# =========================
# DISCOVER + DOWNLOAD DATASET ZIP
# =========================
def list_dataset_links(sec: SECClient, max_items: int = 12) -> List[Tuple[str, str]]:
    """Scrape the 13F data-sets page for ``(link text, URL)`` pairs of "*_form13f.zip" files.

    Links are returned in page order, limited to the first ``max_items``.
    """
    page = BeautifulSoup(sec.get(SEC_13F_DATASETS_PAGE).text, "html.parser")

    found = []
    for anchor in page.find_all("a"):
        target = anchor.get("href") or ""
        if not target.endswith("_form13f.zip"):
            continue
        label = anchor.get_text(" ", strip=True)
        # Site-relative links are made absolute.
        if target.startswith("/"):
            target = "https://www.sec.gov" + target
        found.append((label, target))
    return found[:max_items]

def download_zip(sec: SECClient, url: str, out_path: Path) -> None:
    """Stream ``url`` to ``out_path`` in 1 MiB chunks.

    The body is written to ``<out_path>.part`` and moved into place only
    after the download completes, so an interrupted download never leaves a
    truncated archive at ``out_path``. The response is always closed.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = out_path.with_name(out_path.name + ".part")

    r = sec.get(url, stream=True)
    completed = False
    try:
        with open(tmp_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
        completed = True
    finally:
        # A streamed response holds its connection until closed.
        r.close()
        if not completed and tmp_path.exists():
            tmp_path.unlink()

    tmp_path.replace(out_path)


# =========================
# LOAD + FILTER TABLES
# =========================
def load_submission(submission_path: Path) -> pd.DataFrame:
    """Read the SUBMISSION TSV as strings and add parsed dates plus a padded CIK.

    Adds ``FILING_DATE_TS`` and ``PERIODOFREPORT_TS`` (parsed with
    ``parse_dd_mon_yyyy``) and left-pads ``CIK`` with zeros to 10 characters.
    """
    frame = pd.read_csv(submission_path, sep="\t", dtype=str)
    for source_col, parsed_col in (
        ("FILING_DATE", "FILING_DATE_TS"),
        ("PERIODOFREPORT", "PERIODOFREPORT_TS"),
    ):
        frame[parsed_col] = frame[source_col].apply(parse_dd_mon_yyyy)
    padded_cik = frame["CIK"].astype(str).str.zfill(10)
    frame["CIK"] = padded_cik
    return frame

def pick_latest_accessions_per_period(sub: pd.DataFrame, target_ciks: List[str]) -> pd.DataFrame:
    """
    Keep one 13F-HR or 13F-HR/A row per (CIK, PERIODOFREPORT_TS) for the target CIKs.

    Within each group the row with the latest FILING_DATE_TS is kept, so a
    later amendment wins over the original filing.
    """
    wanted_types = ["13F-HR", "13F-HR/A"]
    keep_cols = [
        "ACCESSION_NUMBER", "CIK", "SUBMISSIONTYPE", "FILING_DATE", "PERIODOFREPORT",
        "FILING_DATE_TS", "PERIODOFREPORT_TS"
    ]

    is_wanted = sub["SUBMISSIONTYPE"].isin(wanted_types) & sub["CIK"].isin(target_ciks)
    candidates = sub[is_wanted].copy()

    # Newest filing first within each (CIK, period), so keep="first" selects it.
    ordered = candidates.sort_values(
        ["CIK", "PERIODOFREPORT_TS", "FILING_DATE_TS"],
        ascending=[True, True, False],
    )
    latest = ordered.drop_duplicates(subset=["CIK", "PERIODOFREPORT_TS"], keep="first")
    return latest[keep_cols].copy()

def pick_last_n_periods_per_manager(picked_per_period: pd.DataFrame, n: int) -> pd.DataFrame:
    """
    Return, for each CIK, the rows of its ``n`` most recent PERIODOFREPORT_TS values.

    Rows come back ordered by CIK ascending, then period descending.
    """
    newest_first = picked_per_period.sort_values(
        by=["CIK", "PERIODOFREPORT_TS"],
        ascending=[True, False],
    )
    per_manager = newest_first.groupby("CIK", dropna=False)
    selected = per_manager.head(n)
    return selected.copy()

def load_coverpage(coverpage_path: Path, accessions: set) -> pd.DataFrame:
    """Read the COVERPAGE TSV as strings, keeping only rows for the given accession numbers."""
    frame = pd.read_csv(coverpage_path, sep="\t", dtype=str)
    selected = frame["ACCESSION_NUMBER"].isin(accessions)
    return frame[selected].copy()

def load_infotable_chunked(infotable_path: Path, accessions: set) -> pd.DataFrame:
    """Read the INFOTABLE TSV in chunks, keeping only rows for the given accessions.

    Only the known columns are loaded (those absent from the file are
    ignored). Whitespace is removed from CUSIP, the value/share/voting
    columns are converted to numbers (unparseable values become NaN), and
    an empty PUTCALL is treated as missing. With no matching rows an empty
    frame carrying the full column list is returned.
    """
    wanted_cols = [
        "ACCESSION_NUMBER", "INFOTABLE_SK", "NAMEOFISSUER", "TITLEOFCLASS",
        "CUSIP", "FIGI", "VALUE", "SSHPRNAMT", "SSHPRNAMTTYPE", "PUTCALL",
        "INVESTMENTDISCRETION", "OTHERMANAGER", "VOTING_AUTH_SOLE", "VOTING_AUTH_SHARED", "VOTING_AUTH_NONE"
    ]
    numeric_cols = ["VALUE", "SSHPRNAMT", "VOTING_AUTH_SOLE", "VOTING_AUTH_SHARED", "VOTING_AUTH_NONE"]

    reader = pd.read_csv(
        infotable_path,
        sep="\t",
        dtype=str,
        usecols=lambda c: c in wanted_cols,
        chunksize=400_000,
    )
    # Filter each chunk as it is read so only matching rows are retained.
    kept = []
    for part in reader:
        matching = part[part["ACCESSION_NUMBER"].isin(accessions)]
        if matching.empty:
            continue
        kept.append(matching)

    if not kept:
        return pd.DataFrame(columns=wanted_cols)

    table = pd.concat(kept, ignore_index=True)

    table["CUSIP"] = table["CUSIP"].astype(str).str.replace(r"\s+", "", regex=True)

    for col in numeric_cols:
        if col not in table.columns:
            continue
        without_commas = table[col].astype(str).str.replace(",", "")
        table[col] = pd.to_numeric(without_commas, errors="coerce")

    if "PUTCALL" in table.columns:
        table["PUTCALL"] = table["PUTCALL"].replace("", pd.NA)

    return table


# =========================
# CHANGE ANALYSIS
# =========================
def build_fund_changes(mgr_cusip_vals: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    For each manager, compare latest period vs previous period within the selected data.

    The input must provide the columns read below: manager, CUSIP,
    PERIODOFREPORT, PERIODOFREPORT_TS, NAMEOFISSUER, TITLEOFCLASS,
    PositionValue_AsFiled, Total13FValue_AsFiled and WeightPct.
    Managers with fewer than two distinct periods are skipped.

    Returns:
      - fund_changes_long: one row per CUSIP with NEW/EXIT/INCREASE/DECREASE/UNCHANGED
      - fund_changes_summary: manager-level summary
    Both are empty DataFrames when the input is empty or no manager has two periods.
    """
    if mgr_cusip_vals.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Determine last 2 periods per manager from available rows (already filtered earlier)
    periods_by_mgr = (
        mgr_cusip_vals[["manager", "PERIODOFREPORT_TS"]].drop_duplicates()
        .sort_values(["manager", "PERIODOFREPORT_TS"], ascending=[True, False])
    )

    # manager -> [latest period, previous period] (shorter when fewer periods exist)
    mgr_to_two = (
        periods_by_mgr.groupby("manager", dropna=False)["PERIODOFREPORT_TS"]
        .apply(lambda s: list(s.head(2)))
        .to_dict()
    )

    rows = []
    for mgr, per_list in mgr_to_two.items():
        if len(per_list) < 2:
            continue
        p_curr_ts, p_prev_ts = per_list[0], per_list[1]

        curr = mgr_cusip_vals[(mgr_cusip_vals["manager"] == mgr) & (mgr_cusip_vals["PERIODOFREPORT_TS"] == p_curr_ts)].copy()
        prev = mgr_cusip_vals[(mgr_cusip_vals["manager"] == mgr) & (mgr_cusip_vals["PERIODOFREPORT_TS"] == p_prev_ts)].copy()

        # Outer join on CUSIP
        # Columns other than the join keys get the _curr / _prev suffix.
        # NOTE(review): assumes at most one row per (manager, CUSIP) in each
        # period; duplicates would multiply rows in the join - confirm upstream.
        j = curr.merge(
            prev,
            on=["manager", "CUSIP"],
            how="outer",
            suffixes=("_curr", "_prev")
        )

        # Carry period labels
        j["PERIOD_CURR_TS"] = p_curr_ts
        j["PERIOD_PREV_TS"] = p_prev_ts

        # Prefer current labels else previous
        for col in ["PERIODOFREPORT", "NAMEOFISSUER", "TITLEOFCLASS"]:
            j[col] = j[f"{col}_curr"].combine_first(j[f"{col}_prev"])

        # Fill missing numeric values
        # (a CUSIP present in only one of the two periods counts as 0 in the other)
        for col in ["PositionValue_AsFiled", "Total13FValue_AsFiled", "WeightPct"]:
            j[f"{col}_curr"] = j[f"{col}_curr"].fillna(0)
            j[f"{col}_prev"] = j[f"{col}_prev"].fillna(0)

        j["DeltaValue_AsFiled"] = j["PositionValue_AsFiled_curr"] - j["PositionValue_AsFiled_prev"]
        j["DeltaWeightPct"] = j["WeightPct_curr"] - j["WeightPct_prev"]

        def classify(r):
            # Presence is judged by a strictly positive filed position value.
            had_prev = r["PositionValue_AsFiled_prev"] > 0
            has_curr = r["PositionValue_AsFiled_curr"] > 0
            if (not had_prev) and has_curr:
                return "NEW"
            if had_prev and (not has_curr):
                return "EXIT"
            # both present
            # (rows with a non-positive value in both periods also reach this branch)
            if abs(r["DeltaWeightPct"]) < WEIGHT_CHANGE_PCT_THRESHOLD:
                return "UNCHANGED"
            return "INCREASE" if r["DeltaWeightPct"] > 0 else "DECREASE"

        j["ChangeType"] = j.apply(classify, axis=1)

        out_cols = [
            "manager",
            "PERIODOFREPORT", "CUSIP", "NAMEOFISSUER", "TITLEOFCLASS",
            "PERIOD_PREV_TS", "PERIOD_CURR_TS",
            "PositionValue_AsFiled_prev", "WeightPct_prev",
            "PositionValue_AsFiled_curr", "WeightPct_curr",
            "DeltaValue_AsFiled", "DeltaWeightPct",
            "ChangeType"
        ]
        rows.append(j[out_cols])

    fund_changes_long = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()

    if fund_changes_long.empty:
        return fund_changes_long, pd.DataFrame()

    # Human-readable period strings
    fund_changes_long["PERIOD_PREV"] = fund_changes_long["PERIOD_PREV_TS"].dt.strftime("%d-%b-%Y")
    fund_changes_long["PERIOD_CURR"] = fund_changes_long["PERIOD_CURR_TS"].dt.strftime("%d-%b-%Y")

    # Summary per manager
    # (counts per change type plus net and absolute weight/value movement,
    # sorted so the managers with the largest absolute weight movement come first)
    summ = (
        fund_changes_long.groupby(["manager", "PERIOD_PREV", "PERIOD_CURR"], dropna=False)
        .agg(
            NewPositions=("ChangeType", lambda s: (s == "NEW").sum()),
            Exits=("ChangeType", lambda s: (s == "EXIT").sum()),
            Increases=("ChangeType", lambda s: (s == "INCREASE").sum()),
            Decreases=("ChangeType", lambda s: (s == "DECREASE").sum()),
            Unchanged=("ChangeType", lambda s: (s == "UNCHANGED").sum()),
            SumDeltaWeightPct=("DeltaWeightPct", "sum"),
            SumAbsDeltaWeightPct=("DeltaWeightPct", lambda x: x.abs().sum()),
            SumDeltaValue_AsFiled=("DeltaValue_AsFiled", "sum"),
        )
        .reset_index()
        .sort_values(["SumAbsDeltaWeightPct"], ascending=[False])
    )

    # Sort long table for usability
    fund_changes_long = fund_changes_long.sort_values(
        ["manager", "ChangeType", "DeltaWeightPct"],
        ascending=[True, True, False]
    )

    return fund_changes_long, summ


# =========================
# MAIN
# =========================
def interactive_bulk_13f():
    """Interactively build a multi-manager 13F overlap / change workbook.

    Workflow:
      1. Prompt for at least two manager CIKs and for one of the discovered
         SEC 13F bulk datasets.
      2. Download and extract the dataset ZIP under
         ``BASE_OUTPUT_DIR / OUT_FOLDER_NAME``.
      3. Keep the latest filing per (CIK, period), then the last
         ``AUTO_USE_LAST_N_PERIODS_PER_MANAGER`` periods per manager.
      4. Derive per-manager position weights, overlap tables, idea-ranking
         scores, fund summaries and period-over-period changes.
      5. Write every table to a single Excel workbook and print its path.

    Raises:
        ValueError: fewer than two CIKs were entered, or the dataset
            selection is not a valid list number.
        RuntimeError: no dataset links were found, no matching 13F rows
            exist for the CIKs, or the INFOTABLE filter returned no rows.
    """
    sec = SECClient(USER_AGENT, MIN_SECONDS_BETWEEN_REQUESTS)

    cik_input = input("Enter manager CIKs (comma-separated): ").strip()
    target_ciks = [normalize_cik(x) for x in cik_input.split(",") if x.strip()]
    if len(target_ciks) < 2:
        raise ValueError("Provide at least 2 CIKs to compute overlaps.")

    print("\nDiscovering latest SEC 13F dataset links...")
    links = list_dataset_links(sec, max_items=10)
    if not links:
        raise RuntimeError("Could not find dataset links on the SEC page.")

    print("\nChoose dataset:")
    for i, (label, url) in enumerate(links, start=1):
        print(f"  {i:2d}) {label}  |  {url}")

    choice = input("\nEnter number (or press Enter for #1 = latest): ").strip()
    try:
        idx = 1 if choice == "" else int(choice)
    except ValueError as err:
        # Non-numeric input gets the same message as an out-of-range number.
        raise ValueError("Invalid selection.") from err
    if idx < 1 or idx > len(links):
        raise ValueError("Invalid selection.")
    label, zip_url = links[idx - 1]
    print(f"\nSelected: {label}")

    out_dir = BASE_OUTPUT_DIR / OUT_FOLDER_NAME
    raw_dir = out_dir / "raw_extracted"
    raw_dir.mkdir(parents=True, exist_ok=True)

    zip_path = out_dir / "dataset_form13f.zip"
    print(f"\nDownloading ZIP to: {zip_path}")
    download_zip(sec, zip_url, zip_path)

    print(f"Extracting ZIP to: {raw_dir}")
    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(raw_dir)

    submission_path = find_file_by_token(raw_dir, "SUBMISSION")
    infotable_path = find_file_by_token(raw_dir, "INFOTABLE")
    # COVERPAGE is optional: without it managers are labelled by CIK below.
    coverpage_path = None
    try:
        coverpage_path = find_file_by_token(raw_dir, "COVERPAGE")
    except FileNotFoundError:
        pass

    print(f"\nLoading SUBMISSION: {submission_path.name}")
    sub = load_submission(submission_path)

    # 1) pick latest filing per (CIK, period)
    picked_pp = pick_latest_accessions_per_period(sub, target_ciks)
    if picked_pp.empty:
        raise RuntimeError("No 13F-HR/13F-HR/A rows found for your CIKs in this dataset.")

    # 2) then pick last N periods per manager (CIK)
    picked = pick_last_n_periods_per_manager(picked_pp, AUTO_USE_LAST_N_PERIODS_PER_MANAGER)

    # If a manager has only 1 period in dataset, it will still remain; changes will skip it.
    accessions = set(picked["ACCESSION_NUMBER"].tolist())
    print(f"Selected {len(accessions)} accession(s) = last {AUTO_USE_LAST_N_PERIODS_PER_MANAGER} period(s) per manager where available.")

    cover = pd.DataFrame()
    if coverpage_path is not None:
        print(f"Loading COVERPAGE: {coverpage_path.name}")
        cover = load_coverpage(coverpage_path, accessions)

    print(f"Loading INFOTABLE (chunked): {infotable_path.name}")
    info = load_infotable_chunked(infotable_path, accessions)
    if info.empty:
        raise RuntimeError("INFOTABLE filter returned 0 rows — check if dataset includes holdings for your CIKs.")

    merged = info.merge(
        picked[["ACCESSION_NUMBER", "CIK", "SUBMISSIONTYPE", "FILING_DATE", "PERIODOFREPORT", "FILING_DATE_TS", "PERIODOFREPORT_TS"]],
        on="ACCESSION_NUMBER",
        how="left"
    )

    # Attach cover-page attributes. Only FILINGMANAGER_NAME is required for the
    # merge; the amendment columns are taken when present and otherwise filled
    # with None so a cover file lacking them cannot raise KeyError.
    optional_cover_cols = ["ISAMENDMENT", "AMENDMENTTYPE"]
    if not cover.empty and "FILINGMANAGER_NAME" in cover.columns:
        cover_cols = ["ACCESSION_NUMBER", "FILINGMANAGER_NAME"] + [
            c for c in optional_cover_cols if c in cover.columns
        ]
        merged = merged.merge(
            cover[cover_cols],
            on="ACCESSION_NUMBER",
            how="left"
        )
    else:
        merged["FILINGMANAGER_NAME"] = None
    for col in optional_cover_cols:
        if col not in merged.columns:
            merged[col] = None

    # Manager label: the filed manager name when it is a non-blank string,
    # otherwise a zero-padded "CIK##########" fallback.
    merged["manager"] = merged.apply(
        lambda r: (r["FILINGMANAGER_NAME"] if isinstance(r["FILINGMANAGER_NAME"], str) and r["FILINGMANAGER_NAME"].strip()
                   else f"CIK{str(r['CIK']).zfill(10)}"),
        axis=1
    )

    # Base universe (option filter)
    base = merged.copy()
    if EXCLUDE_OPTIONS_FROM_OVERLAP and "PUTCALL" in base.columns:
        base = base[base["PUTCALL"].isna()]

    # Fund totals + weights (as filed unit cancels in weights)
    mgr_cusip_vals = (
        base.groupby(["PERIODOFREPORT", "PERIODOFREPORT_TS", "manager", "CUSIP"], dropna=False)["VALUE"]
        .sum(min_count=1)
        .rename("PositionValue_AsFiled")
        .reset_index()
    )

    fund_totals = (
        mgr_cusip_vals.groupby(["PERIODOFREPORT", "PERIODOFREPORT_TS", "manager"], dropna=False)["PositionValue_AsFiled"]
        .sum(min_count=1)
        .rename("Total13FValue_AsFiled")
        .reset_index()
    )

    mgr_cusip_vals = mgr_cusip_vals.merge(
        fund_totals, on=["PERIODOFREPORT", "PERIODOFREPORT_TS", "manager"], how="left"
    )
    mgr_cusip_vals["WeightPct"] = (mgr_cusip_vals["PositionValue_AsFiled"] / mgr_cusip_vals["Total13FValue_AsFiled"]) * 100

    # One issuer/class label per (period, CUSIP): the first one encountered.
    labels = (
        base.groupby(["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP"], dropna=False)[["NAMEOFISSUER", "TITLEOFCLASS"]]
        .first()
        .reset_index()
    )
    mgr_cusip_vals = mgr_cusip_vals.merge(labels, on=["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP"], how="left")

    # -------------------------
    # OVERLAPS (per period)
    # -------------------------
    cusip_mgr_counts = (
        mgr_cusip_vals.groupby(["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP"], dropna=False)["manager"]
        .nunique()
        .rename("ManagersHolding")
        .reset_index()
    )
    overlap_cusips = cusip_mgr_counts[cusip_mgr_counts["ManagersHolding"] >= OVERLAP_MIN_MANAGERS].copy()

    cusip_total_value = (
        mgr_cusip_vals.groupby(["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP"], dropna=False)["PositionValue_AsFiled"]
        .sum(min_count=1)
        .rename("TotalValue_AsFiled_AllManagers")
        .reset_index()
    )

    all_overlaps = (
        overlap_cusips.merge(cusip_total_value, on=["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP"], how="left")
        .merge(labels, on=["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP"], how="left")
        .sort_values(["PERIODOFREPORT_TS", "ManagersHolding", "TotalValue_AsFiled_AllManagers"],
                     ascending=[False, False, False])
    )

    overlap_weights_long = (
        mgr_cusip_vals.merge(overlap_cusips, on=["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP"], how="inner")
        .sort_values(["PERIODOFREPORT_TS", "ManagersHolding", "CUSIP", "WeightPct"],
                     ascending=[False, False, True, False])
    )

    overlap_weights_matrix = overlap_weights_long.pivot_table(
        index=["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP", "NAMEOFISSUER", "TITLEOFCLASS", "ManagersHolding"],
        columns="manager",
        values="WeightPct",
        aggfunc="first"
    ).reset_index()

    # Pairwise overlap counts per period
    pairwise_frames = []
    for (period, period_ts), dfp in base.groupby(["PERIODOFREPORT", "PERIODOFREPORT_TS"], dropna=False):
        keys_by_mgr = {}
        for mgr, dfm in dfp.groupby("manager"):
            keys_by_mgr[mgr] = set(holding_key(dfm).dropna().tolist())
        pw = compute_pairwise(keys_by_mgr)
        if not pw.empty:
            pw.insert(0, "PERIODOFREPORT", period)
            pw.insert(1, "PERIODOFREPORT_TS", period_ts)
            pairwise_frames.append(pw)

    pairwise_overlap = pd.concat(pairwise_frames, ignore_index=True) if pairwise_frames else pd.DataFrame(
        columns=["PERIODOFREPORT", "PERIODOFREPORT_TS", "ManagerA", "ManagerB", "OverlapCount", "UnionCount", "Jaccard"]
    )

    # -------------------------
    # COMMON ALL (per period)
    # -------------------------
    # CUSIPs held by every manager present in a given period.
    common_all_rows = []
    for (period, period_ts), dfp in base.groupby(["PERIODOFREPORT", "PERIODOFREPORT_TS"], dropna=False):
        sets = []
        for _, g in dfp.groupby("manager"):
            sets.append(set(g["CUSIP"].dropna().astype(str).tolist()))
        common_cusips = set.intersection(*sets) if sets else set()
        for cusip in common_cusips:
            common_all_rows.append({"PERIODOFREPORT": period, "PERIODOFREPORT_TS": period_ts, "CUSIP": cusip})

    common_all_universe = pd.DataFrame(common_all_rows)
    if common_all_universe.empty:
        common_all = pd.DataFrame(columns=[
            "PERIODOFREPORT", "PERIODOFREPORT_TS", "manager", "CUSIP", "NAMEOFISSUER", "TITLEOFCLASS",
            "PositionValue_AsFiled", "Total13FValue_AsFiled", "WeightPct", "ManagersHolding"
        ])
    else:
        common_all = (
            mgr_cusip_vals.merge(common_all_universe, on=["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP"], how="inner")
            .merge(cusip_mgr_counts, on=["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP"], how="left")
            .sort_values(["PERIODOFREPORT_TS", "WeightPct"], ascending=[False, False])
        )

    # ============================================================
    # ADDITIONS 1-4 (idea generation)
    # ============================================================
    sec_stats = (
        mgr_cusip_vals.groupby(["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP", "NAMEOFISSUER", "TITLEOFCLASS"], dropna=False)
        .agg(
            ManagersHolding=("manager", "nunique"),
            TotalValue_AllManagers_AsFiled=("PositionValue_AsFiled", "sum"),
            SumWeightPct=("WeightPct", "sum"),
            AvgWeightPct=("WeightPct", "mean"),
            MedianWeightPct=("WeightPct", "median"),
            MaxWeightPct=("WeightPct", "max"),
            MinWeightPct=("WeightPct", "min"),
        )
        .reset_index()
    )

    # Manager with the largest weight in each (period, CUSIP).
    top_holder = (
        mgr_cusip_vals.sort_values(["PERIODOFREPORT_TS", "CUSIP", "WeightPct"], ascending=[False, True, False])
        .groupby(["PERIODOFREPORT_TS", "CUSIP"], dropna=False)
        .head(1)[["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP", "manager", "WeightPct"]]
        .rename(columns={"manager": "TopHolder", "WeightPct": "TopHolderWeightPct"})
    )
    sec_stats = sec_stats.merge(top_holder, on=["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP"], how="left")

    # Sum of squared normalised weights per (period, CUSIP): how concentrated
    # the holding is among the managers that own it.
    tmp = mgr_cusip_vals.copy()
    tmp["SumWeight_forCUSIP"] = tmp.groupby(["PERIODOFREPORT_TS", "CUSIP"], dropna=False)["WeightPct"].transform("sum")
    tmp["w_norm"] = tmp["WeightPct"] / tmp["SumWeight_forCUSIP"]
    hhi = (
        tmp.groupby(["PERIODOFREPORT_TS", "CUSIP"], dropna=False)["w_norm"]
        .apply(lambda s: (s.fillna(0) ** 2).sum())
        .rename("WeightHHI")
        .reset_index()
    )
    sec_stats = sec_stats.merge(hhi, on=["PERIODOFREPORT_TS", "CUSIP"], how="left")

    sec_stats["ConsensusScore"] = (
        (sec_stats["ManagersHolding"] ** 0.5) * (sec_stats["AvgWeightPct"] + 0.5)
        + (sec_stats["SumWeightPct"] * 0.20)
    )
    sec_stats["NicheConvictionScore"] = (
        (sec_stats["MaxWeightPct"] * 1.00) * (sec_stats["WeightHHI"].fillna(0) + 0.10)
    )

    idea_ranker_overlaps = sec_stats[sec_stats["ManagersHolding"] >= OVERLAP_MIN_MANAGERS].copy()
    idea_ranker_overlaps = idea_ranker_overlaps.sort_values(
        ["PERIODOFREPORT_TS", "ConsensusScore", "NicheConvictionScore"],
        ascending=[False, False, False]
    )

    mgr_count_per_period = (
        mgr_cusip_vals.groupby(["PERIODOFREPORT", "PERIODOFREPORT_TS"], dropna=False)["manager"]
        .nunique().rename("ManagersInUniverse").reset_index()
    )
    idea_ranker_overlaps = idea_ranker_overlaps.merge(mgr_count_per_period, on=["PERIODOFREPORT", "PERIODOFREPORT_TS"], how="left")
    # Threshold = the larger of the absolute floor and the (approximately
    # rounded-up) fraction of managers in the period's universe.
    idea_ranker_overlaps["CrowdedThreshold"] = idea_ranker_overlaps["ManagersInUniverse"].apply(
        lambda n: max(CROWDED_MIN_ABS, int((n * CROWDED_MIN_FRAC) + 0.9999))
    )

    def bucket_row(r):
        """Classify one security row by crowding and maximum weight."""
        crowded = r["ManagersHolding"] >= r["CrowdedThreshold"]
        high_conv = r["MaxWeightPct"] >= HIGH_CONVICTION_MAX_WEIGHT_PCT
        if crowded and high_conv:
            return "Consensus high conviction"
        if crowded and not high_conv:
            return "Crowded low conviction"
        if (not crowded) and high_conv:
            return "Niche high conviction"
        return "Other"

    idea_ranker_overlaps["IdeaBucket"] = idea_ranker_overlaps.apply(bucket_row, axis=1)

    idea_quadrants = idea_ranker_overlaps[[
        "PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP", "NAMEOFISSUER", "TITLEOFCLASS",
        "ManagersHolding", "ManagersInUniverse", "CrowdedThreshold",
        "AvgWeightPct", "MaxWeightPct", "SumWeightPct",
        "TopHolder", "TopHolderWeightPct",
        "WeightHHI", "ConsensusScore", "NicheConvictionScore",
        "IdeaBucket"
    ]].copy()

    # Weighted pairwise overlap by $ share of portfolio (per period)
    holdings_set = (
        mgr_cusip_vals.groupby(["PERIODOFREPORT_TS", "manager"], dropna=False)["CUSIP"]
        .apply(lambda s: set(s.dropna().astype(str))).to_dict()
    )
    totals_map = (
        fund_totals.set_index(["PERIODOFREPORT_TS", "manager"])["Total13FValue_AsFiled"].to_dict()
    )
    pos_map = (
        mgr_cusip_vals.set_index(["PERIODOFREPORT_TS", "manager", "CUSIP"])["PositionValue_AsFiled"].to_dict()
    )

    rows = []
    for raw_period_ts in sorted(mgr_cusip_vals["PERIODOFREPORT_TS"].dropna().unique(), reverse=True):
        # Depending on the pandas version, unique() may yield NumPy datetime64
        # scalars rather than Timestamps. The lookup dicts above are keyed by
        # index values (Timestamps), so normalise to keep the key hashes equal.
        period_ts = pd.Timestamp(raw_period_ts)
        mgrs = sorted(mgr_cusip_vals[mgr_cusip_vals["PERIODOFREPORT_TS"] == period_ts]["manager"].dropna().unique())
        # recover period string
        per_str = mgr_cusip_vals.loc[mgr_cusip_vals["PERIODOFREPORT_TS"] == period_ts, "PERIODOFREPORT"].iloc[0]
        for i in range(len(mgrs)):
            for j in range(i + 1, len(mgrs)):
                a, b = mgrs[i], mgrs[j]
                Sa = holdings_set.get((period_ts, a), set())
                Sb = holdings_set.get((period_ts, b), set())
                inter = Sa & Sb
                if not inter:
                    continue
                a_total = totals_map.get((period_ts, a), 0) or 0
                b_total = totals_map.get((period_ts, b), 0) or 0
                a_overlap_val = sum(pos_map.get((period_ts, a, c), 0) or 0 for c in inter)
                b_overlap_val = sum(pos_map.get((period_ts, b, c), 0) or 0 for c in inter)
                rows.append({
                    "PERIODOFREPORT": per_str,
                    "PERIODOFREPORT_TS": period_ts,
                    "ManagerA": a,
                    "ManagerB": b,
                    "OverlapCount": len(inter),
                    "OverlapValue_AsFiled_A": a_overlap_val,
                    "OverlapValue_AsFiled_B": b_overlap_val,
                    "OverlapWeightShare_A_Pct": (a_overlap_val / a_total * 100) if a_total else 0.0,
                    "OverlapWeightShare_B_Pct": (b_overlap_val / b_total * 100) if b_total else 0.0,
                })

    pairwise_weighted_overlap = pd.DataFrame(rows)
    if not pairwise_weighted_overlap.empty:
        pairwise_weighted_overlap = pairwise_weighted_overlap.sort_values(
            ["PERIODOFREPORT_TS", "OverlapWeightShare_A_Pct", "OverlapWeightShare_B_Pct"],
            ascending=[False, False, False]
        )

    # Fund summary: overlap exposure + concentration (per period)
    overlap_flag = mgr_cusip_vals.merge(
        overlap_cusips[["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP", "ManagersHolding"]],
        on=["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP"],
        how="left"
    )
    overlap_flag["IsOverlapped"] = overlap_flag["ManagersHolding"].fillna(0) >= OVERLAP_MIN_MANAGERS

    fund_summary = (
        overlap_flag.groupby(["PERIODOFREPORT", "PERIODOFREPORT_TS", "manager"], dropna=False)
        .agg(
            NumPositions=("CUSIP", "nunique"),
            NumOverlappedPositions=("IsOverlapped", "sum"),
            # Sum only the weights of rows flagged as overlapped; the flag is
            # looked up by index on the enclosing frame.
            OverlapWeightPct_Total=("WeightPct", lambda s: s[overlap_flag.loc[s.index, "IsOverlapped"]].sum()),
            LargestPositionWeightPct=("WeightPct", "max"),
        )
        .reset_index()
    )

    top10_conc = (
        overlap_flag.sort_values(["PERIODOFREPORT_TS", "manager", "WeightPct"], ascending=[False, True, False])
        .groupby(["PERIODOFREPORT_TS", "manager"], dropna=False)
        .head(10)
        .groupby(["PERIODOFREPORT_TS", "manager"], dropna=False)["WeightPct"]
        .sum()
        .rename("Top10ConcentrationPct")
        .reset_index()
    )
    fund_summary = fund_summary.merge(top10_conc, on=["PERIODOFREPORT_TS", "manager"], how="left")
    fund_summary = fund_summary.sort_values(["PERIODOFREPORT_TS", "OverlapWeightPct_Total"], ascending=[False, False])

    # Crowding summary
    crowded_holdings = (
        overlap_weights_long.groupby(["PERIODOFREPORT", "PERIODOFREPORT_TS", "CUSIP", "NAMEOFISSUER", "TITLEOFCLASS", "ManagersHolding"], dropna=False)
        .agg(
            TotalValue_AsFiled_AllManagers=("PositionValue_AsFiled", "sum"),
            AvgWeightPct=("WeightPct", "mean"),
            MedianWeightPct=("WeightPct", "median"),
            MinWeightPct=("WeightPct", "min"),
            MaxWeightPct=("WeightPct", "max"),
        )
        .reset_index()
        .sort_values(["PERIODOFREPORT_TS", "ManagersHolding", "AvgWeightPct", "TotalValue_AsFiled_AllManagers"],
                     ascending=[False, False, False, False])
    )

    # ============================================================
    # NEW: FUND CHANGES (latest vs previous per manager)
    # ============================================================
    fund_changes_long, fund_changes_summary = build_fund_changes(mgr_cusip_vals)

    # Also write a "periods_selected" tab for transparency
    periods_selected = picked.sort_values(["CIK", "PERIODOFREPORT_TS"], ascending=[True, False]).copy()
    periods_selected["CIK"] = periods_selected["CIK"].astype(str).str.zfill(10)

    # =========================
    # WRITE OUTPUT
    # =========================
    out_file = safe_excel_writer_path(out_dir, "13F_bulk_overlap_summary")

    with pd.ExcelWriter(out_file, engine="openpyxl") as writer:
        write_df_split(writer, periods_selected, "periods_selected")

        write_df_split(writer, pairwise_overlap, "pairwise_overlap")
        write_df_split(writer, pairwise_weighted_overlap, "pairwise_weighted")
        write_df_split(writer, all_overlaps, "all_overlaps")
        write_df_split(writer, overlap_weights_long, "overlap_weights_long")
        write_df_split(writer, overlap_weights_matrix, "overlap_weights_matrix")
        write_df_split(writer, common_all, "common_all")
        write_df_split(writer, crowded_holdings, "crowded_holdings")
        write_df_split(
            writer,
            fund_totals.sort_values(["PERIODOFREPORT_TS", "Total13FValue_AsFiled"], ascending=[False, False]),
            "fund_totals"
        )

        # 1-4 idea-gen
        write_df_split(writer, idea_ranker_overlaps, "idea_ranker")
        write_df_split(writer, idea_quadrants, "idea_quadrants")
        write_df_split(writer, fund_summary, "fund_summary")

        # NEW change tabs
        write_df_split(writer, fund_changes_long, "fund_changes_long")
        write_df_split(writer, fund_changes_summary, "fund_changes_summary")

        # Per-manager drilldown (small universes only)
        # Cleaned manager names can coincide with each other or with a summary
        # tab; writing twice to one sheet name would mix data, so each name is
        # made unique (case-insensitively) within Excel's 31-character limit.
        taken_sheets = {str(name).casefold() for name in writer.sheets}
        for mgr, dfm in mgr_cusip_vals.groupby("manager"):
            base_sheet = str(clean_sheet_name(mgr))
            sheet = base_sheet
            suffix_no = 2
            while sheet.casefold() in taken_sheets:
                suffix = f"_{suffix_no}"
                sheet = base_sheet[: 31 - len(suffix)] + suffix
                suffix_no += 1
            taken_sheets.add(sheet.casefold())
            dfm.sort_values(["PERIODOFREPORT_TS", "WeightPct"], ascending=[False, False]).to_excel(
                writer, sheet_name=sheet, index=False
            )

    print(f"\nSaved: {out_file}")
    print("\nUse these tabs for change detection:")
    print(" - fund_changes_long: NEW / EXIT / INCREASE / DECREASE / UNCHANGED per CUSIP per manager.")
    print(" - fund_changes_summary: counts + aggregate deltas per manager (latest vs previous).")
    print(" - periods_selected: exactly which two report periods were pulled per CIK.")
    print("\nIf the file name has a timestamp: you had the old workbook open in Excel.")
    print("\nIf the file name has a timestamp: you had the old workbook open in Excel.")


# Entry point: start the interactive 13F workflow only when this code is
# executed directly (not when it is imported as a module).
if __name__ == "__main__":
    interactive_bulk_13f()



Discovering latest SEC 13F dataset links...

Choose dataset:
   1) 2025 September October November 13F  |  https://www.sec.gov/files/structureddata/data/form-13f-data-sets/01sep2025-30nov2025_form13f.zip
   2) 2025 June July August 13F  |  https://www.sec.gov/files/structureddata/data/form-13f-data-sets/01jun2025-31aug2025_form13f.zip
   3) 2025 March April May 13F  |  https://www.sec.gov/files/structureddata/data/form-13f-data-sets/01mar2025-31may2025_form13f.zip
   4) 2024 December 2025 January February 13F  |  https://www.sec.gov/files/structureddata/data/form-13f-data-sets/01dec2024-28feb2025_form13f.zip
   5) 2024 September October November 13F  |  https://www.sec.gov/files/structureddata/data/form-13f-data-sets/01sep2024-30nov2024_form13f.zip
   6) 2024 June July August 13F  |  https://www.sec.gov/files/structureddata/data/form-13f-data-sets/01jun2024-31aug2024_form13f.zip
   7) 2024 March April May 13F  |  https://www.sec.gov/files/structureddata/data/form-13f-data-sets/01mar20