# In [None]:  (notebook cell marker left over from export)
"""
Pipeline for ESG Reporting Classifier (SEC filings + BERT).
"""

from pathlib import Path
import requests
import pandas as pd

from config import RAW_DATA_DIR, PROCESSED_DATA_DIR, SEC_EDGAR_API_KEY


def fetch_sec_filings(cik: str, form_type: str = "10-K", limit: int = 5) -> Path:
    """
    Return the location of the raw filings index (SEC EDGAR fetch stub).

    No request is made yet: ``cik``, ``form_type`` and ``limit`` are accepted
    so the eventual call signature is already in place, but the body does not
    read any of them. A real implementation would pull recent filings from
    SEC's API or RSS/JSON endpoints; until then the filings are assumed to
    have been downloaded (text/HTML) into the raw data directory beforehand.

    Args:
        cik: Company identifier (SEC CIK) to fetch filings for. Unused for now.
        form_type: Filing form to fetch, e.g. "10-K". Unused for now.
        limit: Intended cap on the number of filings. Unused for now.

    Returns:
        ``RAW_DATA_DIR / "raw_filings_index.csv"``. The file is neither
        created nor checked for existence here.
    """
    index_filename = "raw_filings_index.csv"
    return RAW_DATA_DIR / index_filename


def run_pipeline():
    """
    Run the ESG classifier pipeline outline.

    Only the first stage is wired up: the (stub) filings index path is
    resolved and a confirmation message is printed. The remaining stages are
    sketched in the TODO comments in the body. Planned flow:

    1. Collect a sample of filings (10-K, 20-F) from SEC Edgar.
    2. Parse out the business / ESG sections (Item 1, risk factors, etc.).
    3. Label text spans as Environment/Social/Governance (rule-based + manual seed).
    4. Tokenize and fine-tune a BERT classifier on ESG labels.
    5. Score new filings; compute ESG disclosure intensity and authenticity metrics.
    6. Aggregate by company/sector → export for Tableau (heatmaps, rankings).

    Returns:
        None. The only visible effect is the message printed at the end.
    """
    example_cik = "0000320193"  # Apple CIK as example
    index_path = fetch_sec_filings(example_cik)

    # TODO: wire up the remaining stages once the helpers exist.
    # index_df = pd.read_csv(index_path)
    # texts = load_and_parse_filings(index_df)
    # labeled = build_esg_training_set(texts)
    # model = train_bert_classifier(labeled)
    # scores = score_filings(model, texts)
    #
    # PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
    # scores.to_csv(PROCESSED_DATA_DIR / "esg_scores.csv", index=False)

    print("ESG classifier pipeline outline executed.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_pipeline()