In [1]:
import os, re, time, json
from datetime import datetime, date
from typing import Optional, Dict, Any, List, Iterable
import requests

# ⚠️ Replace with your own details (e.g. "Hong Gildong MySecNotebook/1.0 (your.email@example.com)")
USER_AGENT = "kangtae MySecNotebook/1.0 (honeypipeline@gmail.com)"

# Base URLs the helper functions in this file use to build request URLs.
SEC_BASE = "https://data.sec.gov"
ARCHIVES_BASE = "https://www.sec.gov/Archives/edgar/data"

# Shared HTTP session: every request sent through it carries the headers below.
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": USER_AGENT,
    "Accept-Encoding": "gzip, deflate",
})

def _rate_limit_sleep():
    # SEC 서버 예의상 과도한 요청 방지 (너무 빠르면 429/403 발생)
    time.sleep(0.3)

def _parse_date(d: str) -> date:
    if d.lower() == "today":
        return date.today()
    return datetime.strptime(d, "%Y-%m-%d").date()

def zero_pad_cik(cik_str: str) -> str:
    """Return the digits of *cik_str* left-padded with zeros to 10 characters.

    Every non-digit character is dropped before padding.
    """
    only_digits = re.sub(r"\D", "", cik_str)
    # The string holds digits only, so right-justifying with "0" is plain
    # zero padding.
    return only_digits.rjust(10, "0")

In [2]:
def get_cik_from_ticker(ticker: str) -> str:
    """Look up the zero-padded 10-digit CIK for *ticker*.

    Uses the ticker -> CIK mapping file published by the SEC. The ticker is
    matched case-insensitively.

    Raises:
        ValueError: if the ticker is not present in the mapping.
        requests.HTTPError: if the mapping file request fails.
    """
    url = "https://www.sec.gov/files/company_tickers.json"
    _rate_limit_sleep()
    response = SESSION.get(url, timeout=30)
    response.raise_for_status()

    # Build the full lower-cased ticker table; a later entry with the same
    # ticker overrides an earlier one.
    cik_by_ticker = {}
    for entry in response.json().values():
        cik_by_ticker[entry["ticker"].lower()] = str(entry["cik_str"])

    cik = cik_by_ticker.get(ticker.lower())
    if cik:
        return zero_pad_cik(cik)
    raise ValueError(f"Ticker not found in SEC mapping: {ticker}")

def get_all_submissions(cik10: str) -> Dict[str, Any]:
    """Fetch a company's filing index as four parallel lists.

    Reads submissions/CIK##########.json and appends every additional
    history file listed under ``filings.files``.

    Returns:
        A dict with the keys ``form``, ``accessionNumber``,
        ``primaryDocument`` and ``filingDate``, each mapping to a list.

    Raises:
        requests.HTTPError: if any of the requests fails.
    """
    fields = ["form", "accessionNumber", "primaryDocument", "filingDate"]

    url = f"{SEC_BASE}/submissions/CIK{cik10}.json"
    _rate_limit_sleep()
    response = SESSION.get(url, timeout=30)
    response.raise_for_status()
    root = response.json()

    filings = root.get("filings", {})
    recent = filings.get("recent", {})
    # Copy the lists so extending them never touches the parsed payload.
    combined = {field: list(recent.get(field, [])) for field in fields}

    # Merge in the older history files.
    for file_entry in filings.get("files", []):
        name = file_entry.get("name")
        if name:
            _rate_limit_sleep()
            hist_response = SESSION.get(f"{SEC_BASE}/submissions/{name}", timeout=30)
            hist_response.raise_for_status()
            hist = hist_response.json()
            for field in fields:
                combined[field].extend(hist.get(field, []))
    return combined

def iter_filings_in_range(cik10: str, forms: Iterable[str], start_d: date, end_d: date):
    """Yield the filings of *cik10* that match a form type and a date range.

    Args:
        cik10: Zero-padded CIK passed on to ``get_all_submissions``.
        forms: Form types to keep (e.g. ``["10-Q", "10-K"]``). Any iterable
            works, including a one-shot generator; a single comma-separated
            string such as ``"10-Q,10-K"`` is accepted as well. Surrounding
            whitespace on each entry is ignored.
        start_d: First filing date to include.
        end_d: Last filing date to include (inclusive).

    Yields:
        Dicts with the keys ``form``, ``accessionNumber``,
        ``primaryDocument``, ``filingDate`` (the original string) and
        ``cik10``. Rows whose filing date cannot be parsed are skipped.
    """
    # Normalise the wanted forms once: a set gives exact, O(1) membership
    # tests and is safe even when the caller hands in an iterator that could
    # otherwise be consumed by the first ``in`` check.
    if isinstance(forms, str):
        forms = forms.split(",")
    wanted = frozenset(s.strip() for s in forms if s and s.strip())

    combined = get_all_submissions(cik10)
    rows = zip(combined["form"], combined["accessionNumber"],
               combined["primaryDocument"], combined["filingDate"])
    for form, accession, primary_doc, filing_date in rows:
        if form not in wanted:
            continue
        try:
            parsed = datetime.strptime(filing_date, "%Y-%m-%d").date()
        except (TypeError, ValueError):
            # Missing or malformed date: skip the row instead of aborting.
            continue
        if start_d <= parsed <= end_d:
            yield {
                "form": form,
                "accessionNumber": accession,
                "primaryDocument": primary_doc,
                "filingDate": filing_date,
                "cik10": cik10,
            }

In [None]:
def list_archives_files(cik10: str, accession_number: str) -> List[Dict[str, Any]]:
    """Return the file entries listed in a filing folder's index.json.

    The folder URL is built from the CIK as an integer (no leading zeros)
    and the accession number with its dashes removed.

    Raises:
        requests.HTTPError: if the index request fails.
    """
    numeric_cik = int(cik10)
    folder = accession_number.replace("-", "")
    index_url = f"{ARCHIVES_BASE}/{numeric_cik}/{folder}/index.json"

    _rate_limit_sleep()
    response = SESSION.get(index_url, timeout=30)
    response.raise_for_status()

    directory = response.json().get("directory", {})
    return directory.get("item", [])

def _size_int(it) -> int:
    # size가 "", None, 비숫자여도 안전하게 0으로 처리
    s = it.get("size", 0)
    try:
        return int(s)
    except Exception:
        return 0

def filter_items(items, kind: str):
    """Select which entries of a filing's file listing should be downloaded.

    Args:
        items: List of dicts from index.json; each is expected to carry a
            "name" and may carry a "size".
        kind: Selection mode.
            - "all": return *items* unchanged.
            - "primary": return a one-element list holding the largest
              .htm/.html/.txt entry, or an empty list if there is none.
            - "xbrl": return the .xml/.xsd/.zip/.json entries followed by
              the .htm/.html entries (iXBRL), de-duplicated by name; if
              nothing matches, fall back to the largest primary candidate.
            - anything else: return *items* unchanged.

    Returns:
        A list of item dicts.
    """
    if kind == "all":
        return items

    def lowered_name(it) -> str:
        # "name" may be missing or None; treat both as an empty name so the
        # entry is simply never selected instead of raising.
        return (it.get("name") or "").lower()

    # Main-document candidates. Matching on these suffixes already rules out
    # .css/.js files, so no separate exclusion is needed.
    primary_candidates = [
        it for it in items
        if lowered_name(it).endswith((".htm", ".html", ".txt"))
    ]

    if kind == "primary":
        if not primary_candidates:
            return []
        # max() returns the first of equally large entries, matching a
        # stable descending sort.
        return [max(primary_candidates, key=_size_int)]

    if kind == "xbrl":
        # Classic XBRL / auxiliary files first (FilingSummary.xml is covered
        # by ".xml"), then the HTML documents that may embed iXBRL.
        data_exts = (".xml", ".xsd", ".zip", ".json")
        html_exts = (".htm", ".html")
        ordered = [it for it in items if lowered_name(it).endswith(data_exts)]
        ordered += [it for it in items if lowered_name(it).endswith(html_exts)]

        # Drop repeated names, keeping the first occurrence.
        seen = set()
        sel = []
        for it in ordered:
            nm = it.get("name") or ""
            if nm not in seen:
                seen.add(nm)
                sel.append(it)

        # Fallback: nothing XBRL-like, so take the largest main document.
        if not sel and primary_candidates:
            sel = [max(primary_candidates, key=_size_int)]
        return sel

    return items

def download_file(item: dict, out_dir: str, cik10: str, accession_number: str):
    """Download one index.json entry into *out_dir*.

    Args:
        item: Entry from the filing's index.json. If it has no "href", the
            Archives path is built from the CIK, the accession number and
            the entry's "name".
        out_dir: Target directory; created if it does not exist.
        cik10: Zero-padded CIK of the filer.
        accession_number: Accession number, with or without dashes.

    Returns:
        The local file path on success, or None when the entry has neither
        a name nor an href, no file name can be derived, or the server
        answers with a non-OK status.
    """
    name = item.get("name")
    href = item.get("href")

    if not name and not href:
        return None

    if not href:
        # No href in the listing: build the Archives path ourselves.
        cik_int = int(cik10)
        acc_nodash = accession_number.replace("-", "")
        href = f"/Archives/edgar/data/{cik_int}/{acc_nodash}/{name}"

    url = href if href.startswith("http") else f"https://www.sec.gov{href}"

    # Derive the local file name from the path part only, so a query string
    # or fragment never ends up in the name; fall back to the entry's name
    # when the href ends in a slash.
    path_part = href.split("?", 1)[0].split("#", 1)[0]
    filename = os.path.basename(path_part) or os.path.basename(name or "")
    if not filename:
        print(f"[DOWNLOAD FAIL] {url} -> no file name")
        return None

    os.makedirs(out_dir, exist_ok=True)
    local_path = os.path.join(out_dir, filename)

    _rate_limit_sleep()
    # A streamed response holds its connection until it is closed; the
    # context manager releases it on every exit path, including the early
    # return below.
    with SESSION.get(url, stream=True, timeout=60) as resp:
        if not resp.ok:
            print(f"[DOWNLOAD FAIL] {url} -> {resp.status_code}")
            return None

        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(8192):
                if chunk:
                    f.write(chunk)
    return local_path

In [4]:
# Date range / form types / save options
START = "2021-01-01"
END = "today"
FORMS = "10-Q,10-K"      # e.g. "10-Q" only or "10-K" only also works
# KIND = "xbrl"            # "xbrl" | "primary" | "all"
KIND = "primary" 
OUT_ROOT = "./edgar_data"

In [5]:
# Target company. CIK (e.g. "0000320193"), when set, takes precedence over
# TICKER when resolving the company.
TICKER = "AAPL"
CIK = None

# Turn the settings into the values the helper functions expect.
forms = [part.strip() for part in FORMS.split(",") if part.strip()]
start_d = _parse_date(START)
end_d = _parse_date(END)

if CIK:
    cik10 = zero_pad_cik(CIK)
else:
    cik10 = get_cik_from_ticker(TICKER)
print("Resolved CIK:", cik10)

# One (filingDate, form, accessionNumber, primaryDocument) tuple per filing.
columns = ("filingDate", "form", "accessionNumber", "primaryDocument")
rows = []
for meta in iter_filings_in_range(cik10, forms, start_d, end_d):
    rows.append(tuple(meta[col] for col in columns))

# Oldest filing first; the ISO date strings sort chronologically.
rows_sorted = sorted(rows, key=lambda row: row[0])
print(f"총 {len(rows_sorted)}건 발견")
for r in rows_sorted:
    print(r)  # at this point every matching filing is listed

Resolved CIK: 0000320193
총 19건 발견
('2021-01-28', '10-Q', '0000320193-21-000010', 'aapl-20201226.htm')
('2021-04-29', '10-Q', '0000320193-21-000056', 'aapl-20210327.htm')
('2021-07-28', '10-Q', '0000320193-21-000065', 'aapl-20210626.htm')
('2021-10-29', '10-K', '0000320193-21-000105', 'aapl-20210925.htm')
('2022-01-28', '10-Q', '0000320193-22-000007', 'aapl-20211225.htm')
('2022-04-29', '10-Q', '0000320193-22-000059', 'aapl-20220326.htm')
('2022-07-29', '10-Q', '0000320193-22-000070', 'aapl-20220625.htm')
('2022-10-28', '10-K', '0000320193-22-000108', 'aapl-20220924.htm')
('2023-02-03', '10-Q', '0000320193-23-000006', 'aapl-20221231.htm')
('2023-05-05', '10-Q', '0000320193-23-000064', 'aapl-20230401.htm')
('2023-08-04', '10-Q', '0000320193-23-000077', 'aapl-20230701.htm')
('2023-11-03', '10-K', '0000320193-23-000106', 'aapl-20230930.htm')
('2024-02-02', '10-Q', '0000320193-24-000006', 'aapl-20231230.htm')
('2024-05-03', '10-Q', '0000320193-24-000069', 'aapl-20240330.htm')
('2024-08-02',

In [None]:
# Download the selected files of every matching filing and write a
# manifest.json per filing plus one summary file for the whole run.
os.makedirs(OUT_ROOT, exist_ok=True)
count = 0
manifests = []

for meta in iter_filings_in_range(cik10, forms, start_d, end_d):
    cik_int = int(cik10)
    acc_no = meta["accessionNumber"]
    acc_nodash = acc_no.replace("-", "")
    # One folder per filing: <symbol>_<cik>_<accession>_<form>_<filing date>
    folder_name = f"{TICKER or CIK}_{cik_int}_{acc_nodash}_{meta['form']}_{meta['filingDate']}"
    subdir = os.path.join(OUT_ROOT, folder_name)

    items = list_archives_files(cik10, acc_no)
    chosen = filter_items(items, kind=KIND)
    print("chosen count:", len(chosen))
    for it in chosen[:5]:
        print(it)   # inspect name, href, size

    saved_files = []
    for it in chosen:
        p = download_file(it, subdir, cik10=cik10, accession_number=acc_no)
        if not p:
            continue
        saved_files.append(os.path.basename(p))

    # Make sure the folder exists even when nothing was downloaded, so the
    # manifest can still be written.
    os.makedirs(subdir, exist_ok=True)

    manifest = dict(
        meta,
        symbol=(TICKER or CIK),
        saved_files=saved_files,
        output_dir=os.path.abspath(subdir),
        kind=KIND,
    )
    manifest_path = os.path.join(subdir, "manifest.json")
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)
    manifests.append(manifest)
    count += 1
    print(f"[{TICKER or CIK}] {count:02d} | {meta['filingDate']} {meta['form']} → {len(saved_files)} files")

# Summary file covering every filing processed above.
summary_path = os.path.join(OUT_ROOT, f"{TICKER or CIK}_summary.json")
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump({"results": manifests}, f, ensure_ascii=False, indent=2)

print(f"\n완료: 총 {len(manifests)}건. 요약: {summary_path}")

chosen count: 1
{'last-modified': '2025-08-01 06:00:42', 'name': 'aapl-20250628.htm', 'type': 'text.gif', 'size': '888048'}
[AAPL] 01 | 2025-08-01 10-Q → 0 files
chosen count: 1
{'last-modified': '2025-05-02 06:00:46', 'name': 'aapl-20250329.htm', 'type': 'text.gif', 'size': '889977'}
[AAPL] 02 | 2025-05-02 10-Q → 0 files
chosen count: 1
{'last-modified': '2025-01-31 06:01:27', 'name': 'aapl-20241228.htm', 'type': 'text.gif', 'size': '732589'}
[AAPL] 03 | 2025-01-31 10-Q → 0 files
chosen count: 1
{'last-modified': '2024-11-01 06:01:36', 'name': 'aapl-20240928.htm', 'type': 'text.gif', 'size': '1503780'}
[AAPL] 04 | 2024-11-01 10-K → 0 files
chosen count: 1
{'last-modified': '2024-08-01 18:03:34', 'name': 'aapl-20240629.htm', 'type': 'text.gif', 'size': '889136'}
[AAPL] 05 | 2024-08-02 10-Q → 0 files
chosen count: 1
{'last-modified': '2024-05-02 18:04:25', 'name': 'aapl-20240330.htm', 'type': 'text.gif', 'size': '884846'}
[AAPL] 06 | 2024-05-03 10-Q → 0 files
chosen count: 1
{'last-modi