In [6]:
# Backend logic for BSE raw JSON dump, importable in a notebook
import json
import random
import time
from datetime import datetime, timedelta
from pathlib import Path

import requests
from dateutil.parser import parse as dtparse

# Headers sent with every API request.
# NOTE(review): presumably the API expects browser-like Origin/Referer
# values pointing at the BSE website — confirm before changing them.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Origin": "https://www.bseindia.com",
    "Referer": "https://www.bseindia.com/",
}

# Announcement endpoint template, one query parameter per line.
# The placeholders {page}, {from_}, {to_} and {scrip} are filled in with
# str.format(); fetch_chunk() passes the two dates as YYYYMMDD strings.
URL = "https://api.bseindia.com/BseIndiaAPI/api/AnnSubCategoryGetData/w?" + "&".join(
    (
        "pageno={page}",
        "strCat=-1",
        "strPrevDate={from_}",
        "strScrip={scrip}",
        "strSearch=P",
        "strToDate={to_}",
        "strType=C",
        "subcategory=-1",
    )
)


def year_chunks(start_date, end_date):
    """
    Split the inclusive range [start_date, end_date] into consecutive windows.

    Yields (window_start, window_end) pairs, both inclusive. Every window
    covers 365 days except the last, which is cut off at end_date. Because
    the window length is a fixed 365 days rather than a calendar year, the
    boundaries move one day earlier each time a window contains a 29 February.
    Nothing is yielded when start_date is after end_date.
    """
    one_day = timedelta(days=1)
    window_length = timedelta(days=365)

    window_start = start_date
    while window_start <= end_date:
        # Last day of a full window; clamp it so the final window stops at end_date.
        window_end = window_start + window_length - one_day
        if window_end > end_date:
            window_end = end_date
        yield window_start, window_end
        # The next window begins the day after this one ends.
        window_start = window_end + one_day


def fetch_chunk(session, scrip, d_from, d_to):
    """
    Download every result page for one scrip code and date window.

    Pages are requested in order starting at 1, with d_from and d_to sent as
    YYYYMMDD strings. Paging stops at the first response whose "Table" entry
    is missing or empty; that response is not kept.

    Returns a list with one decoded JSON payload per non-empty page.
    Exceptions from session.get(), raise_for_status() or json() propagate
    to the caller.
    """
    # The date bounds are the same for every page, so format them once.
    from_stamp = d_from.strftime("%Y%m%d")
    to_stamp = d_to.strftime("%Y%m%d")

    collected = []
    page_no = 1
    while True:
        resp = session.get(
            URL.format(page=page_no, from_=from_stamp, to_=to_stamp, scrip=scrip),
            headers=HEADERS,
            timeout=20,
        )
        resp.raise_for_status()
        body = resp.json()
        if not body.get("Table"):
            # A missing or empty "Table" marks the end of the results.
            return collected
        collected.append(body)
        page_no += 1

def dump_raw_payloads(payloads, outfile):
    """
    Append each payload to outfile as an indented JSON document.

    outfile is opened in append mode with UTF-8 encoding (it is created if
    missing, even when payloads is empty). Each payload is written with
    2-space indentation and without ASCII-escaping, followed by a newline.

    Note that the result is a sequence of multi-line JSON documents written
    back to back: it is neither a single JSON value nor one-document-per-line
    JSON, so it cannot be read with a plain json.load() or line by line.
    """
    with outfile.open("a", encoding="utf-8") as sink:
        for payload in payloads:
            json.dump(payload, sink, ensure_ascii=False, indent=2)
            # Separate consecutive documents with a line break.
            sink.write("\n")


def run(map_str, start_str, end_str=None, output_dir_name="output"):
    """
    Programmatic entrypoint: download raw announcement payloads per ISIN.

    Parameters:
      - map_str: comma-separated "ISIN=scripCode" pairs
      - start_str: start date, YYYY-MM-DD (parsed with dateutil)
      - end_str: end date, YYYY-MM-DD (defaults to today if None or empty)
      - output_dir_name: directory to write JSON files into; it is created,
        together with any missing parent directories, if it does not exist

    For every ISIN the file "<ISIN>.json" in the output directory is deleted
    if present and then rebuilt by appending the payloads of each ~1-year
    chunk of the date range. A random 1-14 second pause is inserted between
    consecutive chunk downloads.

    Returns a list of log strings, one per (ISIN, chunk); each string is
    also printed as it is produced.

    Raises:
      ValueError: if the start date is after the end date, or if a mapping
        entry has no "=" or has an empty ISIN or scrip code.
      Errors from date parsing, HTTP requests and file I/O propagate unchanged.
    """
    # Parse dates
    start_date = dtparse(start_str).date()
    end_date = dtparse(end_str).date() if end_str else datetime.now().date()
    if start_date > end_date:
        raise ValueError("start date must be <= end date")

    # Parse the mapping string
    mapping = {}
    for pair in map_str.split(","):
        isin, sep, scrip = pair.partition("=")
        isin, scrip = isin.strip(), scrip.strip()
        # Reject entries without "=" as well as ones with a blank side:
        # a blank ISIN would create ".json" and a blank scrip an empty query.
        if not sep or not isin or not scrip:
            raise ValueError(f"Invalid mapping entry: '{pair}'")
        mapping[isin] = scrip

    # Prepare output directory (parents=True allows nested paths)
    output_dir = Path(output_dir_name)
    output_dir.mkdir(parents=True, exist_ok=True)
    logs = []

    # The context manager closes the session's pooled connections even when
    # a request or a file write raises.
    with requests.Session() as session:
        fetched_before = False
        # Fetch and dump data in year-long chunks per ISIN
        for isin, scrip in mapping.items():
            outfile = output_dir / f"{isin}.json"
            # Overwrite old file if exists
            if outfile.exists():
                outfile.unlink()
            for chunk_start, chunk_end in year_chunks(start_date, end_date):
                # Pause between consecutive downloads only, so no time is
                # wasted sleeping after the very last chunk.
                if fetched_before:
                    time.sleep(random.uniform(1, 14))
                payloads = fetch_chunk(session, scrip, chunk_start, chunk_end)
                fetched_before = True
                dump_raw_payloads(payloads, outfile)
                entry = (
                    f"{isin} ({scrip}) {chunk_start:%Y-%m-%d}–{chunk_end:%Y-%m-%d} "
                    f"→ {outfile.name}: {len(payloads)} pages"
                )
                print(entry)
                logs.append(entry)

    print(f"Finished; raw dumps are in ./{output_dir_name}/")
    return logs

In [8]:
# Comma-separated "ISIN=scripCode" pairs to download.
map_str = "INE002A01018=500325,INE062A01020=500112"
# Inclusive date range, YYYY-MM-DD.
start_str = "2014-01-01"
end_str = "2025-01-01"  # or None to default to today


In [9]:

# Downloads over the network and rewrites the per-ISIN files in ./output/.
log = run(map_str=map_str, start_str=start_str, end_str=end_str)


INE002A01018 (500325) 2014-01-01–2014-12-31 → INE002A01018.json: 8 pages
INE002A01018 (500325) 2015-01-01–2015-12-31 → INE002A01018.json: 7 pages
INE002A01018 (500325) 2016-01-01–2016-12-30 → INE002A01018.json: 2 pages
INE002A01018 (500325) 2016-12-31–2017-12-30 → INE002A01018.json: 3 pages
INE002A01018 (500325) 2017-12-31–2018-12-30 → INE002A01018.json: 4 pages
INE002A01018 (500325) 2018-12-31–2019-12-30 → INE002A01018.json: 6 pages
INE002A01018 (500325) 2019-12-31–2020-12-29 → INE002A01018.json: 6 pages
INE002A01018 (500325) 2020-12-30–2021-12-29 → INE002A01018.json: 9 pages
INE002A01018 (500325) 2021-12-30–2022-12-29 → INE002A01018.json: 9 pages
INE002A01018 (500325) 2022-12-30–2023-12-29 → INE002A01018.json: 9 pages
INE002A01018 (500325) 2023-12-30–2024-12-28 → INE002A01018.json: 9 pages
INE002A01018 (500325) 2024-12-29–2025-01-01 → INE002A01018.json: 1 pages
INE062A01020 (500112) 2014-01-01–2014-12-31 → INE062A01020.json: 2 pages
INE062A01020 (500112) 2015-01-01–2015-12-31 → INE06