In [1]:
# !pip install openai beautifulsoup4 pandas requests


In [2]:
import os
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from collections import defaultdict
import html
import unicodedata
import json
from openai import AzureOpenAI
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only: `drive` comes from google.colab).
# DATA_PATH in the config cell points inside this mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ========== CONFIG ==========
# Company whose filings are pulled, and the form types that are kept.
CIK = "0001355790"  # ISCO
FORM_TYPES = ["8-K", "10-K", "10-Q"]

# HTTP settings shared by every request made in this notebook.
BASE_URL = "https://www.sec.gov"
HEADERS = {"User-Agent": "boyangzhu03@gmail.com"}

# Save to Drive (directory is created on first run).
DATA_PATH = "/content/drive/MyDrive/SEC"
os.makedirs(DATA_PATH, exist_ok=True)

# ========== Azure OpenAI Credentials ==========
# NOTE(review): these assignments overwrite whatever is already set in the
# environment. "Your.Key" looks like a placeholder -- substitute a real key
# locally and never commit it.
os.environ.update({
    "AZURE_OPENAI_API_VERSION": "2024-12-01-preview",
    "AZURE_OPENAI_ENDPOINT": "https://extendmeopenaieastus2.openai.azure.com",
    "AZURE_OPENAI_API_KEY": "Your.Key",
})
AZURE_DEPLOYMENT_NAME = "o4-mini"

# Shared client used for every chat-completion call below.
client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
)



In [4]:
def fetch_filing_urls(cik, timeout=10):
    """List the primary-document URLs of a company's recent SEC filings.

    Downloads the company's submissions index from data.sec.gov and keeps the
    entries whose form type is in FORM_TYPES.

    Args:
        cik: Central Index Key as a string or int. It is zero-padded to the
            10 digits the submissions endpoint expects, so "1355790",
            1355790 and "0001355790" are all accepted.
        timeout: Seconds to wait for the index request before giving up.

    Returns:
        A list of (form_type, filing_date, url) tuples in the order the index
        lists them. An empty list is returned (and the error printed) if the
        request or parsing fails for any reason.
    """
    try:
        cik_padded = str(cik).strip().zfill(10)
        index_url = f"https://data.sec.gov/submissions/CIK{cik_padded}.json"
        # Without a timeout a stalled connection would hang the notebook forever.
        response = requests.get(index_url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        recent = response.json()["filings"]["recent"]

        # Archive paths use the CIK without leading zeros.
        cik_number = int(cik_padded)

        urls = []
        # The index stores one parallel array per field; walk them in lockstep.
        for form, accession_number, primary_doc, filing_date in zip(
            recent["form"],
            recent["accessionNumber"],
            recent["primaryDocument"],
            recent["filingDate"],
        ):
            if form not in FORM_TYPES:
                continue
            accession = accession_number.replace("-", "")
            # Some entries have no primary document; there is nothing to fetch for those.
            if accession and primary_doc:
                url = f"{BASE_URL}/Archives/edgar/data/{cik_number}/{accession}/{primary_doc}"
                urls.append((form, filing_date, url))
        return urls
    except Exception as e:
        # Deliberately best-effort: the caller treats "no filings" and "lookup failed" alike.
        print(f"[ERROR] Fetching filings: {e}")
        return []

def clean_text(text):
    """Normalise extracted filing text.

    Steps, in order:
      1. HTML character references (e.g. ``&amp;``) are decoded.
      2. The result is NFKD-normalised, which splits accented letters into a
         base letter plus combining mark and maps compatibility characters
         such as the no-break space to their plain equivalents.
      3. Any characters left in the range U+0080-U+00FF are deleted.

    NOTE(review): step 3 only covers U+0080-U+00FF; characters above that
    range (curly quotes, combining marks, ...) are kept. Confirm that is the
    intended behaviour.
    """
    decoded = html.unescape(text)
    decomposed = unicodedata.normalize("NFKD", decoded)
    latin1_high = re.compile(r"[\x80-\xFF]+")
    return latin1_high.sub("", decomposed)

In [5]:

def _parse_events_reply(reply):
    """Turn a raw model reply into a list of event dicts.

    Tolerates the usual deviations from "JSON only": a Markdown code fence
    around the payload, or prose before/after the array.

    Raises:
        ValueError: if no JSON array can be recovered (json.JSONDecodeError
            is a ValueError subclass, so one ``except`` covers both).
    """
    text = (reply or "").strip()

    # Unwrap ```json ... ``` / ``` ... ``` fences.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1).strip()

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost [...] span when the model added commentary.
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])

    if not isinstance(parsed, list):
        raise ValueError("expected a JSON array of events")

    # Callers use dict access on each event, so drop anything that is not an object.
    return [item for item in parsed if isinstance(item, dict)]


def extract_events_llm(content, max_chars=12000):
    """Ask the Azure OpenAI deployment to list the events described in a filing.

    Args:
        content: Plain text of the filing.
        max_chars: Only the first ``max_chars`` characters are sent to the model.

    Returns:
        A list of dicts parsed from the model's reply (the prompt asks for
        "description" and "subtype" keys). Returns an empty list, after
        printing a message, if the API call fails or the reply cannot be
        parsed as a JSON array.
    """
    try:
        # Truncate content to avoid timeouts; slicing is a no-op for short input.
        content = content[:max_chars]

        prompt = f"""You are an expert in financial and biotech disclosures. Read the following SEC filing content.

              Extract a list of distinct events (clinical, corporate, regulatory, or IP-related) mentioned in the text.
              For each event, return:
              - A short description (1–2 sentences).
              - A subtype: one of "clinical", "corporate", "regulatory", or "ip".

              Format your answer as JSON:
              [
                {{
                  "description": "...",
                  "subtype": "..."
                }},
                ...
              ]

              Filing content:
              \"\"\"
              {content}
              \"\"\"
              """

        response = client.chat.completions.create(
            model=AZURE_DEPLOYMENT_NAME,
            messages=[
                {"role": "system", "content": "You extract structured events from SEC filings."},
                {"role": "user", "content": prompt}
            ],
            max_completion_tokens=1500  # Slightly reduced for speed
        )

        # content may be None (e.g. no text was produced); the parser handles that.
        reply = response.choices[0].message.content

        try:
            return _parse_events_reply(reply)
        except ValueError:
            print("[LLM ERROR] JSON parsing failed. Skipping this result.")
            return []

    except Exception as e:
        # Best-effort: one failed filing must not abort the whole run.
        print(f"[LLM ERROR] Failed to extract events: {e}")
        return []


In [6]:
def main():
    """Download each filing for CIK, extract events with the LLM, and save them to CSV.

    For every (form type, filing date, URL) returned by fetch_filing_urls the
    document is downloaded, reduced to plain text, and passed to
    extract_events_llm. Each extracted event becomes one row; all rows are
    written to ``isco_events_llm_V2.csv`` under DATA_PATH.

    A filing that cannot be downloaded (including HTTP error responses) or
    processed is reported and skipped; it never aborts the run. The CSV always
    contains the header row, even when no events were extracted.
    """
    columns = [
        "timestamp",
        "form_type",
        "event_type",
        "event_subtype",
        "description",
        "source_url",
        "confidence",
    ]

    filing_urls = fetch_filing_urls(CIK)
    print(f"[INFO] Found {len(filing_urls)} filings")

    all_events = []
    for form_type, filing_date, url in filing_urls:
        print(f"[INFO] Parsing {form_type} from {filing_date}")
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            # Do not feed an error page (e.g. a 403 rate-limit notice) to the LLM as if it were a filing.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            content = clean_text(soup.get_text(" ", strip=True))
            events = extract_events_llm(content)
        except Exception as exc:
            print(f"[ERROR] Parsing filing: {exc}")
            continue

        # Ignore malformed results so one bad item cannot discard the rest of the filing.
        if not isinstance(events, list):
            events = []
        records = [
            {
                "timestamp": filing_date,
                "form_type": form_type,
                "event_type": "sec_disclosure",
                "event_subtype": event.get("subtype", "unknown"),
                "description": event.get("description", ""),
                "source_url": url,
                "confidence": 0.9,  # fixed value assigned to every LLM-extracted event
            }
            for event in events
            if isinstance(event, dict)
        ]

        if not records:
            print(f"[INFO] No valid events extracted from {filing_date}, skipping.")
            continue  # Skip to next filing

        all_events.extend(records)

    # Explicit columns keep the CSV schema stable when all_events is empty.
    df = pd.DataFrame(all_events, columns=columns)
    output_path = os.path.join(DATA_PATH, "isco_events_llm_V2.csv")
    df.to_csv(output_path, index=False)
    print(f"\n\u2705 Saved {len(df)} events to {output_path}")


In [7]:
main()


[INFO] Found 225 filings
[INFO] Parsing 10-Q from 2025-05-14
[INFO] No valid events extracted from 2025-05-14, skipping.
[INFO] Parsing 10-K from 2025-03-28
[LLM ERROR] JSON parsing failed. Skipping this result.
[INFO] No valid events extracted from 2025-03-28, skipping.
[INFO] Parsing 10-Q from 2024-11-13
[INFO] No valid events extracted from 2024-11-13, skipping.
[INFO] Parsing 8-K from 2024-09-17
[INFO] Parsing 10-Q from 2024-08-13
[INFO] Parsing 10-Q from 2024-05-14
[INFO] Parsing 10-K from 2024-03-28
[INFO] No valid events extracted from 2024-03-28, skipping.
[INFO] Parsing 10-Q from 2023-11-13
[INFO] Parsing 8-K from 2023-09-18
[INFO] Parsing 10-Q from 2023-08-11
[INFO] No valid events extracted from 2023-08-11, skipping.
[INFO] Parsing 8-K from 2023-06-21
[INFO] Parsing 10-Q from 2023-05-12
[INFO] Parsing 10-K from 2023-03-30
[INFO] No valid events extracted from 2023-03-30, skipping.
[INFO] Parsing 8-K from 2023-03-16
[INFO] Parsing 10-Q from 2022-11-14
[INFO] No valid events e