In [None]:
import requests
import json
import time
import csv
import spacy
from bs4 import BeautifulSoup
import ollama

Extracted: the Securities Exchange Act | AAPL | 2025-02-25 | entity.</p></div
Extracted: N#LH)UECP6F8HM(3PHH(5'=]MI | AAPL | 2025-01-30 | entity.</p></div
Extracted: M!C"R#Y)8/U>@GQY>OC_\[Y6!/X1VA6I[AN | AAPL | 2025-01-03 | entity.</p></div
Extracted: the Securities Exchange Act | AAPL | 2024-10-31 | entity.</p></div
Extracted: OX![9X]NG7BOKFT\'VJ;?D'WNZ | AAPL | 2024-09-10 | entity.</p></div
Extracted: CFR | MSFT | 2025-01-29 | entity.</p></div
Extracted: CFR | MSFT | 2025-01-22 | \XDV6F60>9(9 0
MSP
Extracted: HP%R3?-J+!@BXT;.L\ABOS2 | MSFT | 2024-12-11 | QS\(::^@=W)\T#L[/_MP00[.>E_.>MV+X[/3%6!PJ5%WVI1[*%2QQ9:4
Extracted: Income Tax Liabilities | MSFT | 2024-12-03 | contextRef="C_64a494b0-c801-4baa-908b-757fe16d266e
Extracted: GI | MSFT | 2024-10-30 | entity.</p></div
Extracted: Plan | NVDA | 2025-03-07 | entity.</p></div
Extracted: AVWELO | NVDA | 2025-02-26 | entity.</p></div
Extracted: CFR | NVDA | 2025-01-17 | entity.</p></div
Extracted: Data Center.</font></div><div style="margin

In [None]:
# Load spaCy's small English pipeline once; extract_entities() reads this
# module-level `nlp` object for every chunk it processes.
nlp = spacy.load("en_core_web_sm")
# Raise the pipeline's text-length cap to 1.5M characters per call.
# NOTE(review): extract_entities() already passes at most `chunk_size`
# (default 100000) characters per call, so this cap presumably only matters
# if `nlp` is ever called on a whole filing directly — confirm.
nlp.max_length = 1_500_000  # Avoid the over-length error on very long texts

# Function to get CIK from Ticker
# Ticker -> zero-padded CIK map. Filled on the first successful download and
# reused afterwards, so resolving many tickers costs a single request instead
# of one full download per ticker.
_TICKER_TO_CIK_CACHE = {}


def _load_ticker_to_cik_map():
    """Return the ticker -> 10-digit CIK mapping, downloading it at most once.

    On any failure (network error, non-200 status, unparsable body) an error
    is printed and an empty dict is returned WITHOUT being cached, so a later
    call retries the download.
    """
    if _TICKER_TO_CIK_CACHE:
        return _TICKER_TO_CIK_CACHE

    url = "https://www.sec.gov/files/company_tickers.json"
    headers = {"User-Agent": "Larissa Iacobescu (la641245@ucf.edu)"}

    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Error: Could not fetch CIK ({exc}).")
        return {}

    if response.status_code != 200:
        print(f"Error {response.status_code}: Could not fetch CIK.")
        return {}

    try:
        data = response.json()
    except ValueError as exc:  # body was not valid JSON
        print(f"Error: Could not parse CIK data ({exc}).")
        return {}

    # Keys are upper-cased tickers; values are CIKs left-padded to 10 digits.
    _TICKER_TO_CIK_CACHE.update(
        {str(info["ticker"]).upper(): str(info["cik_str"]).zfill(10) for info in data.values()}
    )
    return _TICKER_TO_CIK_CACHE


def get_cik_from_ticker(ticker):
    """Look up the zero-padded 10-digit CIK for a stock ticker.

    Args:
        ticker: Ticker symbol; matched case-insensitively.

    Returns:
        The CIK as a 10-character string, or None when the ticker is empty,
        unknown, or the SEC ticker list could not be downloaded.
    """
    if not ticker:
        return None

    return _load_ticker_to_cik_map().get(str(ticker).upper())

In [None]:
# Function to get 8-K Filings
def get_8k_filings(cik, count=5):
    """List the most recent 8-K filings for a company.

    Reads the "recent" filings table from the SEC submissions endpoint and
    keeps rows whose form type is exactly "8-K" (amended forms such as
    "8-K/A" do not match).

    Args:
        cik: CIK string as used in the submissions URL (the rest of this
            notebook passes the zero-padded 10-digit form).
        count: Maximum number of filings to return, in the order the
            endpoint lists them.

    Returns:
        A list of dicts with keys "filing_date", "accession_number" and
        "filing_url". Returns [] (after printing an error) when the request
        fails, the status is not 200, or the payload lacks the expected
        structure.
    """
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    headers = {"User-Agent": "Larissa Iacobescu (la641245@ucf.edu)"}

    try:
        # A timeout prevents one stalled connection from blocking the whole batch.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Error: Could not fetch filings ({exc}).")
        return []
    finally:
        # Throttle after every attempt, successful or not.
        time.sleep(1)

    if response.status_code != 200:
        print(f"Error {response.status_code}: Could not fetch filings.")
        return []

    try:
        data = response.json()
    except ValueError:  # body was not valid JSON
        print("Error: Invalid SEC data format.")
        return []

    filings_section = data.get("filings") if isinstance(data, dict) else None
    recent_filings = filings_section.get("recent") if isinstance(filings_section, dict) else None
    required_columns = ("form", "filingDate", "accessionNumber")
    if not isinstance(recent_filings, dict) or not all(key in recent_filings for key in required_columns):
        print("Error: Invalid SEC data format.")
        return []

    # The table is column-oriented: three parallel lists, one entry per filing.
    rows = zip(
        recent_filings["form"],
        recent_filings["filingDate"],
        recent_filings["accessionNumber"],
    )
    filings = []
    for form, filing_date, accession_number in rows:
        if form != "8-K":
            continue
        # Archive folder names use the accession number with dashes removed.
        folder = accession_number.replace("-", "")
        filings.append(
            {
                "filing_date": filing_date,
                "accession_number": accession_number,
                "filing_url": f"https://www.sec.gov/Archives/edgar/data/{cik}/{folder}/{accession_number}.txt",
            }
        )

    return filings[:count]

In [None]:
# Function to fetch text from 8-K filing
def fetch_filing_text(cik, accession_number):
    """Download the full-text submission file of one filing from EDGAR.

    Args:
        cik: CIK string used as the archive directory name.
        accession_number: Dashed accession number (e.g. "0000320193-25-000001");
            the dashes are removed to form the filing's folder name.

    Returns:
        The response body as a string on HTTP 200; otherwise None, after
        printing the status code or the network error.
    """
    url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number.replace('-', '')}/{accession_number}.txt"
    headers = {"User-Agent": "Larissa Iacobescu (la641245@ucf.edu)"}

    try:
        # Submission files can be large, hence the longer timeout; without one
        # a stalled download would block the run indefinitely.
        response = requests.get(url, headers=headers, timeout=60)
    except requests.RequestException as exc:
        # Treat a network failure like any other failed fetch so the caller
        # can skip this filing instead of losing the whole batch.
        print(f"Error fetching filing {accession_number}: {exc}")
        return None
    finally:
        # Throttle after every attempt, successful or not.
        time.sleep(1)

    if response.status_code == 200:
        return response.text

    print(f"Error fetching filing {accession_number}: {response.status_code}")
    return None

In [None]:
# Function to extract entities (Company Name & Product) using NER
def extract_entities(text, chunk_size=100000):
    """Pick one organisation name and one product name from a text via NER.

    The text is processed in consecutive slices of `chunk_size` characters.
    Entities labelled "ORG" and "PRODUCT" are counted across all slices and
    the most frequently mentioned one of each kind is returned; ties go to
    the entity seen first. This makes the result deterministic, unlike
    taking an arbitrary member of a set.

    Args:
        text: Text to analyse. Empty or None yields (None, None).
        chunk_size: Number of characters passed to the pipeline per call.

    Returns:
        A tuple (company_name, product_name); either element is None when no
        entity of that kind was found.
    """
    if not text:
        return (None, None)

    # Plain dicts keep insertion order, which provides the first-seen tie-break.
    mention_counts = {"ORG": {}, "PRODUCT": {}}

    for start in range(0, len(text), chunk_size):
        doc = nlp(text[start:start + chunk_size])

        for ent in doc.ents:
            counts = mention_counts.get(ent.label_)
            if counts is None:
                continue  # entity type we do not track
            # Collapse internal newlines/runs of spaces so the same name is
            # counted once and lands cleanly in a single CSV cell.
            name = " ".join(ent.text.split())
            if name:
                counts[name] = counts.get(name, 0) + 1

    def _most_mentioned(counts):
        # max() returns the first maximal key in iteration (insertion) order.
        return max(counts, key=counts.get) if counts else None

    return (_most_mentioned(mention_counts["ORG"]), _most_mentioned(mention_counts["PRODUCT"]))

In [None]:
# Function to summarize product description using DeepSeek
def generate_summary(text):
    """Ask the local deepseek-r1 model (via Ollama) for a short summary.

    Args:
        text: Text to summarise; it is embedded verbatim in the prompt.

    Returns:
        The model's answer with any leading "<think>...</think>" reasoning
        section removed and surrounding whitespace stripped, or
        "No summary available." when the call fails or yields no text.
        The 180-character limit is only requested in the prompt, not enforced.
    """
    fallback = "No summary available."
    prompt = f"Summarize the following product description in less than 180 characters:\n\n{text}"

    try:
        response = ollama.chat(model="deepseek-r1:1.5b", messages=[{"role": "user", "content": prompt}])
    except Exception as exc:
        # Deliberately broad: summarisation is best-effort, and a missing
        # model or stopped Ollama server should not abort the whole batch.
        # The error is reported rather than silently swallowed.
        print(f"Error generating summary: {exc}")
        return fallback

    if not response:
        return fallback

    content = response['message']['content'] or ""
    # Keep only what follows the last closing think tag; when no tag is
    # present rpartition leaves the text unchanged.
    # NOTE(review): presumably this model emits its reasoning inside
    # <think> tags in `content` — confirm against the Ollama version in use.
    summary = content.rpartition("</think>")[2].strip()
    return summary or fallback

In [None]:
# Function to save extracted data to CSV
def save_to_csv(data, filename="sec_8k_filings2.csv"):
    """Write the extracted rows to a UTF-8 CSV file, replacing any existing file.

    Args:
        data: Iterable of rows; each row is a sequence of five values in the
            order given by the header below.
        filename: Destination path.
    """
    column_titles = ["Company Name", "Stock Name", "Filing Time", "New Product", "Product Description"]

    # newline="" leaves line-ending handling to the csv module.
    with open(filename, mode="w", newline="", encoding="utf-8") as csv_handle:
        row_writer = csv.writer(csv_handle)
        row_writer.writerow(column_titles)
        for record in data:
            row_writer.writerow(record)

# List of tickers to process (94 symbols in total, no duplicates).
# Each symbol is resolved to a CIK through get_cik_from_ticker(); symbols that
# cannot be resolved are skipped by the processing loop.
# NOTE(review): some entries (e.g. BABA, TSM, ASML) are presumably foreign
# issuers that may not file form 8-K at all — confirm whether they belong here.
tickers = ["AAPL", "MSFT", "NVDA", "GOOGL", "AMZN", "TSLA", "META", "NFLX", "AMD", "INTC", 
           "ORCL", "IBM", "CSCO", "PYPL", "ADBE", "CRM", "UBER", "LYFT", "SNAP", "SQ", 
           "SHOP", "TWLO", "RBLX", "BABA", "DIS", "PEP", "KO", "MCD", "SBUX", "NKE",
           "PFE", "JNJ", "MRNA", "LLY", "BMY", "GILD", "REGN", "CVS", "UNH", "VRTX",
           "XOM", "CVX", "BP", "TTE", "COP", "SLB", "HAL", "F", "GM", "TSM", "ASML",
           "QCOM", "AVGO", "TXN", "NXPI", "LRCX", "AMAT", "MU", "STX", "WDC", "V",
           "MA", "AXP", "JPM", "GS", "BAC", "C", "MS", "WFC", "T", "VZ", "TMUS",
           "BA", "LMT", "NOC", "RTX", "GD", "HON", "CAT", "DE", "MMM", "GE",
           "PG", "CL", "KMB", "EL", "WMT", "TGT", "COST", "HD", "LOW", "BBY", "DG", "DLTR"]

# Processing multiple tickers
MAX_TICKERS = 100        # Limit to 100 tickers
FILINGS_PER_TICKER = 5   # Most recent 8-K filings fetched per company


def _filing_to_plain_text(raw_filing):
    """Reduce a raw EDGAR submission file to plain text for NER and summarising.

    The submission file mixes SGML/HTML markup with embedded attachments.
    Feeding that directly to the NER model yields markup fragments and
    encoded binary as "entities", so both are removed here.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    # Drop uuencoded attachments: blocks running from a line like
    # "begin 644 <name>" to a line containing only "end". This is done before
    # HTML parsing because the encoded payload can contain "<" characters.
    without_binary = re.sub(
        r"^begin \d{3} .*?^end\s*$",
        " ",
        raw_filing,
        flags=re.MULTILINE | re.DOTALL,
    )
    # Strip tags and attributes, keeping only text nodes separated by spaces.
    return BeautifulSoup(without_binary, "html.parser").get_text(separator=" ")


structured_data = []

try:
    for ticker in tickers[:MAX_TICKERS]:
        cik = get_cik_from_ticker(ticker)
        if not cik:
            print(f"Skipping {ticker}, CIK not found.")
            continue

        filings = get_8k_filings(cik, count=FILINGS_PER_TICKER)
        for filing in filings:
            raw_filing = fetch_filing_text(cik, filing["accession_number"])
            if not raw_filing:
                continue

            filing_text = _filing_to_plain_text(raw_filing)
            if not filing_text.strip():
                continue  # nothing readable left after removing markup

            company, product = extract_entities(filing_text)
            # Only filings that mention both an organisation and a product
            # are summarised and recorded.
            if company and product:
                summary = generate_summary(filing_text)
                structured_data.append([company, ticker, filing["filing_date"], product, summary])
                print(f"Extracted: {company} | {ticker} | {filing['filing_date']} | {product}")
finally:
    # Save to CSV
    # Runs even when the loop is interrupted or raises, so a long run's
    # partial results are not lost; any exception still propagates afterwards.
    save_to_csv(structured_data)
    print("✅ CSV file saved successfully!")