In [None]:
from aiohttp import ClientSession
import json, regex

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}
async def __constant_update():
    async with ClientSession(raise_for_status=True,headers=headers) as c:
        async with c.get("https://www.sec.gov/edgar/search/js/edgar_full_text_search.js") as res:
            _script = await res.text()

        with open("constants.py", "w", encoding="utf-8") as f:
            f.write("_FORMS = ")
            json.dump({
                form.pop("form"): form
                for form in eval(regex.search(
                    R"^const forms = (\[\r?\n(?: {4}\{.*?\},*\r?\n)*(?: {4}\{.*?\})\r?\n\])\.sort",
                    _script,
                    regex.MULTILINE
                )[1])
            }, f, indent=4)

            f.write('\n')

            f.write("_LOCATIONS = ")
            json.dump(dict(eval(regex.search(
                R"^const locationsArray = (\[\r?\n(?: {4}\[.*?\],\r?\n)*(?: {4}\[.*?\])\r?\n\]);",
                _script,
                regex.MULTILINE
            )[1])), f, indent=4)

            f.write('\n')
await __constant_update()

In [1]:
import asyncio
import json
import logging
import os
import time
from datetime import date
from itertools import zip_longest
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Tuple, Union

import pandas as pd
import regex
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from faker import Faker
from tenacity import retry, wait_random_exponential
from tqdm import *

from constants import _FORMS, _LOCATIONS

# Matches a display name of the form "<name> (CIK <10 digits>)":
# group 1 is the name, group 2 is the 10-digit CIK.
_DISPLAY_NAME_REGEX = regex.compile(R"(.*) \(CIK (\d{10})\)", regex.V1)
# Matches runs of Unicode control (Cc) and format (Cf) characters.
# NOTE(review): not referenced by any of the visible cells.
_CC_REGEX = regex.compile(R"[\p{Cc}\p{Cf}]+", regex.V1)
# Module-level logger; used to report CIK mismatches while parsing display names.
_LOGGER = logging.getLogger(__name__)

# Retry forever, as before, but back off (randomised exponential, capped at
# 60 s) between attempts instead of immediately re-issuing a failed request.
@retry(wait=wait_random_exponential(multiplier=1, max=60))
async def fetch(fetch_bar,semaphore,client,phrases,cik,end,forms,start='2001-01-01',range = 'custom',category= 'custom',entity=None,):
    """Run one EDGAR full-text search query and return the raw response body.

    Args:
        fetch_bar: progress bar; advanced by one after a successful request.
        semaphore: limits how many requests are in flight at once.
        client: HTTP session exposing ``request(method=..., url=..., params=...)``.
        phrases: search phrases; each is wrapped in double quotes and the
            quoted phrases are joined with spaces to form ``q``.
        cik: value sent as ``ciks``.
        end: value sent as ``enddt``.
        forms: value sent as ``forms``.
        start: value sent as ``startdt``.
        range: value sent as ``dateRange`` (defaults to ``'custom'``).
        category: value sent as ``category`` (defaults to ``'custom'``).
        entity: accepted for interface compatibility; currently not sent.

    Returns:
        The response body as bytes.

    Raises:
        ValueError: on a non-200 status (the retry decorator then retries).
    """
    params = {
        'q': " ".join(f"\"{phrase}\"" for phrase in phrases),
        'startdt': start,
        'enddt': end,
        'ciks': cik,
        # The original sent the key 'dataRange'; 'dateRange' appears to be the
        # name the EDGAR search UI uses -- TODO confirm against the live API.
        'dateRange': range,
        'category': category,
        'forms': forms,
    }
    url = 'https://efts.sec.gov/LATEST/search-index'
    async with semaphore, client.request(method='get', url=url, params=params) as res:
        # Sleeping while holding the semaphore throttles the overall request rate.
        await asyncio.sleep(1)
        if res.status != 200:
            raise ValueError(f"Status Code = {res.status}")
        result = await res.read()
    fetch_bar.update(1)
    return result

def _concat_to_url(cik: str, adsh: str, filename: str) -> str:
    return f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh}/{filename}"

# Only the network round-trip is retried (with randomised exponential back-off);
# a parsing failure must not trigger an endless re-download of the same file.
@retry(wait=wait_random_exponential(multiplier=1, max=60))
async def _fetch_html(semaphore: asyncio.Semaphore, url: str, client) -> bytes:
    """Download ``url`` under ``semaphore`` and return the response body."""
    async with semaphore, client.get(url) as res:
        if not res.ok:
            raise ValueError(f"Status code : {res.status}")
        return await res.read()


async def _download(semaphore: asyncio.Semaphore, row_index, df,client,keywords,download_bar):
    """Download the document for one row and store its matching paragraphs.

    Args:
        semaphore: limits concurrent downloads.
        row_index: index label of the row in ``df`` to process.
        df: frame with a ``"url"`` column; mutated in place -- the i-th matching
            paragraph is written to column ``"paragrah<i>"`` (1-based) of the row.
        client: HTTP session exposing ``get(url)``.
        keywords: substrings passed to ``extract_paragraphs``.
        download_bar: progress bar; advanced by one when the row is done.
    """
    url = df.loc[row_index,"url"]
    html = await _fetch_html(semaphore, url, client)
    # Parse after the connection and semaphore slot have been released so that
    # CPU-bound HTML parsing does not block other downloads from starting.
    paragraphs = extract_paragraphs(html,keywords)
    for num, paragraph in enumerate(paragraphs, start=1):
        df.loc[row_index, f"paragrah{num}"] = str(paragraph)
    download_bar.update(1)

def _parse_display_name(s: str, cik: str):
    if s is not None and (m := _DISPLAY_NAME_REGEX.fullmatch(s)):
        if (scik := m[2]) != cik:
            _LOGGER.warning(f"mismatched CIK: {scik} (parsed from \"{s}\") v.s. {cik}")
        return m[1], scik
    return s, cik

def _parse_hit(hit: Dict[str, Any]) -> pd.DataFrame: 
    """Convert one search hit into a DataFrame of entity rows joined with filing metadata.

    Args:
        hit: mapping with an ``"_id"`` of the form ``"<adsh>:<filename>"`` and a
            ``"_source"`` mapping holding the fields read below.

    Returns:
        The left merge (on ``"id"``) of the per-entity rows with the per-filing
        info frame, with a fresh 0..n-1 index.
    """
    _id = hit["_id"]
    source = hit["_source"]
    adsh, filename = _id.split(':')
    # Raises ValueError if the filename contains no '.'.
    filename_main, filename_ext = filename.rsplit('.', 1)
    xsl = source["xsl"]
    
    # For XML documents with a non-empty xsl value, the file name is prefixed
    # with "<xsl>/" before the URL is built.
    if xsl and filename_ext.lower() == "xml":
        filename_main = f"{xsl}/{filename_main}"
    filename = f"{filename_main}.{filename_ext}"

    file_nums = source["file_num"]
    film_nums = source["film_num"]
    # One row per position across the parallel lists; zip_longest pads shorter
    # lists with None. file_num/film_num may be a list, a scalar, or empty.
    # NOTE(review): `loc.split` raises AttributeError when biz_locations is
    # shorter than the longest list (loc is then None) -- confirm this cannot happen.
    # NOTE(review): the "incorporated" column is filled from biz_states, not
    # inc_states -- confirm that is intended.
    rows = pd.DataFrame((
        [_id, *_parse_display_name(display_name, cik), loc.split(",")[0], _LOCATIONS.get(code), file_num, film_num]
        for display_name, cik, loc, code, file_num, film_num in zip_longest(
            source["display_names"],
            source["ciks"],
            source["biz_locations"],
            source["biz_states"], #source["inc_states"] if source["inc_states"] else
            file_nums if isinstance(file_nums, list) else [file_nums] if file_nums else (),
            film_nums if isinstance(film_nums, list) else [film_nums] if film_nums else ()
        )
    ), columns=["id", "entity_name", "cik", "located", "incorporated", "file_num", "film_num"], copy=False)#, dtype=str
    form = source["form"]
    root_form = source["root_form"]
    # Append the human-readable form title when the root form is a known one.
    form_title = ""
    if root_form in _FORMS:
        form_title = f" ({_FORMS[root_form]['title']})"
    # Fall back from file_type to file_description to the file name.
    file_type = source["file_type"]
    if not file_type:
        file_type = source["file_description"]
    if not file_type:
        file_type = filename
    # The document URL is built from the CIK of the first entity row;
    # this lookup raises KeyError when `rows` is empty.
    ciks = rows.loc[0,"cik"]
    # Scalars are broadcast against the entity_name Series, so `info` has one
    # row per entity, all carrying the same filing-level values.
    info = pd.DataFrame({
        "entity_name":rows['entity_name'],
        "id": _id,
        "form_file": f"{form}{form_title}{'' if form == file_type else f' {file_type}'}",
        "file_date": source["file_date"],
        "period_ending": source.get("period_ending", None),
        "file_ext": filename_ext,
        "url": _concat_to_url(ciks, adsh.replace('-', ''), filename),
        "parser": None#getattr(parsers, f"_parse_{filename_ext.lower()}", None)
    },copy=False,dtype=str)#, dtype=object
    
    # NOTE(review): every row of both frames carries the same `id`, so this
    # merge pairs each row of `rows` with each row of `info`, and the shared
    # "entity_name" column is presumably emitted with pandas' default
    # _x/_y suffixes -- verify this is the intended shape.
    result = pd.merge(rows,info,how="left",on="id")
    #result.drop_duplicates(inplace=True)
    #del result["id"]
    return result.reset_index(drop=True)


def extract_paragraphs(html, keywords):
    """Return the distinct ``<p>`` texts of ``html`` that contain any keyword.

    Args:
        html: markup passed to ``BeautifulSoup(html, 'html.parser')``.
        keywords: iterable of substrings; a paragraph matches when at least one
            of them occurs in its text (case-sensitive ``in`` test).

    Returns:
        A list of unique paragraph texts in document order. Order is kept
        stable so that the position of a paragraph (and therefore the column it
        is written to by the caller) does not change between runs.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # A dict preserves insertion order, giving an order-preserving de-duplication.
    unique_matches = {}
    for paragraph in soup.find_all(['p']):
        text = paragraph.get_text()  # extract once; reused for the test and the result
        if any(keyword in text for keyword in keywords):
            unique_matches.setdefault(text, None)
    return list(unique_matches)

def CIK(file):
    """Read CIK numbers from a text file, one per line, zero-padded to 10 digits.

    Args:
        file: path of a UTF-8 text file.

    Returns:
        A list of 10-character, zero-padded CIK strings, in file order.
        Blank (or whitespace-only) lines are skipped.

    Raises:
        ValueError: if the file cannot be opened or read, or if a non-blank
            line is not an integer.
    """
    # open() itself must be inside the try block, otherwise a missing or
    # unreadable file escapes the IOError handler below.
    try:
        with open(file, "r", encoding="UTF-8") as f:
            lines = f.read().splitlines()
    except IOError as e:
        raise ValueError(f"{file} is not a valid file") from e
    return [f"{int(line):010}" for line in lines if line.strip()]
    # if _ciks:
    #     for i in range(0, len(_ciks), ciks_per_query):
    #         yield _ciks[i:i + ciks_per_query]
def decode(byte):
    """Decode a UTF-8 JSON search response and return its list of hits.

    Args:
        byte: response body as bytes.

    Returns:
        The value stored under ``["hits"]["hits"]`` of the decoded payload.
    """
    # Parse the payload once; the hit count is not needed by the caller.
    payload = json.loads(byte.decode('utf-8'))
    return payload["hits"]["hits"]

async def main(_PHRASES,_FILING_TYPES,_DATE_START,_DATE_END,_CIKS_PER_QUERY, _CIKS,_buffer_chunk_size,df):
    """Query EDGAR full-text search for every (form, CIK) pair and export the hits.

    Args:
        _PHRASES: search phrases forwarded to ``fetch``.
        _FILING_TYPES: form types; one request is issued per form per CIK.
        _DATE_START: forwarded to ``fetch`` as ``start``.
        _DATE_END: forwarded to ``fetch`` as ``end``.
        _CIKS_PER_QUERY: accepted for interface compatibility; currently unused.
        _CIKS: CIKs to query.
        _buffer_chunk_size: accepted for interface compatibility; currently unused.
        df: existing frame that newly parsed hits are appended to (not mutated).

    Returns:
        The combined frame, de-duplicated on ``"id"``, with the ``"parser"``
        and ``"id"`` columns removed. It is also written to
        ``<today>.xlsx`` in the current working directory.
    """
    semaphore = asyncio.Semaphore(10)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}

    async with ClientSession(raise_for_status=True, headers=headers) as client:
        total = len(_CIKS) * len(_FILING_TYPES)
        with tqdm(total=total) as fetch_bar:
            print("Starting fetch...")
            fetch_tasks = [
                asyncio.create_task(fetch(
                    semaphore=semaphore,
                    client=client,
                    phrases=_PHRASES,
                    cik=cik,
                    start=_DATE_START,
                    end=_DATE_END,
                    forms=form,
                    fetch_bar=fetch_bar
                ))
                for form in _FILING_TYPES for cik in _CIKS
            ]
            fetched_data = await asyncio.gather(*fetch_tasks)

    parsed_hits = [_parse_hit(hit) for data in fetched_data for hit in decode(data)]
    df = pd.concat([df] + parsed_hits, ignore_index=True)
    # With no hits (and an empty input frame) there is no "id" column to
    # de-duplicate on; skip instead of raising KeyError.
    if "id" in df.columns:
        df = df.drop_duplicates(subset="id").reset_index(drop=True)

    # NOTE: the per-document download step is currently disabled; only the
    # search results are exported.
    print(f"fetch completed and collected [{df.shape[0]}] of docs,starting download docs..")

    # errors="ignore" keeps the export working when the frame has no hits.
    df = df.drop(columns=["parser", "id"], errors="ignore")
    out_path = Path.cwd() / f"{date.today()}.xlsx"
    df.to_excel(out_path)
    # Build the path with pathlib rather than the invalid "\{" escape with a
    # hard-coded Windows separator.
    print(f"Data have been export at {out_path}")
    return df


In [2]:
import time
import warnings
warnings.filterwarnings("ignore")
T0 = time.time()

_PHRASES = ["data ","cyber"]

_FILING_TYPES = ["10-K","10-Q"]#,

_DATE_START = "2001-01-01"
_DATE_END = "2023-12-12"
_CIKS_PER_QUERY = 10
_buffer_chunk_size = 20
#_CIKS =  ["0001653482"] #Input from a list or a path,,"0001653481"
_CIKS = CIK(Path("sample_input_file.txt"))
df = pd.DataFrame()
df = await main(
    _PHRASES,
    _FILING_TYPES,
    _DATE_START,
    _DATE_END,
    _CIKS_PER_QUERY,
    _CIKS,
    _buffer_chunk_size,
    df
)
 
END = time.time()
print("--"*10,
      f"All tasks completed! Time Cost:{round((END-T0)/60,1)} minutes ",sep='\n')#pymupdf

  0%|          | 0/4846 [00:00<?, ?it/s]

Starting fetch...


 74%|███████▍  | 3601/4846 [18:36<05:51,  3.55it/s] 

In [3]:
# Display the result frame; `df` only exists once the crawl cell above has
# run to completion in the current kernel session.
df

NameError: name 'df' is not defined

In [3]:
# Only the network round-trip is retried (with randomised exponential back-off);
# a parsing failure must not trigger an endless re-download of the same file.
@retry(wait=wait_random_exponential(multiplier=1, max=60))
async def _fetch_html(semaphore: asyncio.Semaphore, url: str, client) -> bytes:
    """Download ``url`` under ``semaphore`` and return the response body."""
    async with semaphore, client.get(url) as res:
        if not res.ok:
            raise ValueError(f"Status code : {res.status}")
        return await res.read()


async def _download(semaphore: asyncio.Semaphore, row_index, df,client,keywords):
    """Download the document for one row and store its matching paragraphs.

    Args:
        semaphore: limits concurrent downloads.
        row_index: index label of the row in ``df`` to process.
        df: frame with a ``"url"`` column; mutated in place -- the i-th matching
            paragraph (stripped) is written to column ``"paragrah<i>"`` (1-based).
        client: HTTP session exposing ``get(url)``.
        keywords: substrings passed to ``extract_paragraphs``.

    Raises:
        TypeError: if paragraph extraction or storage fails; the underlying
            exception is attached as ``__cause__``.
    """
    url = df.loc[row_index,"url"]
    print(url)
    html = await _fetch_html(semaphore, url, client)
    print("ok")
    try:
        paragraphs = extract_paragraphs(html,keywords)
        for num, paragraph in enumerate(paragraphs, start=1):
            df.loc[row_index, f"paragrah{num}"] = str(paragraph.strip())
    except Exception as exc:
        # Keep the TypeError contract but chain the real cause instead of
        # discarding it behind a bare `except:`.
        raise TypeError(f"failed to extract paragraphs from {url}") from exc

def extract_paragraphs(html, keywords):
    """Return the text of every ``<p>`` element that contains any keyword.

    Paragraphs are returned in document order; duplicates are kept.
    """
    soup = BeautifulSoup(html, 'html.parser')
    matching_paragraphs = []
    for paragraph in soup.find_all(['p']):
        text = paragraph.get_text()
        for keyword in keywords:
            if keyword in text:
                matching_paragraphs.append(text)
                break
    return matching_paragraphs
# Scratch run: exercise `_download` against a single filing.
semaphore = asyncio.Semaphore(10)
# NOTE(review): `url` is not used below; the request uses the URL stored in `df`.
url = "https://www.sec.gov/Archives/edgar/data/0000724571/000161577416004959/s102961_10k.htm"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}
# One-row frame whose "url" cell is read by `_download`; matching paragraphs
# are written back into this row.
df = pd.DataFrame()
df.loc[0,'url'] = "https://www.sec.gov/Archives/edgar/data/798528/000119312517077690/d310159d10k.htm"
async with  ClientSession(raise_for_status=True, headers=headers) as client:
   # `_download` as defined in this cell has no return statement, so `html` is bound to None.
   html = await _download(semaphore=semaphore,client=client,row_index=0,df=df,keywords=["data breach","cyber security"])

  df.loc[0,'url'] = "https://www.sec.gov/Archives/edgar/data/798528/000119312517077690/d310159d10k.htm"


https://www.sec.gov/Archives/edgar/data/798528/000119312517077690/d310159d10k.htm
ok




 TABLE OF CONTENTS 
 










 
 
 
  
Page
 


 
PART I
  




Item 1.
 
 Business
  
 
1
 

Item 1A.
 
 Risk Factors
  
 
5
 

Item 1B.
 
 Unresolved Staff Comments
  
 
10
 

Item 2.
 
 Properties
  
 
10
 

Item 3.
 
 Legal Proceedings
  
 
10
 

Item 4.
 
 Mine Safety Disclosures
  
 
10
 






 
PART II
  




Item 5.
 
 Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer
Purchases of Equity Securities
  
 
11
 

Item 6.
 
 Selected Financial Data
  
 
11
 

Item 7.
 
 Management’s Discussion and Analysis of Financial Condition and Results of
 Operations
  
 
12
 

Item 7A.
 
 Quantitative and Qualitative Disclosures About Market Risk
  
 
26
 

Item 8.
 
 Financial Statements and Supplementary Data
  
 
26
 

Item 9.
 
 Changes in and Disagreements With Accountants on Accounting and Financial Disclosure

  
 
26
 

Item 9A.
 
 Controls and Procedures
  
 
26
 

Item 9B.
 
 Other Information
  
 
26
 






 
PART III
  




Item 10.
 
 Directors, E

In [4]:
# Show the one-row frame filled in by the `_download` scratch run above.
df

Unnamed: 0,url,paragrah1,paragrah2,paragrah3,paragrah4,paragrah5,paragrah6,paragrah7,paragrah8,paragrah9,paragrah10,paragrah11,paragrah12,paragrah13
0,https://www.sec.gov/Archives/edgar/data/798528...,Table of Contents\n\n\n TABLE OF CONTENTS \n \...,Table of Contents\nAs used in this Annual Repo...,Table of Contents\nOur capabilities allow us t...,Table of Contents\n\nidentity of the ship may ...,"Table of Contents\n\nMonaco Financial, LLC, a ...",Table of Contents\nCost of Environmental Compl...,Table of Contents\nOur business involves a hig...,Table of Contents\n\nvaluable cargo located an...,Table of Contents\nWe depend on key employees ...,Table of Contents\n\nto profit from a decline ...,Table of Contents\nOur insurance coverage may ...,We may be exposed to cyber security risks.,We depend on information technology networks a...


In [7]:
# Shared inputs for the single-document download below.
semaphore = asyncio.Semaphore(10)
# Filing document fetched by this cell.
url = "https://www.sec.gov/Archives/edgar/data/798528/000119312517077690/d310159d10k.htm"
# Browser-like User-Agent sent with every request of the session.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}
async def _download(semaphore: asyncio.Semaphore, url: str, client):
    t0 = time.time()
    async with semaphore, client.get(url) as res:
        #await asyncio.sleep(1)
        if res.ok:
            content = await res.read()
            return content#content.decode('utf-8')
        else:
            raise ValueError(f"Status code : {res.status}")
def extract_paragraphs(html, keywords):
    """Collect the text of each ``<p>`` element mentioning at least one keyword.

    The result follows document order and may contain repeated texts.
    """
    soup = BeautifulSoup(html, 'html.parser')

    def mentions_keyword(text):
        # True as soon as one keyword occurs in the paragraph text.
        return any(keyword in text for keyword in keywords)

    paragraph_texts = (paragraph.get_text() for paragraph in soup.find_all('p'))
    return [text for text in paragraph_texts if mentions_keyword(text)]
# Download the document once; `html` receives the raw response bytes returned
# by `_download` as defined in this cell.
async with ClientSession(raise_for_status = True, headers= headers) as c:
    html = await _download(semaphore,url,c)

# Paragraph extraction is left disabled here.
#para = extract_paragraphs(html,["data breach","cyber security"])
#

In [10]:
# Parse the downloaded document and keep the text of every <p> element that
# mentions one of the two phrases (document order, duplicates kept).
soup = BeautifulSoup(html, 'html.parser')
matching_paragraphs = list(filter(
    lambda text: any(keyword in text for keyword in ["data breach","cyber security"]),
    (paragraph.get_text() for paragraph in soup.find_all('p')),
))
matching_paragraphs

['\n\nTable of Contents\n\n\n TABLE OF CONTENTS \n\xa0\n\n\n\n\n\n\n\n\n\n\n\xa0\n\xa0\n\xa0\n\xa0\xa0\nPage\n\xa0\n\n\n\xa0\nPART I\n\xa0\xa0\n\n\n\n\nItem\xa01.\n\xa0\n Business\n\xa0\xa0\n\xa0\n1\n\xa0\n\nItem\xa01A.\n\xa0\n Risk Factors\n\xa0\xa0\n\xa0\n5\n\xa0\n\nItem 1B.\n\xa0\n Unresolved Staff Comments\n\xa0\xa0\n\xa0\n10\n\xa0\n\nItem 2.\n\xa0\n Properties\n\xa0\xa0\n\xa0\n10\n\xa0\n\nItem 3.\n\xa0\n Legal Proceedings\n\xa0\xa0\n\xa0\n10\n\xa0\n\nItem 4.\n\xa0\n Mine Safety Disclosures\n\xa0\xa0\n\xa0\n10\n\xa0\n\n\n\n\n\n\n\xa0\nPART II\n\xa0\xa0\n\n\n\n\nItem 5.\n\xa0\n Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer\nPurchases of Equity Securities\n\xa0\xa0\n\xa0\n11\n\xa0\n\nItem 6.\n\xa0\n Selected Financial Data\n\xa0\xa0\n\xa0\n11\n\xa0\n\nItem 7.\n\xa0\n Management’s Discussion and Analysis of Financial Condition and Results of\n Operations\n\xa0\xa0\n\xa0\n12\n\xa0\n\nItem 7A.\n\xa0\n Quantitative and Qualitative Disclosures About Market 

In [1]:
# Inspect the raw downloaded document; `html` must have been assigned by an
# earlier cell in the current kernel session, otherwise this raises NameError.
html

NameError: name 'html' is not defined