In [None]:
%conda install --yes -c defaults -c conda-forge --update-all python=3.10.13 tqdm BeautifulSoup aiohttp aiodns bs4 cchardet Faker lxml openpyxl pandas PyPDF2 python-dateutil regex tenacity
%conda info

# SEC Constants update

In [2]:
from aiohttp import ClientSession
import json, regex

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}
async def __constant_update():
    """Regenerate ``constants.py`` from the EDGAR full-text-search script.

    Downloads ``edgar_full_text_search.js``, pulls the ``forms`` and
    ``locationsArray`` literals out of it with regular expressions, and
    writes them to ``constants.py`` as ``_FORMS`` (keyed by each entry's
    ``"form"`` value) and ``_LOCATIONS`` (a dict built from the pairs).

    The script is parsed completely *before* ``constants.py`` is opened, so
    a download or parsing failure no longer leaves a truncated, unimportable
    module behind.

    Raises:
        ValueError: if either literal cannot be located in the script.
    """
    async with ClientSession(raise_for_status=True,headers=headers) as c:
        async with c.get("https://www.sec.gov/edgar/search/js/edgar_full_text_search.js") as res:
            _script = await res.text()

    def _extract(pattern, what):
        # Locate one array literal in the downloaded script and evaluate it.
        match = regex.search(pattern, _script, regex.MULTILINE)
        if match is None:
            raise ValueError(f"could not locate {what} in edgar_full_text_search.js")
        # SECURITY NOTE(review): eval() executes text fetched from the network.
        # The pattern only admits a bracketed literal, but consider
        # ast.literal_eval once the literal's exact shape has been confirmed.
        return eval(match[1])

    forms = {
        form.pop("form"): form
        for form in _extract(
            R"^const forms = (\[\r?\n(?: {4}\{.*?\},*\r?\n)*(?: {4}\{.*?\})\r?\n\])\.sort",
            "the forms array",
        )
    }
    locations = dict(_extract(
        R"^const locationsArray = (\[\r?\n(?: {4}\[.*?\],\r?\n)*(?: {4}\[.*?\])\r?\n\]);",
        "the locations array",
    ))

    # Only touch the file once both values are known to be good.
    with open("constants.py", "w", encoding="utf-8") as f:
        f.write("_FORMS = ")
        json.dump(forms, f, indent=4)

        f.write('\n')

        f.write("_LOCATIONS = ")
        json.dump(locations, f, indent=4)

        f.write('\n')
await __constant_update()

In [5]:
import asyncio
import json
import logging
import os
import time
from datetime import date
from itertools import zip_longest
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Tuple, Union

import pandas as pd
import regex
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from faker import Faker
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm import *

from constants import _FORMS, _LOCATIONS

_DISPLAY_NAME_REGEX = regex.compile(R"(.*) \(CIK (\d{10})\)", regex.V1)
_CC_REGEX = regex.compile(R"[\p{Cc}\p{Cf}]+", regex.V1)
_LOGGER = logging.getLogger(__name__)

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=30), reraise=True)
async def fetch(semaphore,client,phrases,cik,end,forms,start='2001-01-01',range = 'custom',category= 'custom',entity=None):
    """Query the EDGAR full-text search index for one CIK / form combination.

    Args:
        semaphore: async context manager limiting concurrent requests.
        client: session object exposing ``request(method=..., url=..., params=...)``.
        phrases: iterable of phrases; each one is wrapped in double quotes and
            the results are joined with spaces to form the ``q`` parameter.
        cik: value sent as the ``ciks`` parameter.
        end: value sent as ``enddt``.
        forms: value sent as ``forms``.
        start: value sent as ``startdt``.
        range: value sent as ``dateRange`` (name kept for interface
            compatibility even though it shadows the builtin).
        category: value sent as ``category``.
        entity: accepted for interface compatibility; currently unused.

    Returns:
        The raw response body (bytes) when the status is 200.

    Raises:
        ValueError: when the response status is not 200. Failures are retried
            up to five times with exponential back-off, after which the last
            exception is re-raised instead of retrying forever.
    """
    q = " ".join(f"\"{phrase}\"" for phrase in phrases)
    data = {'q':q,
            'startdt':start,
            'enddt':end,
            'ciks':cik,
            # Was misspelled 'dataRange'; the search UI's query key is
            # 'dateRange' (NOTE(review): confirm against a live request).
            'dateRange':range,
            'category':category,
            'forms':forms}
    url = 'https://efts.sec.gov/LATEST/search-index'
    async with semaphore,client.request(method='get',url=url,params = data) as res:
        if res.status == 200:
            return await res.read()
        raise ValueError(f"Status Code = {res.status}")

def _concat_to_url(cik: str, adsh: str, filename: str) -> str:
    return f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh}/{filename}"

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=30), reraise=True)
async def _download(semaphore: asyncio.Semaphore, url: str, client):
    """Download one document and return its body as text.

    Args:
        semaphore: limits how many downloads run concurrently.
        url: address of the document to fetch.
        client: session object exposing ``get(url)``.

    Returns:
        The decoded response text when ``res.ok`` is true.

    Raises:
        ValueError: when the response is not OK. Failures are retried up to
            five times with exponential back-off (the wait happens after the
            semaphore has been released), then the last exception is
            re-raised instead of retrying forever.
    """
    async with semaphore, client.get(url) as res:
        if res.ok:
            return await res.text()
        raise ValueError(f"Status code : {res.status}")

def _parse_display_name(s: str, cik: str):
    """Split a display name of the form ``"<name> (CIK <10 digits>)"``.

    Returns ``(name, parsed_cik)`` when ``s`` fully matches
    ``_DISPLAY_NAME_REGEX``; a warning is logged if the parsed CIK differs
    from ``cik``. When ``s`` is ``None`` or does not match, ``(s, cik)`` is
    returned unchanged.
    """
    if s is None:
        return s, cik

    match = _DISPLAY_NAME_REGEX.fullmatch(s)
    if not match:
        return s, cik

    scik = match[2]
    if scik != cik:
        _LOGGER.warning(f"mismatched CIK: {scik} (parsed from \"{s}\") v.s. {cik}")
    return match[1], scik

def _parse_hit(hit: Dict[str, Any]):
    """Convert one search hit into a DataFrame of entity rows plus filing info.

    Args:
        hit: mapping with an ``"_id"`` of the form ``"<adsh>:<filename>"`` and
            a ``"_source"`` mapping providing ``xsl``, ``file_num``,
            ``film_num``, ``display_names``, ``ciks``, ``biz_locations``,
            ``biz_states``, ``form``, ``root_form``, ``file_type``,
            ``file_description``, ``file_date`` and optionally
            ``period_ending``.

    Returns:
        The left merge (on ``"id"``) of the per-entity rows with the
        filing-level columns, with a fresh ``RangeIndex``.
    """
    _id = hit["_id"]
    source = hit["_source"]
    adsh, filename = _id.split(':')
    filename_main, filename_ext = filename.rsplit('.', 1)
    xsl = source["xsl"]

    # XML documents rendered through a stylesheet live under "<xsl>/<name>.xml".
    if xsl and filename_ext.lower() == "xml":
        filename_main = f"{xsl}/{filename_main}"
    filename = f"{filename_main}.{filename_ext}"

    file_nums = source["file_num"]
    film_nums = source["film_num"]
    rows = pd.DataFrame((
        [
            _id,
            *_parse_display_name(display_name, cik),
            # zip_longest pads the shorter lists with None, so a missing
            # location must not be split.
            loc.split(",")[0] if loc else loc,
            _LOCATIONS.get(code),
            file_num,
            film_num,
        ]
        for display_name, cik, loc, code, file_num, film_num in zip_longest(
            source["display_names"],
            source["ciks"],
            source["biz_locations"],
            source["biz_states"],
            # file_num / film_num may be a list, a single value, or empty.
            file_nums if isinstance(file_nums, list) else [file_nums] if file_nums else (),
            film_nums if isinstance(film_nums, list) else [film_nums] if film_nums else ()
        )
    ), columns=["id", "entity_name", "cik", "located", "incorporated", "file_num", "film_num"], copy=False)
    form = source["form"]
    root_form = source["root_form"]
    form_title = ""
    if root_form in _FORMS:
        form_title = f" ({_FORMS[root_form]['title']})"
    # Fall back from file_type to file_description to the bare filename.
    file_type = source["file_type"]
    if not file_type:
        file_type = source["file_description"]
    if not file_type:
        file_type = filename
    # The URL is built from the first entity's CIK.
    ciks = rows.loc[0,"cik"]
    info = pd.DataFrame({
        "entity_name":rows['entity_name'],
        "id": _id,
        "form_file": f"{form}{form_title}{'' if form == file_type else f' {file_type}'}",
        "file_date": source["file_date"],
        "period_ending": source.get("period_ending", None),
        "file_ext": filename_ext,
        "url": _concat_to_url(ciks, adsh.replace('-', ''), filename),
        "parser": None
    },copy=False,dtype=str)

    # NOTE(review): both frames have one row per entity sharing the same id and
    # both carry "entity_name", so this merge pairs every row with every info
    # row and yields suffixed entity_name_x / entity_name_y columns. Left
    # unchanged to keep the output schema; confirm whether that is intended.
    result = pd.merge(rows,info,how="left",on="id")
    return result.reset_index(drop=True)


def extract_paragraphs(html, keywords):
    """Return the distinct text blocks of ``html`` that mention any keyword.

    The text of every ``p``, ``div``, ``span`` and ``h1``-``h6`` element is
    checked with a case-sensitive substring test against each keyword.

    Args:
        html: markup to parse with BeautifulSoup's ``html.parser``.
        keywords: iterable of substrings to look for.

    Returns:
        A list of unique matching texts in order of first appearance, so the
        result (and any numbering derived from it) is stable across runs
        rather than following set iteration order.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tags = ['p', 'div', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    seen = {}
    for element in soup.find_all(tags):
        text = element.get_text()  # extracted once per element
        if any(keyword in text for keyword in keywords):
            seen.setdefault(text, None)
    return list(seen)

def CIK(file):
    """Read a text file of CIK numbers, one per line.

    Args:
        file: path of a UTF-8 text file containing one integer per line.
            Blank or whitespace-only lines are ignored.

    Returns:
        A list of CIKs as zero-padded 10-character strings, in file order.

    Raises:
        ValueError: if the file cannot be opened or read, or if a non-blank
            line is not an integer.
    """
    # The open() call itself must be inside the try block, otherwise a missing
    # file never reaches the handler below.
    try:
        with open(file, "r", encoding="UTF-8") as f:
            lines = f.read().splitlines()
    except IOError as e:
        raise ValueError(f"{file} is not a valid file") from e
    return [f"{int(line):010}" for line in lines if line.strip()]
def decode(byte):
    """Return the list of hits contained in a raw search response.

    Args:
        byte: UTF-8 encoded JSON bytes with a ``hits.hits`` array.

    Returns:
        The value stored at ``["hits"]["hits"]``.

    Raises:
        KeyError: if the payload has no ``hits.hits`` entry.
    """
    # Parse the payload once; the hit count was looked up but never used.
    payload = json.loads(byte.decode('utf-8'))
    return payload["hits"]["hits"]

async def main(_PHRASES,_FILING_TYPES,_DATE_START,_DATE_END,_CIKS_PER_QUERY, _CIKS,_buffer_chunk_size,df):
    """Search EDGAR for the phrases, download each hit and export the matches.

    Phase 1 issues one ``fetch`` per (filing type, CIK) pair, processing the
    CIK list in batches of ``_CIKS_PER_QUERY``, and appends the parsed hits to
    ``df``. Phase 2 downloads the documents in batches of
    ``_buffer_chunk_size`` and stores the matching paragraphs of *each*
    document in ``paragrah<N>`` columns of that document's own row.

    Args:
        _PHRASES: phrases to search for and to look for in the documents.
        _FILING_TYPES: form types; each is queried separately.
        _DATE_START, _DATE_END: date bounds passed to ``fetch``.
        _CIKS_PER_QUERY: number of CIKs handled per fetch batch.
        _CIKS: list of CIK strings.
        _buffer_chunk_size: number of documents downloaded per batch.
        df: DataFrame the parsed hits are appended to.

    Returns:
        The merged DataFrame that was written to ``<today>.xlsx``, or the
        accumulated frame unchanged (nothing exported) when no hits were found.
    """
    semaphore = asyncio.Semaphore(10)
    # NOTE(review): SEC's fair-access guidance asks for a User-Agent that
    # identifies the requester; confirm this browser string is acceptable.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}
    paragraph_records = []
    async with ClientSession(raise_for_status=True, headers=headers) as client :
        # ------ Fetch search hits ------
        total = len(_CIKS)
        with tqdm(unit="B", total=total) as fetch_bar:
            print("Starting fetch...")
            for index in range(0,total,_CIKS_PER_QUERY):
                ciks = _CIKS[index:index+_CIKS_PER_QUERY]
                fetch_tasks = [
                    asyncio.create_task(fetch(
                        semaphore=semaphore,
                        client=client,
                        phrases=_PHRASES,
                        cik=cik,
                        start=_DATE_START,
                        end=_DATE_END,
                        forms=form
                    ))
                    for form in _FILING_TYPES for cik in ciks
                ]
                fetched_data = await asyncio.gather(*fetch_tasks)

                df = pd.concat([df] +
                        [_parse_hit(hit) for data in fetched_data for hit in decode(data)],
                        ignore_index=True)
                await asyncio.sleep(1)
                # Advance by the real batch size so the last partial batch
                # does not overshoot the total.
                fetch_bar.update(len(ciks))

        # No hit at all means there is no "id" column to de-duplicate on.
        if "id" not in df.columns:
            print("fetch completed and collected [0] of docs, nothing to download.")
            return df

        df = df.drop_duplicates(subset="id").reset_index(drop=True)

        # ------ Download documents ------
        print(f"fetch completed and collected [{df.shape[0]}] of docs,starting download docs..")
        total = df.shape[0]
        with tqdm(total=total) as download_bar:
            for index in range(0,total,_buffer_chunk_size):
                index_range = range(index,min(index+_buffer_chunk_size,total))
                download_tasks = [
                    asyncio.create_task(_download(
                        semaphore=semaphore,
                        client=client,
                        url=df.loc[row,'url'])
                    ) for row in index_range
                ]
                downloaded = await asyncio.gather(*download_tasks)

                # gather() preserves task order, so pair each row with its own
                # document instead of re-parsing every document for every row.
                for row, document in zip(index_range, downloaded):
                    record = {"id": df.loc[row,"id"]}
                    for num, paragraph in enumerate(extract_paragraphs(document,_PHRASES), start=1):
                        record[f"paragrah{num}"] = str(paragraph)
                    paragraph_records.append(record)
                await asyncio.sleep(1)
                download_bar.update(len(index_range))

    # Build the paragraph frame once rather than growing it cell by cell.
    if paragraph_records:
        paser_df = pd.DataFrame(paragraph_records)
    else:
        paser_df = pd.DataFrame({"id": df["id"]})

    df = pd.merge(df,paser_df,how='left',on="id")
    df = df.drop(columns=["parser", "id"])
    output_name = f"{str(date.today())}.xlsx"
    df.to_excel(output_name)
    # os.path.join avoids the invalid "\{" escape and the Windows-only separator.
    print(f"Data have been export at {os.path.join(os.getcwd(), output_name)}")
    return df


In [4]:
import time
import warnings
warnings.filterwarnings("ignore")
# Driver cell: configure the search, run main() and report the elapsed time.
T0 = time.time()

# Each phrase is wrapped in double quotes and the results are space-joined into the query.
_PHRASES = ["data breach","cyber security"]

# One search request is issued per (filing type, CIK) pair.
_FILING_TYPES = ["10-K","10-Q"]

# Passed to fetch() as its start / end date bounds.
_DATE_START = "2001-01-01"
_DATE_END = "2023-12-12"
# Number of CIKs whose requests are gathered together in one batch.
_CIKS_PER_QUERY = 10
# Number of documents downloaded together in one batch.
_buffer_chunk_size = 20
#_CIKS =  ["0001653482"] #Input from a list or a path,,"0001653481"
# CIK() reads one CIK per line and zero-pads each to 10 digits.
_CIKS = CIK(Path("sample_input_file.txt"))
# Empty frame that main() appends the parsed hits to.
df = pd.DataFrame()
df = await main(
    _PHRASES,
    _FILING_TYPES,
    _DATE_START,
    _DATE_END,
    _CIKS_PER_QUERY,
    _CIKS,
    _buffer_chunk_size,
    df
)
 
END = time.time()
# Elapsed wall-clock time, reported in minutes.
print("--"*10,
      f"All tasks completed! Time Cost:{round((END-T0)/60,1)} minutes ",sep='\n')

  0%|          | 0/2423 [00:00<?, ?B/s]

Starting fetch...


  0%|          | 0/2423 [00:15<?, ?B/s]


CancelledError: 

In [3]:
# Scratch cell: download a single filing to compare the raw and decoded bodies.
url = "https://www.sec.gov/Archives/edgar/data/1653482/000162828022008836/gtlb-20220131.htm"
semaphore = asyncio.Semaphore(10)
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}
# NOTE(review): `text` is initialised here but never reassigned in this cell;
# the decoded body is stored in `txt` below, so `text` stays "".
text = ""
content = ""
#headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"}
async with ClientSession(raise_for_status=True, headers=headers) as client :
    t0 = time.time()
    async with semaphore, client.get(url) as res:
        # This one-second sleep is included in the "cost" printed below.
        await asyncio.sleep(1)
        if res.ok:
            print("ok")
            # read() gives the raw body and text() the decoded string, so the
            # two values are different types and do not compare equal.
            content = await res.read()
            txt = await res.text()
            print(f"cost {round(time.time()-t0,2)} seconds")
            #print(content.decode('utf-8'))
        else:
            raise ValueError(f"Status code : {res.status}")

ok
cost 1.52 seconds


In [4]:
assert text == content

AssertionError: 

In [14]:
content



In [15]:
txt



In [23]:
pap = extract_paragraphs(content,["data breach"])
pap[0]

'CCPA provides for civil penalties for violations, as well as a private right of action for security breaches that may increase the likelihood of, and the risks associated with, security breach litigation. Additionally, in November 2020, California passed the California Privacy Rights Act, or the CPRA, which expands the CCPA significantly, including by expanding consumers’ rights with respect to certain personal information and creating a new state agency to oversee implementation and enforcement efforts, potentially resulting in further uncertainty and requiring us to incur additional costs and expenses in an effort to comply. Many of the CPRA’s provisions will become effective on January 1, 2023. Further, Virginia enacted the Virginia Consumer Data Protection Act, or the CDPA, another comprehensive state privacy law, that will also be effective January 1, 2023. The CCPA, CPRA, and CDPA may increase our compliance costs and potential liability, particularly in the event of a data brea

In [32]:
df.shape

(0, 0)