Configuration i

---
Install the required packages with pip rather than conda.

In [None]:
%pip install -r requirements.txt

Configuration ii

---
The following section only needs to be run the first time.

In [None]:
from aiohttp import ClientSession
import json, regex

# Browser-like User-Agent sent with the request made in __constant_update().
# NOTE(review): presumably required because sec.gov rejects requests without an
# acceptable User-Agent — confirm against the SEC's current access policy.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}
async def __constant_update():
    """Regenerate ``constants.py`` from EDGAR's full-text-search JavaScript.

    Downloads ``edgar_full_text_search.js``, extracts the ``forms`` and
    ``locationsArray`` array literals with regular expressions, and writes them
    to ``constants.py`` in the current working directory as ``_FORMS`` (a dict
    keyed by each entry's ``"form"`` value) and ``_LOCATIONS`` (``dict()`` of
    the evaluated array).

    Raises:
        aiohttp's response error for a non-2xx reply (the session is created
        with ``raise_for_status=True``).
        TypeError: if either pattern does not match, because ``regex.search``
        then returns ``None`` and the result is subscripted with ``[1]``.
    """
    async with ClientSession(raise_for_status=True,headers=headers) as c:
        async with c.get("https://www.sec.gov/edgar/search/js/edgar_full_text_search.js") as res:
            _script = await res.text()

        # Mode "w" truncates: any existing constants.py is overwritten.
        with open("constants.py", "w", encoding="utf-8") as f:
            f.write("_FORMS = ")
            # SECURITY NOTE(review): eval() executes text downloaded from the
            # network as Python. The regex only constrains the outer layout of
            # the array, not what each entry contains. Consider a literal-only
            # parser once the upstream format has been confirmed.
            json.dump({
                # pop() removes the "form" key from the entry, so the value
                # stored under that key no longer repeats it.
                form.pop("form"): form
                for form in eval(regex.search(
                    R"^const forms = (\[\r?\n(?: {4}\{.*?\},*\r?\n)*(?: {4}\{.*?\})\r?\n\])\.sort",
                    _script,
                    regex.MULTILINE
                )[1])
            }, f, indent=4)

            f.write('\n')

            f.write("_LOCATIONS = ")
            # SECURITY NOTE(review): same eval() caveat as for the forms array.
            json.dump(dict(eval(regex.search(
                R"^const locationsArray = (\[\r?\n(?: {4}\[.*?\],\r?\n)*(?: {4}\[.*?\])\r?\n\]);",
                _script,
                regex.MULTILINE
            )[1])), f, indent=4)

            f.write('\n')
await __constant_update()

In [3]:
import asyncio
import json
import logging
from datetime import date
from itertools import zip_longest
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Tuple, Union
from bs4 import BeautifulSoup
import pandas as pd
from aiohttp import ClientSession
from faker import Faker
import regex
from tenacity import retry
import time
from tqdm import *
from constants import _FORMS, _LOCATIONS
import os
import io
import fitz
# Matches a display name of the form "<name> (CIK <10 digits>)";
# group 1 captures the name and group 2 the ten-digit CIK.
_DISPLAY_NAME_REGEX = regex.compile(R"(.*) \(CIK (\d{10})\)", regex.V1)
# Matches runs of Unicode control (Cc) and format (Cf) characters.
# NOTE(review): not referenced by any code visible in this cell — confirm it is still needed.
_CC_REGEX = regex.compile(R"[\p{Cc}\p{Cf}]+", regex.V1)
# Logger used for the warnings emitted while parsing hits and downloading documents.
_LOGGER = logging.getLogger(__name__)

@retry
async def fetch(fetch_bar,semaphore,client,phrases,cik,end,forms,start='2001-01-01',range = 'custom',category= 'custom',entity=None,):
    """Run one EDGAR full-text search and return the raw response body.

    Args:
        fetch_bar: Progress bar; advanced by one after a successful request.
        semaphore: Limits how many requests are in flight at once.
        client: aiohttp session (or compatible object) used for the request.
        phrases: Search phrases; each is wrapped in double quotes and the
            quoted phrases are joined with spaces.
        cik: Value sent as the ``ciks`` query parameter.
        end: Value sent as ``enddt``.
        forms: Value sent as ``forms``.
        start: Value sent as ``startdt``.
        range: Value sent as ``dateRange``. Previously accepted but ignored
            (``'custom'`` was hard-coded under the misspelled key
            ``dataRange``); the default keeps the request value unchanged.
        category: Value sent as ``category``. Previously accepted but ignored.
        entity: Unused; kept for interface compatibility.

    Returns:
        bytes: The undecoded response body.

    Raises:
        ValueError: If the response status is not 200.

    Note:
        The bare ``@retry`` re-invokes this coroutine on any exception, using
        tenacity's defaults (no stop condition, no wait between attempts).
    """
    query = " ".join(f"\"{phrase}\"" for phrase in phrases)
    params = {
        'q': query,
        'startdt': start,
        'enddt': end,
        'ciks': cik,
        # EDGAR's search front end sends this key as "dateRange".
        'dateRange': range,
        'category': category,
        'forms': forms,
    }
    url = 'https://efts.sec.gov/LATEST/search-index'
    async with semaphore, client.request(method='get', url=url, params=params) as res:
        # Hold the semaphore slot for a second to throttle the request rate.
        await asyncio.sleep(1)
        if res.status != 200:
            raise ValueError(f"Status Code = {res.status}")
        result = await res.read()
        fetch_bar.update(1)
        return result

def _concat_to_url(cik: str, adsh: str, filename: str) -> str:
    return f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh}/{filename}"

@retry
async def _download(semaphore: asyncio.Semaphore, row_index, df,client,keywords,download_bar):
    """Download one document and store its keyword-matching paragraphs in ``df``.

    The URL and extension are read from ``df.loc[row_index, "url"]`` and
    ``df.loc[row_index, "file_ext"]``. Matching paragraphs are written to the
    columns ``paragrah1``, ``paragrah2``, ... of the same row. (The spelling is
    kept because it is part of the exported column names.)

    Args:
        semaphore: Limits how many downloads are in flight at once.
        row_index: Index label of the row in ``df`` to process.
        df: DataFrame that is read from and updated in place.
        client: aiohttp session (or compatible object) used for the request.
        keywords: Phrases passed to the extractor for substring matching.
        download_bar: Progress bar; advanced by one when the row is finished.

    Returns:
        Whatever ``download_bar.update(1)`` returns.

    Raises:
        The client's HTTP error when the response is not OK. Previously this
        path failed with an unbound ``html`` variable. Because of the bare
        ``@retry`` the call is then re-attempted.

    Note:
        Failures inside the extractor are deterministic for a given document,
        so they are logged and recorded as ``'Parse failed'`` rather than
        raised, which would make ``@retry`` download the same file forever.
    """
    url = df.loc[row_index, "url"]
    ext = df.loc[row_index, "file_ext"]
    async with semaphore, client.get(url) as res:
        # Hold the semaphore slot for a second to throttle the request rate.
        await asyncio.sleep(1)
        res.raise_for_status()
        status, reason = res.status, res.reason
        body = await res.read()

    # Extension matching is case-insensitive and treats "html" like "htm".
    extractors = {
        "htm": extract_html,
        "html": extract_html,
        "pdf": extract_pdf,
        "txt": extract_txt,
    }
    extractor = extractors.get(str(ext).lower())
    if extractor is None:
        df.loc[row_index, "paragrah1"] = 'Unknow extension'
        return download_bar.update(1)

    try:
        paragraphs = extractor(body, keywords)
    except Exception:
        _LOGGER.exception(f"{url} could not be parsed and will be skipped")
        df.loc[row_index, "paragrah1"] = 'Parse failed'
        return download_bar.update(1)

    if paragraphs:
        for position, paragraph in enumerate(paragraphs, start=1):
            df.loc[row_index, f"paragrah{position}"] = str(paragraph)
        return download_bar.update(1)

    # Reached when the document was fetched but nothing matched the keywords.
    _LOGGER.warning(f"{url} file will be skipped: ({status}) {reason}")
    df.loc[row_index, "paragrah1"] = 'Download skipped'
    return download_bar.update(1)

def _parse_display_name(s: str, cik: str):
    """Split a display name into ``(name, cik)``.

    When ``s`` fully matches ``_DISPLAY_NAME_REGEX`` the captured name and the
    captured CIK are returned, and a warning is logged if that CIK differs from
    the ``cik`` argument. Otherwise ``s`` and ``cik`` are returned unchanged.
    """
    if s is None:
        return s, cik

    match = _DISPLAY_NAME_REGEX.fullmatch(s)
    if not match:
        return s, cik

    name, scik = match[1], match[2]
    if scik != cik:
        _LOGGER.warning(f"mismatched CIK: {scik} (parsed from \"{s}\") v.s. {cik}")
    return name, scik

def _parse_hit(hit: Dict[str, Any]): 
    """Flatten one search hit into a DataFrame describing the matched file.

    Reads ``hit["_id"]`` and, from ``hit["_source"]``, the fields ``xsl``,
    ``file_num``, ``film_num``, ``display_names``, ``ciks``, ``biz_locations``,
    ``biz_states``, ``form``, ``root_form``, ``file_type``,
    ``file_description``, ``file_date`` and (optionally) ``period_ending``.

    Returns:
        pandas.DataFrame: the left merge, on ``"id"``, of a per-entity frame
        and a per-document frame, with a fresh 0..n-1 index.
    """
    _id = hit["_id"]
    source = hit["_source"]
    # The unpacking below requires exactly one ':' in the id and at least one
    # '.' in the file name; anything else raises ValueError.
    adsh, filename = _id.split(':')
    filename_main, filename_ext = filename.rsplit('.', 1)
    xsl = source["xsl"]
    
    # For XML files with an xsl value, the xsl name becomes a path prefix of
    # the file name used in the URL.
    if xsl and filename_ext.lower() == "xml":
        filename_main = f"{xsl}/{filename_main}"
    filename = f"{filename_main}.{filename_ext}"

    file_nums = source["file_num"]
    film_nums = source["film_num"]
    # One row per position across the zipped lists; zip_longest pads the
    # shorter lists with None.
    # NOTE(review): the "incorporated" column is filled from biz_states via
    # _LOCATIONS (the inc_states lookup is commented out) — confirm intended.
    rows = pd.DataFrame((
        [_id, *_parse_display_name(display_name, cik), str(loc).split(",")[0], _LOCATIONS.get(code), file_num, film_num]
        for display_name, cik, loc, code, file_num, film_num in zip_longest(
            source["display_names"],
            source["ciks"],
            source["biz_locations"],
            source["biz_states"], #source["inc_states"] if source["inc_states"] else 
            # file_num / film_num may be a list, a single value, or falsy;
            # normalise each to an iterable.
            file_nums if isinstance(file_nums, list) else [file_nums] if file_nums else (),
            film_nums if isinstance(film_nums, list) else [film_nums] if film_nums else ()
        ) 
    ), columns=["id", "entity_name", "cik", "located", "incorporated", "file_num", "film_num"], copy=False)#, dtype=str
    form = source["form"]
    root_form = source["root_form"]
    form_title = ""
    if root_form in _FORMS:
        form_title = f" ({_FORMS[root_form]['title']})"
    # Fall back from file_type to file_description to the file name.
    file_type = source["file_type"]
    if not file_type:
        file_type = source["file_description"]
    if not file_type:
        file_type = filename
    # The CIK of the first row is the one used to build the document URL.
    # NOTE(review): this lookup fails if `rows` is empty — confirm a hit always
    # carries at least one entity.
    ciks = rows.loc[0,"cik"]
    info = pd.DataFrame({
        "entity_name":rows['entity_name'],
        "id": _id,
        "form_file": f"{form}{form_title}{'' if form == file_type else f' {file_type}'}",
        "file_date": source["file_date"],
        "period_ending": source.get("period_ending", None),
        "file_ext": filename_ext,
        # Dashes are stripped from adsh before it is used as a path segment.
        "url": _concat_to_url(ciks, adsh.replace('-', ''), filename),
        "parser": None#getattr(parsers, f"_parse_{filename_ext.lower()}", None)
    },copy=False,dtype=str)#, dtype=object
    
    # NOTE(review): both frames carry an "entity_name" column and every row
    # shares the same "id", so this merge suffixes the names
    # (entity_name_x / entity_name_y) and pairs each row of `rows` with each
    # row of `info` — confirm this is intended.
    result = pd.merge(rows,info,how="left",on="id")
    #result.drop_duplicates(inplace=True)
    #del result["id"]
    return result.reset_index(drop=True)


def extract_html(html, keywords):
    """Return the distinct element texts of an HTML document that mention a keyword.

    Every tag in the document is considered. An element's text is kept when it
    is at most 10000 characters long and contains at least one of ``keywords``
    as a case-sensitive substring.

    Args:
        html: HTML markup (bytes or str) handed to BeautifulSoup's lxml parser.
        keywords: Iterable of substrings to look for.

    Returns:
        list[str]: Matching texts without duplicates, in document order. The
        previous ``list(set(...))`` returned the same texts in an arbitrary
        order that could change from run to run.
    """
    soup = BeautifulSoup(html, "lxml")
    # A dict is used as an insertion-ordered set.
    matches = {}
    for element in soup.find_all(True):
        # get_text() walks the whole subtree, so compute it once per element
        # instead of once per keyword.
        text = element.get_text()
        if len(text) <= 10000 and any(keyword in text for keyword in keywords):
            matches[text] = None
    return list(matches)

def extract_pdf(fetch,keywords):
    """Return the paragraphs of a PDF that mention at least one keyword.

    The text of each page is split on the document's most frequent
    "newline, non-word characters, newline" sequence. When no such sequence
    occurs, ``"\\n  \\n"`` is used. A paragraph is kept when it contains any of
    ``keywords`` as a case-sensitive substring.

    Args:
        fetch: Raw PDF bytes.
        keywords: Iterable of substrings to look for.

    Returns:
        list[str]: Matching paragraphs in page order.
    """
    from collections import Counter

    line_separator_pattern = regex.compile(r'(\n\W*?\n)')
    pdf_document = fitz.open(stream=io.BytesIO(fetch), filetype="pdf")
    try:
        # Extract each page once; the text is needed both for choosing the
        # separator and for splitting.
        page_texts = [
            pdf_document[page].get_text("text")
            for page in range(pdf_document.page_count)
        ]
    finally:
        # Release the document even if text extraction fails.
        pdf_document.close()

    separator_counts = Counter(
        separator
        for text in page_texts
        for separator in line_separator_pattern.findall(text)
    )
    if separator_counts:
        # Ties resolve to the separator seen first, as with the former
        # max(..., key=list.count).
        line_separator = separator_counts.most_common(1)[0][0]
    else:
        # Default to the original separator if none is found
        line_separator = "\n  \n"

    # The separator is literal text. It may contain regex metacharacters such
    # as '.', '*' or '(', so split with str.split rather than using it as a
    # pattern.
    return [
        para
        for text in page_texts
        for para in text.split(line_separator)
        if any(keyword in para for keyword in keywords)
    ]
def extract_txt(bytes,keywords):
    """Return the paragraphs of a plain-text document that mention a keyword.

    The payload is decoded as UTF-8 and split into paragraphs on blank lines.
    Line breaks inside a paragraph are replaced by single spaces. Matching is
    case-insensitive on both the paragraph and the keywords.

    Args:
        bytes: Raw document bytes. Undecodable bytes become U+FFFD instead of
            raising ``UnicodeDecodeError``.
        keywords: Iterable of substrings to look for.

    Returns:
        list[str]: Matching paragraphs in document order, in original casing.
    """
    # Normalise CRLF so Windows-style blank lines also separate paragraphs.
    text = bytes.decode('utf-8', errors='replace').replace('\r\n', '\n')
    # Paragraphs are lower-cased for comparison, so the keywords must be too;
    # otherwise a keyword containing an upper-case letter could never match.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    matching_paragraphs = []
    for block in text.split('\n\n'):
        paragraph = block.replace("\n", " ")
        haystack = paragraph.lower()
        if any(keyword in haystack for keyword in lowered_keywords):
            matching_paragraphs.append(paragraph)
    return matching_paragraphs

def CIK(file):
    """Read a file of CIKs (one per line) and return them zero-padded to 10 digits.

    Blank lines are skipped, surrounding whitespace is ignored, and a leading
    UTF-8 byte-order mark is tolerated.

    Args:
        file: Path of the text file to read.

    Returns:
        list[str]: One ten-character, zero-padded CIK per non-blank line, in
        file order.

    Raises:
        ValueError: If the file cannot be opened or read, or if a non-blank
            line is not an integer.
    """
    try:
        # open() belongs inside the try: a missing or unreadable file is the
        # failure this function promises to report as ValueError.
        # "utf-8-sig" reads plain UTF-8 unchanged and drops a BOM if present.
        with open(file, "r", encoding="utf-8-sig") as f:
            lines = f.read().splitlines()
    except OSError as e:
        raise ValueError(f"{file} is not a valid file") from e

    ciks = []
    for line_number, line in enumerate(lines, start=1):
        entry = line.strip()
        if not entry:
            continue
        try:
            ciks.append(f"{int(entry):010}")
        except ValueError as e:
            raise ValueError(f"{file}:{line_number}: {entry!r} is not a valid CIK") from e
    return ciks
def decode(byte):
    """Return the list of hits from a raw search response.

    Args:
        byte: UTF-8 encoded JSON response body.

    Returns:
        The value stored at ``["hits"]["hits"]`` in the decoded payload.

    Raises:
        KeyError: If the payload has no ``["hits"]["hits"]`` entry.
        json.JSONDecodeError / UnicodeDecodeError: If the body is not valid
            UTF-8 JSON.
    """
    # Parse once. The body used to be decoded and parsed twice, the first time
    # only to read a total that was never used.
    payload = json.loads(byte.decode('utf-8'))
    return payload["hits"]["hits"]



async def main(_PHRASES,_FILING_TYPES,_DATE_START,_DATE_END,_CIKS_PER_QUERY, _CIKS,_buffer_chunk_size,df):
    """Search EDGAR for each CIK, download the hits and export matching paragraphs.

    Stage 1 issues one search per (filing type, CIK) pair, parses every hit
    with ``_parse_hit`` and keeps one row per document id. Stage 2 downloads
    the documents in batches of ``_buffer_chunk_size`` rows and lets
    ``_download`` write the matching paragraphs into the frame. The result is
    written to ``<today>.xlsx`` in the current working directory.

    Args:
        _PHRASES: Search phrases, also used as the paragraph keywords.
        _FILING_TYPES: Form filters; an empty value means "no form filter".
        _DATE_START: Start date sent with each search.
        _DATE_END: End date sent with each search.
        _CIKS_PER_QUERY: Unused; kept for interface compatibility.
        _CIKS: CIKs to search, one request per CIK and filing type.
        _buffer_chunk_size: Number of documents downloaded per batch.
        df: Existing frame that the newly parsed hits are appended to.

    Returns:
        pandas.DataFrame: The exported frame without the ``parser`` and ``id``
        columns. When no documents are found, the frame is returned as is and
        nothing is exported (previously ``None`` was returned).
    """
    semaphore = asyncio.Semaphore(10)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}
    # Any empty collection (not only a literal []) means "no form filter".
    if not _FILING_TYPES:
        _FILING_TYPES = [""]
    #-------------- Crawl --------
    async with ClientSession(raise_for_status=True, headers=headers) as client:
        #------Fetch--------
        total = len(_CIKS) * len(_FILING_TYPES)
        with tqdm(total=total) as fetch_bar:
            print("Starting fetch...")
            fetch_tasks = [
                asyncio.create_task(fetch(
                    semaphore=semaphore,
                    client=client,
                    phrases=_PHRASES,
                    cik=cik,
                    start=_DATE_START,
                    end=_DATE_END,
                    forms=form,
                    fetch_bar=fetch_bar
                ))
                for form in _FILING_TYPES for cik in _CIKS
            ]
            fetched_data = await asyncio.gather(*fetch_tasks)
            df = pd.concat(
                [df] + [_parse_hit(hit) for data in fetched_data for hit in decode(data)],
                ignore_index=True)
            # A document can be returned by several searches; keep it once.
            df.drop_duplicates(subset="id", inplace=True)
            df.reset_index(drop=True, inplace=True)
        #-----Docs download-------
        print(f"fetch completed and collected [{df.shape[0]}] of docs,starting download docs..")
        total = df.shape[0]
        if total == 0:
            print("Fetch completed with 0 result. Now existing.")
            return df
        with tqdm(total=total) as download_bar:
            # Download in batches so only a bounded number of tasks exist at once.
            for index in range(0, total, _buffer_chunk_size):
                index_range = list(range(index, min(index + _buffer_chunk_size, df.shape[0])))
                download_tasks = [
                    asyncio.create_task(_download(
                        semaphore=semaphore,
                        client=client,
                        df=df,
                        row_index=row,
                        download_bar=download_bar,
                        keywords=_PHRASES)
                    ) for row in index_range
                ]
                await asyncio.gather(*download_tasks)

    del df["parser"]
    del df['id']
    # Build the path once, so the file written and the path reported cannot
    # differ, and so the separator is correct on every platform.
    output_path = Path.cwd() / f"{date.today()}.xlsx"
    df.to_excel(output_path)
    print(f"Data have been export at {output_path}")
    return df


In [2]:
import time
import warnings
# Suppresses every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")
# Wall-clock start, used for the elapsed-time report at the end of the cell.
T0 = time.time()

# Search phrases. Each one is sent as a quoted phrase in the search query and
# is also used as a substring filter on the downloaded documents.
# Note the trailing space in "data ".
_PHRASES = ["data ","cyber"]

# Form filters; [""] applies no form filter.
_FILING_TYPES = [""]#,
#_FILING_TYPES = ["10-K","10-Q"]
_DATE_START = "2001-01-01"
_DATE_END = "2023-12-12"
# Passed to main(), which does not currently reference it.
_CIKS_PER_QUERY = 10
# Number of documents downloaded per batch in main().
_buffer_chunk_size = 200
#_CIKS =  ["0001653482"] #Input from a list or a path,,"0001653481"
_CIKS = CIK(Path("sample_input_file.txt"))[2400:] # Adjust this slice to choose which CIKs to process: [2400:] skips the first 2400 entries; use [:N] to test with the first N CIKs.
# Start from an empty frame; main() appends the parsed hits to it.
df = pd.DataFrame()
df = await main(
    _PHRASES,
    _FILING_TYPES,
    _DATE_START,
    _DATE_END,
    _CIKS_PER_QUERY,
    _CIKS,
    _buffer_chunk_size,
    df
)
 
END = time.time()
print("--"*10,
      f"All tasks completed! Time Cost:{round((END-T0)/60,1)} minutes ",sep='\n')#pymupdf

  0%|          | 0/23 [00:00<?, ?it/s]

Starting fetch...


100%|██████████| 23/23 [00:14<00:00,  1.62it/s]


fetch completed and collected [311] of docs,starting download docs..


100%|██████████| 311/311 [03:58<00:00,  1.31it/s]


Data have been export at /mnt/d/summer_research\2023-12-22.xlsx
--------------------
All tasks completed! Time Cost:4.2 minutes 
