In [3]:
import asyncio
import json
import logging
from datetime import date
from itertools import zip_longest
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Tuple, Union

import pandas as pd
from aiohttp import ClientSession
import regex
from tenacity import retry
import os

from constants import _FORMS, _LOCATIONS

class secc_2023:
    count = 0
    total = None
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}
    path = os.getcwd()
    _LOGGER = logging.getLogger(__name__)
    _CC_REGEX = regex.compile(R"[\p{Cc}\p{Cf}]+", regex.V1)
    _DISPLAY_NAME_REGEX = regex.compile(R"(.*) \(CIK (\d{10})\)", regex.V1)
    df = pd.DataFrame()
    error_list = list()
    
    def __init__(self,_PHRASES,_FILING_TYPES,_DATE_START,_CIKS,_DATE_END,_CIKS_PER_QUERY = 5, update_constant=False):
        self._PHRASES = _PHRASES
        self._FILING_TYPES = _FILING_TYPES
        self._DATE_START = _DATE_START
        self._DATE_END = _DATE_END
        #self._CIKS = self.CIK(_CIKS)
        self._CIKS_PER_QUERY = _CIKS_PER_QUERY
        self._CIKS = self.chop_ciks(_CIKS,_CIKS_PER_QUERY)#....
        self.total = len(list(self._CIKS))
        
        if (not os.path.exists(f"{self.path}constants.py")) or self.update_constant:
            self.__constant_update
    
    @staticmethod
    def CIK(file):
        with open(file, "r", encoding="UTF-8") as f:
            try:
                _ciks = [f"{int(cik):010}" for cik in f.read().splitlines()]
                return _ciks
            except IOError as e:
                    raise ValueError(f"{file} is not a valid file") from e
                
    def chop_ciks(
    ciks: Optional[Union[Path, int, str, List[Any]]],
    ciks_per_query: int
) -> Generator[Optional[List[str]], None, None]:
    # defaults to None
        _ciks: Optional[List[str]] = None
        # if the provided parameter is a Path, read the CIKs from the file
        if isinstance(ciks, Path):
            try:
                with open(ciks, "r", encoding="UTF-8") as f:
                    try:
                        _ciks = [f"{int(cik):010}" for cik in f.read().splitlines()]
                    except ValueError as e:
                        raise ValueError(f"{ciks} contains invalid CIKs") from e
            except IOError as e:
                raise ValueError(f"{ciks} is not a valid file") from e
        # if it's an iterable of values, treat all values as CIKs
        elif isinstance(ciks, list):
            try:
                _ciks = [f"{int(cik):010}" for cik in ciks]
            except ValueError as e:
                raise ValueError(f"{ciks} is not a valid CIK list") from e
        # if it's a single string, consider it as a single CIK
        elif isinstance(ciks, str):
            try:
                _ciks = [f"{int(ciks):010}"]
            except ValueError as e:
                raise ValueError(f"{ciks} is not a valid CIK") from e
        # same as previous with the preferred (int) type
        elif isinstance(ciks, int):
            _ciks = [f"{ciks:010}"]

        if _ciks:
            for i in range(0, len(_ciks), ciks_per_query):
                yield _ciks[i:i + ciks_per_query]
        else:
            yield None
           
                
    async def excution(self):
        semaphore = asyncio.Semaphore(10)
        async with ClientSession(raise_for_status=True, headers=self.headers) as client :
            for ciks_batch in self._CIKS:
            # Create tasks for fetching data
                    loop = asyncio.get_event_loop()
                    tasks = [
                        loop.create_task(self.fetch(semaphore 
                    = semaphore ,
                    client = client,
                    phrases=self._PHRASES,
                    cik=cik,
                    start=self._DATE_START,
                    end=self._DATE_END,
                    forms=form))
                        for cik in ciks_batch
                        for form in self._FILING_TYPES
                    ]
                    loop.run_until_complete(asyncio.wait(tasks))
                    #results = await asyncio.gather(*tasks)
                    # #results = await asyncio.gather(*tasks)
                    # # loop = asyncio.get_event_loop()
                    # # loop.run_until_complete(results)
                    # result = await asyncio.wait(tasks)
                    # asyncio.get_event_loop().run_until_complete(result)
                    self.count += self._CIKS_PER_QUERY
                    print(f"Complete {self.count}/{self.total}")
            self.df = self.df.reset_index(drop=True)
        #Wait for tasks to complete and unwrap the results
    
                    

   
        
    @staticmethod 
    async def __constant_update():
        async with ClientSession(raise_for_status=True) as c:
            async with c.get("https://www.sec.gov/edgar/search/js/edgar_full_text_search.js") as res:
                _script = await res.text()

            with open("constants.py", "w", encoding="utf-8") as f:
                f.write("_FORMS = ")
                json.dump({
                    form.pop("form"): form
                    for form in eval(re.search(
                        R"^const forms = (\[\r?\n(?: {4}\{.*?\},*\r?\n)*(?: {4}\{.*?\})\r?\n\])\.sort",
                        _script,
                        regex.MULTILINE
                    )[1])
                }, f, indent=4)

                f.write('\n')

                f.write("_LOCATIONS = ")
                json.dump(dict(eval(re.search(
                    R"^const locationsArray = (\[\r?\n(?: {4}\[.*?\],\r?\n)*(?: {4}\[.*?\])\r?\n\]);",
                    _script,
                    regex.MULTILINE
                )[1])), f, indent=4)

                f.write('\n')
    
    @retry
    async def fetch(self,semaphore,client,phrases,cik,end,forms,start='2001-01-01',range = 'custom',category= 'custom',entity=None):
        
        q = " ".join(f"\"{phrase}\"" for phrase in phrases)
        print(q)
        forms = " ".join(form for  form in forms) #iterator
        print(forms)
        data = {'q':q,
                'startdt':start,
                'enddt':end,
                'ciks':cik,
                'dataRange':'custom',
                'category':'custom',
                'forms':forms}
        url = 'https://efts.sec.gov/LATEST/search-index'
        
        tem_df = pd.DataFrame()
        async with semaphore,client.request(method='get',url=url,params = data) as res:
            await asyncio.sleep(1)
            if res.status == 200:
                url = res.url
                print(url)
                json = await res.json()
                #page_info = json['query'] --- Unconvered empty input ciks and needed of page swtich 
                total = int(json["hits"]["total"]["value"])
                hits = json["hits"]["hits"]
                # match_result = regex.search(r'"from":(\d+),"size":(\d+),"aggregations":', page_info) 
                # if match_result:
                #     from_value = match_result.group(1)
                #     size_value = match_result.group(2)
                #if total > 30 : #show limited of per page
                #self.error_list.append(data)
                #raise ValueError(f"CIK = {cik} fetch error. Status Code = {res.status}")
        for hit in hits:
            tem_df = pd.concat([tem_df,self._parse_hit(hit)])
        
        try:
            total == tem_df.shape[0]
        except:
            print(f"Length of CIK:{cik} is not equal")
        finally:
            return pd.concat([self.df,tem_df])
                        


    def _concat_to_url(self,cik: str, adsh: str, filename: str) -> str:
        return f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh}/{filename}"



    @retry
    async def _download(self,semaphore: asyncio.Semaphore, url: str, client) -> Tuple[bytes, str]:
        async with semaphore, client.get(url) as res:
            await asyncio.sleep(1)
            if res.ok:
                content = await res.read()
                return content
            raise ValueError(f"Status code : {res.status}")


    def _parse_display_name(self,s: str, cik: str):
        if s is not None and (m := self._DISPLAY_NAME_REGEX.fullmatch(s)):
            if (scik := m[2]) != cik:
                self._LOGGER.warning(f"mismatched CIK: {scik} (parsed from \"{s}\") v.s. {cik}")
            return m[1], scik
        return s, cik

    @staticmethod
    def chop_ciks(
        ciks: Optional[Union[Path, int, str, List[Any]]],
        ciks_per_query: int
    ) -> Generator[Optional[List[str]], None, None]:
        # defaults to None
        _ciks: Optional[List[str]] = None
        # if the provided parameter is a Path, read the CIKs from the file
        if isinstance(ciks, Path):
            try:
                with open(ciks, "r", encoding="UTF-8") as f:
                    try:
                        _ciks = [f"{int(cik):010}" for cik in f.read().splitlines()]
                    except ValueError as e:
                        raise ValueError(f"{ciks} contains invalid CIKs") from e
            except IOError as e:
                raise ValueError(f"{ciks} is not a valid file") from e
        # if it's an iterable of values, treat all values as CIKs
        elif isinstance(ciks, list):
            try:
                _ciks = [f"{int(cik):010}" for cik in ciks]
            except ValueError as e:
                raise ValueError(f"{ciks} is not a valid CIK list") from e
        # if it's a single string, consider it as a single CIK
        elif isinstance(ciks, str):
            try:
                _ciks = [f"{int(ciks):010}"]
            except ValueError as e:
                raise ValueError(f"{ciks} is not a valid CIK") from e
        # same as previous with the preferred (int) type
        elif isinstance(ciks, int):
            _ciks = [f"{ciks:010}"]

        if _ciks:
            for i in range(0, len(_ciks), ciks_per_query):
                yield _ciks[i:i + ciks_per_query]
        else:
            yield None


    def _parse_hit(self,hit: Dict[str, Any]): 
        _id = hit["_id"]
        source = hit["_source"]

        adsh, filename = _id.split(':')
        filename_main, filename_ext = filename.rsplit('.', 1)
        xsl = source["xsl"]
        if xsl and filename_ext.lower() == "xml":
            filename_main = f"{xsl}/{filename_main}"
        filename = f"{filename_main}.{filename_ext}"
        file_nums = source["file_num"]
        film_nums = source["film_num"]
        rows = pd.DataFrame((
            [_id, *self._parse_display_name(display_name, cik), loc.split(",")[0], _LOCATIONS.get(code), file_num, film_num]
            for display_name, cik, loc, code, file_num, film_num in zip_longest(
                source["display_names"],
                source["ciks"],
                source["biz_locations"],
                source["biz_states"], 
                file_nums if isinstance(file_nums, list) else [file_nums] if file_nums else (),
                film_nums if isinstance(film_nums, list) else [film_nums] if film_nums else ()
            )
        ), columns=["id", "entity_name", "cik", "located", "incorporated", "file_num", "film_num"], copy=False)#, dtype=str
        form = source["form"]
        root_form = source["root_form"]
        form_title = ""
        if root_form in _FORMS:
            form_title = f" ({_FORMS[root_form]['title']})"
        file_type = source["file_type"]
        if not file_type:
            file_type = source["file_description"]
        if not file_type:
            file_type = filename
        ciks = rows.loc[0,"cik"]
        info = pd.DataFrame({
            "entity_name":rows['entity_name'],
            "id": _id,
            "form_file": f"{form}{form_title}{'' if form == file_type else f' {file_type}'}",
            "file_date": source["file_date"],
            "period_ending": source.get("period_ending", None),
            "file_ext": filename_ext,
            "url": self._concat_to_url(ciks, adsh.replace('-', ''), filename),
            "parser": None#getattr(parsers, f"_parse_{filename_ext.lower()}", None)
        },copy=False,dtype=str)#, dtype=object
        result = pd.merge(rows,info,how="left",on="id")
        del result["id"]
        return result.reset_index(drop=True)

    @staticmethod
    def extract_paragraphs(html, keywords):#Need to support more extension like pdf,txt and etc?
        paragraph_pattern = regex.compile(r'<span.*?>(.*?)</span>', regex.DOTALL)
        paragraphs = paragraph_pattern.findall(html)

        result_paragraphs = [paragraph for paragraph in paragraphs if any(keyword in paragraph for keyword in keywords)]

        return result_paragraphs

    def CIK(file):
        with open(file, "r", encoding="UTF-8") as f:
            try:
                _ciks = [f"{int(cik):010}" for cik in f.read().splitlines()]
                return _ciks
            except IOError as e:
                    raise ValueError(f"{file} is not a valid file") from e
    


In [4]:
import asyncio
from aiohttp import ClientSession
from pathlib import Path

async def main():
    _PHRASES = ["data breach"]
    _FILING_TYPES = ["10-Q"]
    _DATE_START = "2000-12-01"
    _DATE_END = "2023-12-12"
    _CIKS = Path("sample_input_file.txt")

    scratch = secc_2023(_PHRASES, _FILING_TYPES, _DATE_START, _CIKS, _DATE_END)
    semaphore = asyncio.Semaphore(10)

    async with ClientSession(raise_for_status=True, headers=scratch.headers) as client:
        for ciks in scratch._CIKS:
            for form in scratch._FILING_TYPES:
                tasks = [
                    scratch.fetch(
                        semaphore=semaphore,
                        client=client,
                        phrases=scratch._PHRASES,
                        cik=cik,
                        start=scratch._DATE_START,
                        end=scratch._DATE_END,
                        forms=form
                    ) for cik in ciks
                ]
                await asyncio.gather(*tasks)

if __name__ == "__main__":
    asyncio.run(main())

RuntimeError: asyncio.run() cannot be called from a running event loop

In [5]:
_PHRASES = ["data breach"]
_FILING_TYPES = ["10-Q"]
_DATE_START = "2000-12-01"
_DATE_END = "2023-12-12"
_CIKS = Path("sample_input_file.txt")

scratch = secc_2023(_PHRASES, _FILING_TYPES, _DATE_START, _CIKS, _DATE_END)
semaphore = asyncio.Semaphore(10)

In [9]:
for i in scratch._CIKS:
    i.next