In [1]:
import requests as rq
import pandas as pd
from bs4 import BeautifulSoup
# !pip install -U sec-edgar-downloader
from sec_edgar_downloader import Downloader
from tqdm import tqdm
import re
from urllib.parse import urlparse
# import ast
import sys
import json
import os
import gc

# Preparation

In [2]:
#API extraction function & Definition of headers:

headers_data = {
    "User-Agent": "Ludwig Kunz (ludwig.kunz@uni-konstanz.de)",
    "Accept-Encoding": "gzip, deflate",
    "Host": "data.sec.gov"
}
headers_doc = {
        "User-Agent": "Ludwig Kunz (ludwig.kunz@uni-konstanz.de)",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.sec.gov"
}

def get_json(url, headers):
    response = rq.get(url, headers = headers)
    if response.status_code == 200:
        return response.json()
    else:
        raise ImportError(response.status_code)

In [11]:
sp500_changes = pd.read_csv("s&p500_hist_changes.csv") # from https://github.com/fja05680/sp500
sp500_changes["date"] = pd.to_datetime(sp500_changes["date"])
sp500_changes["year"] = sp500_changes["date"].dt.year 
sp500_changes = sp500_changes[((sp500_changes["year"] >= 2005) & (sp500_changes["year"] < 2025))]
sp500_changes.reset_index(drop = True,inplace=True)
sp500_changes["tickers"] = sp500_changes["tickers"].apply(lambda x: x.split(","))

sp500_changes

#iterating over the dataset and creating intersections, to get permanent constituents
common_tickers = set(sp500_changes.iloc[0]["tickers"])
for _, row in sp500_changes[1:].iterrows():
    common_tickers &= set(row["tickers"])
common_tickers = list(common_tickers)

#converting the permanent constituents tickers to ciks
url = "https://www.sec.gov/files/company_tickers.json"
company_tickers = get_json(url, headers_doc)
company_tickers = list(company_tickers.values())
company_tickers = pd.DataFrame(company_tickers)
company_tickers

company_tickers = company_tickers[company_tickers["ticker"].isin(common_tickers)]
company_tickers["cik"] = company_tickers["cik_str"].apply(str).str.zfill(10)
company_tickers = company_tickers[["ticker", "cik"]]
matched_tickers = set(company_tickers["ticker"])
missing_tickers = list(set(common_tickers) - matched_tickers)

df_missing = pd.DataFrame({
    "ticker": missing_tickers,
    "cik": [pd.NA] * len(missing_tickers)
})

company_tickers = pd.concat([company_tickers, df_missing], ignore_index=True)

print("Missing ticker(s):", missing_tickers)

#manually filling the missing CIKs:
company_tickers.loc[company_tickers["ticker"] == "BF.B", "cik"] = "0000014693"
company_tickers.loc[company_tickers["ticker"] == "HES", "cik"] = "0000004447"
company_tickers.loc[company_tickers["ticker"] == "PPL", "cik"] = "0000922224"

# company_tickers = company_tickers[-1:] #shortening, for checks and bug fixing
# company_tickers = company_tickers[company_tickers["ticker"] == "ORCL"]

display(company_tickers)

Missing ticker(s): ['HES', 'PPL', 'BF.B']


Unnamed: 0,ticker,cik
0,NVDA,0001045810
1,MSFT,0000789019
2,AAPL,0000320193
3,JPM,0000019617
4,WMT,0000104169
...,...,...
210,APA,0001841666
211,FITB,0000035527
212,HES,0000004447
213,PPL,0000922224


# Webscraping

In [None]:
# --------------------------Preparation:--------------------------

cols_tokeep = ["accessionNumber", "reportDate", "primaryDocument"]

#link fragmenting and pattern matching functions:


def extract_section_1A(html, start_href, end_href):
    """
    returns a list of text fragments in section 1A from a BeautifulSoup input of an entire 10-k html
    """
    status = "Starting"
    def fragment(href):
        parsed = urlparse(href)
        return parsed.fragment if parsed.fragment else href.lstrip("#")
    start_id = fragment(start_href)
    end_id = fragment(end_href)

    start_tag = html.find(id = start_id) or html.find("a", attrs = {"name": start_id})
    end_tag = html.find(id = end_id) or html.find("a", attrs = {"name": end_id})

    if not start_tag:
        status = f"Could not locate start anchor #{start_id}"
        content = None
    elif not end_tag: 
        status = f"Could not locate end anchor #{end_id}"
        content = None
    else:
        status = "Worked"
        content_fragments = []
        for sib in start_tag.next_elements: #problem here?
            if sib is end_tag:
                break
            content_fragments.append(sib)
        content = []
        for element in content_fragments:
            if hasattr(element, "get_text"):
                text = element.get_text(strip = True)
            else:
                text = str(element).strip()
            if text:
                content.append(text)
    return content, status

# patterns to find the appropriate sections:
patterns_1A = [re.compile(r'\bITEM\s*1A\b', flags=re.IGNORECASE), re.compile(r'\brisk\s*factors\b', flags=re.IGNORECASE)]
patterns_1B = [re.compile(r'\bITEM\s*1B\b', flags=re.IGNORECASE), re.compile(r'\bunresolved\s*staff\s*comments\b', flags=re.IGNORECASE), re.compile(r'\bunresolved\s*sec\s*staff\s*comments\b', flags=re.IGNORECASE)]

#creating the directory for the extracted sections:
os.makedirs("section_1A", exist_ok = True)

#building the skeleton of the errors dataframe:
errors = pd.DataFrame({
    "Company": pd.Series(dtype="string"),
    "FiscalYearEnd": pd.Series(dtype="datetime64[ns]"),
    "Status": pd.Series(dtype="string"),
    "URL": pd.Series(dtype="string")
})

# --------------------------Full loop:--------------------------

total_documents = 0
for i_comp, row_comp in company_tickers.iterrows():
    status = f'Webscraping for company: {row_comp["ticker"]} ({i_comp+1}/{len(company_tickers)})'
    print('\r\033[K' + status, end='')
    sys.stdout.flush()

    cik = row_comp["cik"]
    cik_no_zeros = str(int(cik))

    # obtaining the document links:
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    submissions = get_json(url, headers_data)

    comp_ticker = row_comp["ticker"]
    filings = submissions["filings"]

    recent = pd.DataFrame(filings["recent"])
    recent = recent[recent["form"] == "10-K"].drop(columns = [col for col in recent.columns if col not in cols_tokeep])
    dataframes = [recent]

    for file_dict in filings["files"]:
        filename = file_dict["name"]
        url = f"https://data.sec.gov/submissions/{filename}"
        older_subm = pd.DataFrame(get_json(url, headers_data))
        older_subm = older_subm[older_subm["form"] == "10-K"].drop(columns = [col for col in older_subm.columns if col not in cols_tokeep])
        dataframes.append(older_subm)
    
    df = pd.concat(dataframes, ignore_index = True)
    df["reportDate"] = pd.to_datetime(df["reportDate"])
    df["reportYear"] = df["reportDate"].dt.year
    df = df[df["reportYear"] >= 2005]

    total_documents += len(df)

    dict_1A = {}

    #iterating over all years for the company in question:
    for doc_i, doc_row in tqdm(df.iterrows(), total = len(df)):
        accno = doc_row["accessionNumber"]
        acc_no_no_dash = accno.replace("-", "")
        doclink = doc_row["primaryDocument"]
        url = f'https://www.sec.gov/Archives/edgar/data/{cik_no_zeros}/{acc_no_no_dash}/{doclink}'
        df.at[doc_i, "url"] = url
        
        # downloading the 10-K filing
        response = rq.get(url, headers = headers_doc)
        content = BeautifulSoup(response.content, "html.parser")

        link_1A = None
        link_1B = None
        link_list = [{"link": link, "text": " ".join(link.get_text(separator = " ").split())} for link in content.find_all("a")] 

        # pattern matching for item 1A:
        idx = 0
        while link_1A == None and idx < len(patterns_1A):
            for link_dict in link_list:
                if patterns_1A[idx].search(link_dict["text"]):
                    link_1A = link_dict["link"].get("href")
                    break
            idx += 1
        
        #pattern matching for item 1B:
        idx = 0
        while link_1B == None and idx < len(patterns_1B):
            for link_dict in link_list:
                if patterns_1B[idx].search(link_dict["text"]):
                    link_1B = link_dict["link"].get("href")
                    break
            idx += 1

        #updating the status, using the extraction function, if the link finding worked:
        if link_1A == None:
            sec_1A = None
            extract_status = f'Could not find start link'
        elif link_1B == None:
            sec_1A = None
            extract_status = f'Could not find end link'
        else:
            sec_1A, extract_status = extract_section_1A(content, link_1A, link_1B)
        
        #attempting to find the headings for the sections, in case the link method did not work
        if sec_1A == None:
            bold_tags = content.find_all(["b", "strong"])
            styled_bold = content.find_all(
                lambda tag: tag.has_attr("style") and "font-weight" in tag["style"].lower() and "bold" in tag["style"].lower())
            all_bold = set(bold_tags + styled_bold)

            tag_list = [{"fulltag": tag, "text": " ".join(tag.get_text(separator = " ").split())} for tag in all_bold]
            tag_1A = None
            tag_1B = None

            idx = 0
            while tag_1A == None and idx < len(patterns_1A):
                for tag_dict in tag_list:
                    if patterns_1A[idx].search(tag_dict["text"]):
                        tag_1A = tag_dict["fulltag"]
                        break
                idx += 1
            
            idx = 0
            while tag_1B == None and idx < len(patterns_1B):
                for tag_dict in tag_list:
                    if patterns_1B[idx].search(tag_dict["text"]):
                        tag_1B = tag_dict["fulltag"]
                        break
                idx += 1
            
            if tag_1A == None:
                extract_status = "Bold tag Method: Could not find start tag"
            elif tag_1B == None:
                extract_status = "Bold tag Method: Could not find end tag"
            else:
                divlist = content.find_all("div")
                start_idx = None
                end_idx = None

                for div_i, div in enumerate(divlist):
                    if tag_1A in div:
                        start_idx = div_i
                    if tag_1B in div:
                        end_idx = div_i
                
                if start_idx == None:
                    extract_status = "Bold tag Method: start index could not be located" 
                elif end_idx == None:
                    extract_status = "Bold tag Method: end index could not be located"
                elif start_idx >= end_idx:
                    extract_status = "Bold tag Method: start index is not smaller than end index"
                else:
                    content_fragments = divlist[start_idx:end_idx]
                    sec_1A = []
                    for element in content_fragments:
                        if hasattr(element, "get_text"):
                            text = element.get_text(strip = True)
                        else:
                            text = str(element).strip()
                        if text: sec_1A.append(text)
                    
                    if len(sec_1A) == 0:
                        extract_status = "Bold tag method returned an empty list"

        elif len(sec_1A) == 0:
            extract_status = "Extract function: returned an empty list"    

        #passing the (hopefully) extracted section to its dictionary
        dict_1A.update({doc_row["reportYear"]: {"link_1A": link_1A, 
                                                "link_1B": link_1B,
                                                "URL": url,
                                                "section_1A": sec_1A}
                                                })
        
        #updating the error dict, in case it didn't work
        if sec_1A == None or len(sec_1A) == 0:
            error_row = {"Company": comp_ticker,
                        "FiscalYearEnd": doc_row["reportDate"],
                        "Status": extract_status,
                        "URL": url}
            errors = pd.concat([errors, pd.DataFrame([error_row])], ignore_index = True)

        del content
        gc.collect()

    #saving the json file
    with open(f'section_1A/{comp_ticker}.json', 'w') as file:
        json.dump(dict_1A, file)
    del dict_1A

    gc.collect()
    print(f'{len([row for _, row in errors.iterrows() if row["Company"] == comp_ticker])} errors found')

[KWebscraping for company: NVDA (1/215)

100%|██████████| 21/21 [00:25<00:00,  1.20s/it]

3 errors found
[KWebscraping for company: MSFT (2/215)


100%|██████████| 21/21 [00:40<00:00,  1.91s/it]


1 errors found
[KWebscraping for company: AAPL (3/215)

100%|██████████| 20/20 [00:21<00:00,  1.07s/it]


7 errors found
[KWebscraping for company: JPM (4/215)

100%|██████████| 20/20 [04:16<00:00, 12.82s/it]


2 errors found
[KWebscraping for company: WMT (5/215)

100%|██████████| 21/21 [00:21<00:00,  1.00s/it]

7 errors found
[KWebscraping for company: ORCL (6/215)


100%|██████████| 20/20 [00:35<00:00,  1.75s/it]


0 errors found
[KWebscraping for company: LLY (7/215)

100%|██████████| 20/20 [00:26<00:00,  1.32s/it]


5 errors found
[KWebscraping for company: XOM (8/215)

100%|██████████| 20/20 [01:07<00:00,  3.38s/it]


13 errors found
[KWebscraping for company: COST (9/215)

100%|██████████| 20/20 [00:22<00:00,  1.14s/it]


1 errors found
[KWebscraping for company: JNJ (10/215)

100%|██████████| 21/21 [00:25<00:00,  1.23s/it]

5 errors found
[KWebscraping for company: HD (11/215)


100%|██████████| 21/21 [00:19<00:00,  1.09it/s]

3 errors found
[KWebscraping for company: PG (12/215)


100%|██████████| 20/20 [00:20<00:00,  1.02s/it]

8 errors found
[KWebscraping for company: BAC (13/215)


100%|██████████| 20/20 [04:13<00:00, 12.68s/it]


4 errors found
[KWebscraping for company: CVX (14/215)

100%|██████████| 20/20 [01:00<00:00,  3.03s/it]


0 errors found
[KWebscraping for company: KO (15/215)

100%|██████████| 20/20 [00:37<00:00,  1.86s/it]

1 errors found
[KWebscraping for company: GE (16/215)


100%|██████████| 20/20 [00:59<00:00,  2.97s/it]

13 errors found
[KWebscraping for company: CSCO (17/215)


100%|██████████| 20/20 [00:31<00:00,  1.58s/it]


1 errors found
[KWebscraping for company: WFC (18/215)

100%|██████████| 20/20 [00:14<00:00,  1.40it/s]

11 errors found
[KWebscraping for company: IBM (19/215)


100%|██████████| 20/20 [00:11<00:00,  1.68it/s]

0 errors found
[KWebscraping for company: MS (20/215)


100%|██████████| 20/20 [01:40<00:00,  5.00s/it]


0 errors found
[KWebscraping for company: UNH (21/215)

100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


1 errors found
[KWebscraping for company: GS (22/215)

100%|██████████| 20/20 [01:40<00:00,  5.01s/it]


3 errors found
[KWebscraping for company: ABT (23/215)

100%|██████████| 20/20 [00:30<00:00,  1.52s/it]

11 errors found
[KWebscraping for company: INTU (24/215)


100%|██████████| 20/20 [00:34<00:00,  1.72s/it]


1 errors found
[KWebscraping for company: MCD (25/215)

100%|██████████| 20/20 [00:30<00:00,  1.54s/it]


6 errors found
[KWebscraping for company: DIS (26/215)

100%|██████████| 6/6 [00:11<00:00,  1.91s/it]


0 errors found
[KWebscraping for company: AXP (27/215)

100%|██████████| 20/20 [00:40<00:00,  2.03s/it]


0 errors found
[KWebscraping for company: CAT (28/215)

100%|██████████| 21/21 [00:59<00:00,  2.81s/it]

1 errors found
[KWebscraping for company: MRK (29/215)


100%|██████████| 20/20 [00:46<00:00,  2.34s/it]


1 errors found
[KWebscraping for company: T (30/215)

100%|██████████| 20/20 [00:20<00:00,  1.03s/it]

19 errors found
[KWebscraping for company: PEP (31/215)


100%|██████████| 20/20 [00:43<00:00,  2.17s/it]


0 errors found
[KWebscraping for company: VZ (32/215)

100%|██████████| 20/20 [00:17<00:00,  1.16it/s]


0 errors found
[KWebscraping for company: TMO (33/215)

100%|██████████| 20/20 [00:39<00:00,  1.95s/it]

0 errors found
[KWebscraping for company: SCHW (34/215)


100%|██████████| 20/20 [00:47<00:00,  2.35s/it]

12 errors found
[KWebscraping for company: C (35/215)


100%|██████████| 20/20 [03:31<00:00, 10.55s/it]


17 errors found
[KWebscraping for company: SPGI (36/215)

100%|██████████| 20/20 [00:34<00:00,  1.71s/it]


0 errors found
[KWebscraping for company: BA (37/215)

100%|██████████| 20/20 [00:36<00:00,  1.85s/it]


0 errors found
[KWebscraping for company: TXN (38/215)

100%|██████████| 20/20 [00:19<00:00,  1.01it/s]

14 errors found
[KWebscraping for company: QCOM (39/215)


100%|██████████| 20/20 [00:23<00:00,  1.18s/it]


1 errors found
[KWebscraping for company: AMGN (40/215)

100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


0 errors found
[KWebscraping for company: BSX (41/215)

100%|██████████| 20/20 [00:35<00:00,  1.75s/it]


2 errors found
[KWebscraping for company: ADBE (42/215)

100%|██████████| 20/20 [00:32<00:00,  1.63s/it]

0 errors found
[KWebscraping for company: SYK (43/215)


100%|██████████| 20/20 [00:20<00:00,  1.04s/it]

6 errors found
[KWebscraping for company: ETN (44/215)


100%|██████████| 13/13 [00:23<00:00,  1.77s/it]

0 errors found
[KWebscraping for company: NEE (45/215)


100%|██████████| 20/20 [00:57<00:00,  2.89s/it]

3 errors found
[KWebscraping for company: AMAT (46/215)


100%|██████████| 20/20 [00:30<00:00,  1.55s/it]


1 errors found
[KWebscraping for company: DHR (47/215)

100%|██████████| 20/20 [00:27<00:00,  1.35s/it]


0 errors found
[KWebscraping for company: PGR (48/215)

100%|██████████| 20/20 [00:14<00:00,  1.42it/s]

10 errors found
[KWebscraping for company: DE (49/215)


100%|██████████| 20/20 [00:50<00:00,  2.52s/it]

9 errors found
[KWebscraping for company: HON (50/215)


100%|██████████| 20/20 [00:43<00:00,  2.17s/it]


0 errors found
[KWebscraping for company: GILD (51/215)

100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


0 errors found
[KWebscraping for company: TJX (52/215)

100%|██████████| 21/21 [00:22<00:00,  1.09s/it]


10 errors found
[KWebscraping for company: COF (53/215)

100%|██████████| 20/20 [01:58<00:00,  5.92s/it]


0 errors found
[KWebscraping for company: PFE (54/215)

100%|██████████| 20/20 [00:18<00:00,  1.07it/s]

8 errors found
[KWebscraping for company: UNP (55/215)


100%|██████████| 20/20 [00:25<00:00,  1.30s/it]


0 errors found
[KWebscraping for company: ADP (56/215)

100%|██████████| 20/20 [00:26<00:00,  1.33s/it]

6 errors found
[KWebscraping for company: LOW (57/215)


100%|██████████| 21/21 [00:20<00:00,  1.02it/s]

2 errors found
[KWebscraping for company: CMCSA (58/215)


100%|██████████| 20/20 [00:38<00:00,  1.94s/it]


0 errors found
[KWebscraping for company: MU (59/215)

100%|██████████| 20/20 [00:27<00:00,  1.39s/it]

5 errors found
[KWebscraping for company: COP (60/215)


100%|██████████| 20/20 [01:13<00:00,  3.68s/it]


0 errors found
[KWebscraping for company: KLAC (61/215)

100%|██████████| 20/20 [00:29<00:00,  1.50s/it]


1 errors found
[KWebscraping for company: MDT (62/215)

100%|██████████| 11/11 [00:30<00:00,  2.74s/it]


0 errors found
[KWebscraping for company: ADI (63/215)

100%|██████████| 20/20 [00:27<00:00,  1.35s/it]


3 errors found
[KWebscraping for company: NKE (64/215)

100%|██████████| 21/21 [00:29<00:00,  1.41s/it]


1 errors found
[KWebscraping for company: CB (65/215)

100%|██████████| 20/20 [01:52<00:00,  5.63s/it]


0 errors found
[KWebscraping for company: MO (66/215)

100%|██████████| 20/20 [00:32<00:00,  1.64s/it]


0 errors found
[KWebscraping for company: SO (67/215)

100%|██████████| 20/20 [05:19<00:00, 16.00s/it]


0 errors found
[KWebscraping for company: SBUX (68/215)

100%|██████████| 20/20 [00:36<00:00,  1.83s/it]


0 errors found
[KWebscraping for company: PLD (69/215)

100%|██████████| 20/20 [01:42<00:00,  5.14s/it]


1 errors found
[KWebscraping for company: MMC (70/215)

100%|██████████| 20/20 [00:37<00:00,  1.89s/it]


2 errors found
[KWebscraping for company: LMT (71/215)

100%|██████████| 20/20 [00:27<00:00,  1.35s/it]


0 errors found
[KWebscraping for company: DUK (72/215)

100%|██████████| 19/19 [03:46<00:00, 11.94s/it]


2 errors found
[KWebscraping for company: PH (73/215)

100%|██████████| 20/20 [00:18<00:00,  1.11it/s]

7 errors found
[KWebscraping for company: MCO (74/215)


100%|██████████| 20/20 [00:39<00:00,  1.96s/it]


3 errors found
[KWebscraping for company: WM (75/215)

100%|██████████| 20/20 [00:40<00:00,  2.04s/it]


0 errors found
[KWebscraping for company: CTAS (76/215)

100%|██████████| 21/21 [00:32<00:00,  1.56s/it]


3 errors found
[KWebscraping for company: BMY (77/215)

100%|██████████| 20/20 [00:48<00:00,  2.44s/it]


8 errors found
[KWebscraping for company: MCK (78/215)

100%|██████████| 21/21 [00:33<00:00,  1.60s/it]


1 errors found
[KWebscraping for company: INTC (79/215)

100%|██████████| 20/20 [00:42<00:00,  2.15s/it]


5 errors found
[KWebscraping for company: GD (80/215)

100%|██████████| 20/20 [00:31<00:00,  1.60s/it]


0 errors found
[KWebscraping for company: SHW (81/215)

100%|██████████| 20/20 [00:14<00:00,  1.39it/s]

2 errors found
[KWebscraping for company: NOC (82/215)


100%|██████████| 20/20 [00:30<00:00,  1.52s/it]


0 errors found
[KWebscraping for company: EMR (83/215)

100%|██████████| 20/20 [00:14<00:00,  1.41it/s]

9 errors found
[KWebscraping for company: MMM (84/215)


100%|██████████| 20/20 [00:53<00:00,  2.66s/it]


0 errors found
[KWebscraping for company: CVS (85/215)

100%|██████████| 21/21 [00:18<00:00,  1.12it/s]

1 errors found
[KWebscraping for company: AON (86/215)


100%|██████████| 20/20 [00:45<00:00,  2.28s/it]


1 errors found
[KWebscraping for company: PNC (87/215)

100%|██████████| 20/20 [01:35<00:00,  4.78s/it]


0 errors found
[KWebscraping for company: ITW (88/215)

100%|██████████| 20/20 [00:28<00:00,  1.43s/it]

5 errors found
[KWebscraping for company: ECL (89/215)


100%|██████████| 20/20 [00:37<00:00,  1.89s/it]

3 errors found
[KWebscraping for company: MSI (90/215)


100%|██████████| 20/20 [00:36<00:00,  1.84s/it]


7 errors found
[KWebscraping for company: UPS (91/215)

100%|██████████| 20/20 [00:43<00:00,  2.17s/it]

3 errors found
[KWebscraping for company: WMB (92/215)


100%|██████████| 20/20 [00:53<00:00,  2.66s/it]


0 errors found
[KWebscraping for company: CI (93/215)

100%|██████████| 7/7 [00:15<00:00,  2.28s/it]


1 errors found
[KWebscraping for company: MAR (94/215)

100%|██████████| 20/20 [00:33<00:00,  1.67s/it]


0 errors found
[KWebscraping for company: BK (95/215)

100%|██████████| 18/18 [00:08<00:00,  2.14it/s]

1 errors found
[KWebscraping for company: USB (96/215)


100%|██████████| 20/20 [00:14<00:00,  1.36it/s]


15 errors found
[KWebscraping for company: JCI (97/215)

100%|██████████| 20/20 [00:50<00:00,  2.50s/it]


3 errors found
[KWebscraping for company: CL (98/215)

100%|██████████| 20/20 [00:35<00:00,  1.78s/it]

8 errors found
[KWebscraping for company: NEM (99/215)


100%|██████████| 20/20 [01:16<00:00,  3.82s/it]

16 errors found
[KWebscraping for company: CSX (100/215)


100%|██████████| 20/20 [00:35<00:00,  1.77s/it]

0 errors found
[KWebscraping for company: EOG (101/215)


100%|██████████| 20/20 [00:34<00:00,  1.72s/it]

11 errors found
[KWebscraping for company: ADSK (102/215)


100%|██████████| 21/21 [00:32<00:00,  1.54s/it]


1 errors found
[KWebscraping for company: APD (103/215)

100%|██████████| 20/20 [00:33<00:00,  1.70s/it]


1 errors found
[KWebscraping for company: AZO (104/215)

100%|██████████| 20/20 [00:20<00:00,  1.02s/it]


3 errors found
[KWebscraping for company: NSC (105/215)

100%|██████████| 20/20 [00:26<00:00,  1.32s/it]


7 errors found
[KWebscraping for company: SPG (106/215)

100%|██████████| 20/20 [00:57<00:00,  2.87s/it]

3 errors found
[KWebscraping for company: AEP (107/215)


100%|██████████| 20/20 [00:50<00:00,  2.53s/it]

7 errors found
[KWebscraping for company: TRV (108/215)


100%|██████████| 20/20 [01:22<00:00,  4.12s/it]


0 errors found
[KWebscraping for company: FCX (109/215)

100%|██████████| 20/20 [01:58<00:00,  5.90s/it]


0 errors found
[KWebscraping for company: ALL (110/215)

100%|██████████| 20/20 [01:49<00:00,  5.46s/it]


5 errors found
[KWebscraping for company: GLW (111/215)

100%|██████████| 20/20 [00:43<00:00,  2.15s/it]


16 errors found
[KWebscraping for company: AFL (112/215)

100%|██████████| 20/20 [01:29<00:00,  4.50s/it]


0 errors found
[KWebscraping for company: SRE (113/215)

100%|██████████| 20/20 [00:48<00:00,  2.44s/it]


11 errors found
[KWebscraping for company: FDX (114/215)

100%|██████████| 21/21 [00:42<00:00,  2.03s/it]


1 errors found
[KWebscraping for company: PAYX (115/215)

100%|██████████| 21/21 [00:25<00:00,  1.23s/it]

1 errors found
[KWebscraping for company: PCAR (116/215)


100%|██████████| 20/20 [00:28<00:00,  1.40s/it]

8 errors found
[KWebscraping for company: BDX (117/215)


100%|██████████| 20/20 [00:24<00:00,  1.21s/it]


4 errors found
[KWebscraping for company: MET (118/215)

100%|██████████| 20/20 [03:19<00:00,  9.98s/it]


1 errors found
[KWebscraping for company: CMI (119/215)

100%|██████████| 20/20 [00:45<00:00,  2.27s/it]


0 errors found
[KWebscraping for company: SLB (120/215)

100%|██████████| 20/20 [00:46<00:00,  2.31s/it]


0 errors found
[KWebscraping for company: GWW (121/215)

100%|██████████| 20/20 [00:22<00:00,  1.13s/it]


9 errors found
[KWebscraping for company: D (122/215)

100%|██████████| 20/20 [01:28<00:00,  4.42s/it]


1 errors found
[KWebscraping for company: KR (123/215)

100%|██████████| 21/21 [00:38<00:00,  1.85s/it]


15 errors found
[KWebscraping for company: TGT (124/215)

100%|██████████| 21/21 [00:27<00:00,  1.32s/it]

2 errors found
[KWebscraping for company: AIG (125/215)


100%|██████████| 20/20 [02:53<00:00,  8.68s/it]


2 errors found
[KWebscraping for company: EXC (126/215)

100%|██████████| 20/20 [05:03<00:00, 15.17s/it]


0 errors found
[KWebscraping for company: PEG (127/215)

100%|██████████| 20/20 [01:44<00:00,  5.24s/it]


6 errors found
[KWebscraping for company: F (128/215)

100%|██████████| 20/20 [01:20<00:00,  4.05s/it]


6 errors found
[KWebscraping for company: OXY (129/215)

100%|██████████| 20/20 [00:55<00:00,  2.79s/it]


5 errors found
[KWebscraping for company: VLO (130/215)

100%|██████████| 20/20 [00:51<00:00,  2.57s/it]


0 errors found
[KWebscraping for company: EBAY (131/215)

100%|██████████| 20/20 [00:36<00:00,  1.81s/it]


5 errors found
[KWebscraping for company: XEL (132/215)

100%|██████████| 20/20 [01:07<00:00,  3.39s/it]


0 errors found
[KWebscraping for company: KMB (133/215)

100%|██████████| 20/20 [00:29<00:00,  1.47s/it]


0 errors found
[KWebscraping for company: CCL (134/215)

100%|██████████| 20/20 [00:14<00:00,  1.36it/s]

6 errors found
[KWebscraping for company: YUM (135/215)


100%|██████████| 20/20 [00:36<00:00,  1.85s/it]

7 errors found
[KWebscraping for company: ETR (136/215)


100%|██████████| 20/20 [05:07<00:00, 15.36s/it]


16 errors found
[KWebscraping for company: ROK (137/215)

100%|██████████| 20/20 [00:28<00:00,  1.43s/it]

3 errors found
[KWebscraping for company: EA (138/215)


100%|██████████| 21/21 [00:28<00:00,  1.36s/it]


1 errors found
[KWebscraping for company: SYY (139/215)

100%|██████████| 20/20 [00:36<00:00,  1.84s/it]


6 errors found
[KWebscraping for company: HSY (140/215)

100%|██████████| 20/20 [00:30<00:00,  1.51s/it]

7 errors found
[KWebscraping for company: CAH (141/215)


100%|██████████| 20/20 [00:26<00:00,  1.33s/it]


11 errors found
[KWebscraping for company: ED (142/215)

100%|██████████| 20/20 [00:59<00:00,  2.98s/it]


1 errors found
[KWebscraping for company: PRU (143/215)

100%|██████████| 20/20 [03:02<00:00,  9.10s/it]


0 errors found
[KWebscraping for company: VMC (144/215)

100%|██████████| 18/18 [00:30<00:00,  1.72s/it]


0 errors found
[KWebscraping for company: HIG (145/215)

100%|██████████| 20/20 [01:44<00:00,  5.25s/it]


9 errors found
[KWebscraping for company: NUE (146/215)

100%|██████████| 20/20 [00:15<00:00,  1.33it/s]


0 errors found
[KWebscraping for company: A (147/215)

100%|██████████| 20/20 [00:34<00:00,  1.74s/it]

1 errors found
[KWebscraping for company: STT (148/215)


100%|██████████| 20/20 [01:27<00:00,  4.36s/it]


6 errors found
[KWebscraping for company: MTB (149/215)

100%|██████████| 20/20 [01:35<00:00,  4.78s/it]


1 errors found
[KWebscraping for company: HUM (150/215)

100%|██████████| 20/20 [00:40<00:00,  2.03s/it]


3 errors found
[KWebscraping for company: EFX (151/215)

100%|██████████| 20/20 [00:28<00:00,  1.40s/it]

4 errors found
[KWebscraping for company: DTE (152/215)


100%|██████████| 20/20 [01:17<00:00,  3.89s/it]


1 errors found
[KWebscraping for company: K (153/215)

100%|██████████| 21/21 [00:35<00:00,  1.70s/it]

12 errors found
[KWebscraping for company: AEE (154/215)


100%|██████████| 20/20 [01:29<00:00,  4.47s/it]


0 errors found
[KWebscraping for company: GIS (155/215)

100%|██████████| 21/21 [00:37<00:00,  1.80s/it]


2 errors found
[KWebscraping for company: ADM (156/215)

100%|██████████| 20/20 [00:37<00:00,  1.88s/it]

3 errors found
[KWebscraping for company: CNP (157/215)


100%|██████████| 20/20 [00:47<00:00,  2.39s/it]


6 errors found
[KWebscraping for company: DOV (158/215)

100%|██████████| 20/20 [00:31<00:00,  1.58s/it]


2 errors found
[KWebscraping for company: NTRS (159/215)

100%|██████████| 20/20 [1:00:32<00:00, 181.61s/it]


0 errors found
[KWebscraping for company: EQR (160/215)

100%|██████████| 20/20 [01:41<00:00,  5.10s/it]


0 errors found
[KWebscraping for company: IP (161/215)

100%|██████████| 20/20 [00:36<00:00,  1.83s/it]


0 errors found
[KWebscraping for company: FE (162/215)

100%|██████████| 20/20 [01:48<00:00,  5.41s/it]


7 errors found
[KWebscraping for company: HBAN (163/215)

100%|██████████| 20/20 [01:22<00:00,  4.13s/it]


0 errors found
[KWebscraping for company: PPG (164/215)

100%|██████████| 20/20 [00:27<00:00,  1.37s/it]


0 errors found
[KWebscraping for company: FOXA (165/215)

100%|██████████| 6/6 [00:07<00:00,  1.23s/it]

0 errors found
[KWebscraping for company: DRI (166/215)


100%|██████████| 21/21 [00:15<00:00,  1.39it/s]


2 errors found
[KWebscraping for company: HPQ (167/215)

100%|██████████| 20/20 [00:45<00:00,  2.29s/it]


0 errors found
[KWebscraping for company: CINF (168/215)

100%|██████████| 20/20 [01:12<00:00,  3.63s/it]


5 errors found
[KWebscraping for company: RF (169/215)

100%|██████████| 20/20 [01:12<00:00,  3.63s/it]


5 errors found
[KWebscraping for company: TROW (170/215)

100%|██████████| 20/20 [00:18<00:00,  1.09it/s]

1 errors found
[KWebscraping for company: TPR (171/215)


100%|██████████| 20/20 [00:22<00:00,  1.12s/it]

1 errors found
[KWebscraping for company: PHM (172/215)


100%|██████████| 20/20 [00:29<00:00,  1.46s/it]


0 errors found
[KWebscraping for company: CMS (173/215)

100%|██████████| 20/20 [00:54<00:00,  2.73s/it]


3 errors found
[KWebscraping for company: LH (174/215)

100%|██████████| 21/21 [00:26<00:00,  1.27s/it]


5 errors found
[KWebscraping for company: DVN (175/215)

100%|██████████| 20/20 [00:50<00:00,  2.51s/it]


0 errors found
[KWebscraping for company: NTAP (176/215)

100%|██████████| 21/21 [00:34<00:00,  1.65s/it]


1 errors found
[KWebscraping for company: KEY (177/215)

100%|██████████| 20/20 [01:15<00:00,  3.79s/it]


0 errors found
[KWebscraping for company: NI (178/215)

100%|██████████| 20/20 [00:35<00:00,  1.78s/it]


2 errors found
[KWebscraping for company: EIX (179/215)

100%|██████████| 20/20 [00:51<00:00,  2.58s/it]


1 errors found
[KWebscraping for company: L (180/215)

100%|██████████| 20/20 [01:21<00:00,  4.10s/it]


2 errors found
[KWebscraping for company: MKC (181/215)

100%|██████████| 20/20 [00:21<00:00,  1.06s/it]

12 errors found
[KWebscraping for company: HAL (182/215)


100%|██████████| 20/20 [00:23<00:00,  1.19s/it]


5 errors found
[KWebscraping for company: DGX (183/215)

100%|██████████| 20/20 [00:41<00:00,  2.08s/it]


0 errors found
[KWebscraping for company: BIIB (184/215)

100%|██████████| 20/20 [00:42<00:00,  2.13s/it]


0 errors found
[KWebscraping for company: IFF (185/215)

100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


3 errors found
[KWebscraping for company: ZBH (186/215)

100%|██████████| 20/20 [00:35<00:00,  1.75s/it]


0 errors found
[KWebscraping for company: WY (187/215)

100%|██████████| 20/20 [00:47<00:00,  2.40s/it]


0 errors found
[KWebscraping for company: GPC (188/215)

100%|██████████| 20/20 [00:20<00:00,  1.01s/it]

6 errors found
[KWebscraping for company: PFG (189/215)


100%|██████████| 20/20 [02:00<00:00,  6.02s/it]


0 errors found
[KWebscraping for company: WAT (190/215)

100%|██████████| 20/20 [00:26<00:00,  1.31s/it]


0 errors found
[KWebscraping for company: SNA (191/215)

100%|██████████| 21/21 [00:37<00:00,  1.76s/it]


2 errors found
[KWebscraping for company: LUV (192/215)

100%|██████████| 20/20 [00:26<00:00,  1.34s/it]


0 errors found
[KWebscraping for company: CLX (193/215)

100%|██████████| 20/20 [00:09<00:00,  2.17it/s]

3 errors found
[KWebscraping for company: MAS (194/215)


100%|██████████| 20/20 [00:27<00:00,  1.35s/it]


1 errors found
[KWebscraping for company: OMC (195/215)

100%|██████████| 20/20 [00:21<00:00,  1.08s/it]


5 errors found
[KWebscraping for company: TXT (196/215)

100%|██████████| 21/21 [00:37<00:00,  1.77s/it]


7 errors found
[KWebscraping for company: BBY (197/215)

100%|██████████| 20/20 [00:39<00:00,  1.98s/it]

1 errors found
[KWebscraping for company: AVY (198/215)


100%|██████████| 21/21 [00:15<00:00,  1.34it/s]


2 errors found
[KWebscraping for company: BEN (199/215)

100%|██████████| 20/20 [00:35<00:00,  1.76s/it]


0 errors found
[KWebscraping for company: BAX (200/215)

100%|██████████| 20/20 [00:25<00:00,  1.27s/it]


3 errors found
[KWebscraping for company: PNW (201/215)

100%|██████████| 20/20 [01:01<00:00,  3.09s/it]


0 errors found
[KWebscraping for company: HAS (202/215)

100%|██████████| 20/20 [00:27<00:00,  1.38s/it]


0 errors found
[KWebscraping for company: SWK (203/215)

100%|██████████| 21/21 [00:37<00:00,  1.81s/it]


4 errors found
[KWebscraping for company: WBA (204/215)

100%|██████████| 10/10 [00:15<00:00,  1.56s/it]


1 errors found
[KWebscraping for company: TAP (205/215)

100%|██████████| 20/20 [01:03<00:00,  3.19s/it]


0 errors found
[KWebscraping for company: CPB (206/215)

100%|██████████| 20/20 [00:30<00:00,  1.53s/it]


2 errors found
[KWebscraping for company: AES (207/215)

100%|██████████| 20/20 [01:17<00:00,  3.88s/it]


4 errors found
[KWebscraping for company: IPG (208/215)

100%|██████████| 20/20 [00:37<00:00,  1.87s/it]


0 errors found
[KWebscraping for company: CAG (209/215)

100%|██████████| 21/21 [00:33<00:00,  1.57s/it]


3 errors found
[KWebscraping for company: EMN (210/215)

100%|██████████| 20/20 [00:46<00:00,  2.30s/it]


1 errors found
[KWebscraping for company: APA (211/215)

100%|██████████| 4/4 [00:05<00:00,  1.45s/it]


0 errors found
[KWebscraping for company: FITB (212/215)

100%|██████████| 20/20 [01:11<00:00,  3.59s/it]


13 errors found
[KWebscraping for company: BF.B (213/215)

100%|██████████| 21/21 [00:20<00:00,  1.04it/s]

8 errors found
[KWebscraping for company: HES (214/215)


100%|██████████| 20/20 [00:43<00:00,  2.20s/it]

10 errors found
[KWebscraping for company: PPL (215/215)


100%|██████████| 20/20 [02:53<00:00,  8.69s/it]


0 errors found


In [5]:
print(f'Errors in total: {len(errors)} out of {total_documents} documents ({((len(errors)/total_documents)*100):.3f}%)')
errors.to_csv("webscraping_errors.csv", index = False)

Errors in total: 716 out of 4241 documents (16.883%)


In [14]:
with open("section_1A/PPL.json", "r") as file:
    test_json = json.load(file)

test_json


#pickle file mit Datensatz, columns = [cik, date, section]; Nones rauslassen
#

sec_1A = test_json["2024"]["section_1A"]
sec_1A = set(sec_1A)
sec_1A

{'(',
 '(All Registrants)',
 '(PPL and LG&E)',
 '(PPL and PPL Electric)',
 '(PPL)',
 '(PPL, LG&E and KU)',
 ')',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 'A downgrade in our credit ratings could negatively affect our ability to access capital and increase the cost of maintaining our credit facilities and any new debt.',
 'A pandemic health event and related remediation efforts could present challenges to businesses, communities, workforces, markets and supply chains. At this time, the Registrants cannot predict the ways in which and the extent to which these or other pandemic-related factors may affect their business, earnings or other financial results.',
 'A.',
 'A. Risk Related to Registrant Holding Company',
 'A.Risks Related to Registrant Holding Company',
 'AI technologies are still in their early stages of development and deployment. Ineffective or inadequate AI development or deployment practices by PPL, its subsidiaries or third-party vendors could result in unintende