In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import os

In [3]:
# load 8k parquet file
df_8k = pd.read_parquet("gs://sec-financials-edgar/edgar_idx_files/filtered/company_index_8k.parquet")

In [4]:
df_8k.head()

Unnamed: 0,Company Name,Form Type,CIK,Date Filed,Filename,Year,Quarter,SourceFile
0,LOUISIANA LAND & EXPLORATION CO,8-K,60512,1993-10-29,edgar/data/60512/0000060512-94-000008.txt,1993,QTR4,1993_QTR4_company.idx
1,3COM CORP,8-K,738076,1994-01-31,edgar/data/738076/0000738076-94-000004.txt,1994,QTR1,1994_QTR1_company.idx
2,3COM CORP,8-K,738076,1994-02-11,edgar/data/738076/0000738076-94-000005.txt,1994,QTR1,1994_QTR1_company.idx
3,ACME METALS INC,8-K,883702,1994-03-07,edgar/data/883702/0000912057-94-000799.txt,1994,QTR1,1994_QTR1_company.idx
4,ADDINGTON RESOURCES INC,8-K,810665,1994-01-28,edgar/data/810665/0000950131-94-000054.txt,1994,QTR1,1994_QTR1_company.idx


In [5]:
HEADERS = {"User-Agent": "Naresh Chethala Research (nchethala@wne.edu)"}

def get_8k_html_link(index_url):
    """
    Fetch an SEC filing index page and extract the direct link
    to the main 8-K HTML document.
    """
    try:
        # 1. Request the index page
        resp = requests.get(index_url, headers=HEADERS, timeout=15)
        if resp.status_code != 200:
            print(f"⚠️ {resp.status_code} for {index_url}")
            return None

        # 2. Parse with BeautifulSoup
        soup = BeautifulSoup(resp.text, "html.parser")

        # 3. Locate the table listing documents
        table = soup.find("table", {"class": "tableFile"})
        if not table:
            print(f"⚠️ No table found on page: {index_url}")
            return None

        # 4. Iterate through rows
        for row in table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) < 4:
                continue

            doc_link_tag = cols[2].find("a")
            doc_type = cols[3].get_text(strip=True).upper()

            # 5. Look for 8-K document type
            if doc_type == "8-K" and doc_link_tag:
                rel_url = doc_link_tag.get("href")
                full_url = urljoin("https://www.sec.gov", rel_url)
                return full_url

        print(f"⚠️ No 8-K link found in: {index_url}")
        return None

    except Exception as e:
        print(f"⚠️ Error processing {index_url}: {e}")
        return None

In [9]:
# filter df_8k to have the data from the year 2010 only and random sample of 1000 rows
df_sample = df_8k[df_8k["Date Filed"].str.startswith("2010")].sample(n=1000, random_state=1).copy()

In [10]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 936626 to 973523
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company Name  1000 non-null   string
 1   Form Type     1000 non-null   string
 2   CIK           1000 non-null   string
 3   Date Filed    1000 non-null   object
 4   Filename      1000 non-null   string
 5   Year          1000 non-null   int32 
 6   Quarter       1000 non-null   string
 7   SourceFile    1000 non-null   string
dtypes: int32(1), object(1), string(6)
memory usage: 160.9+ KB


In [None]:
# df_sample = df_8k.head(100).copy()

In [None]:
# # ============================================================
# # CONFIG
# # ============================================================
# HEADERS = {"User-Agent": "Naresh Chethala Research (nchethala@wne.edu)"}

# # ============================================================
# # 1️⃣ Filename → SEC index page URL (UNCHANGED LOGIC)
# # ============================================================
# def build_sec_url(filename):
#     """
#     Converts the Filename path from master.idx into a valid SEC filing index URL.
#     Example:
#       input : edgar/data/320187/0000320187-25-000060.txt
#       output: https://www.sec.gov/Archives/edgar/data/320187/000032018725000060/0000320187-25-000060-index.html
#     """
#     try:
#         match = re.search(r"edgar/data/(\d+)/(\d{10}-\d{2}-\d{6})", filename)
#         if not match:
#             return None
#         cik = match.group(1)
#         accession = match.group(2)
#         accession_no_dash = accession.replace("-", "")
#         return (
#             f"https://www.sec.gov/Archives/edgar/data/"
#             f"{cik}/{accession_no_dash}/{accession}-index.html"
#         )
#     except Exception:
#         return None


# # ============================================================
# # 2️⃣ Index page URL → actual 8-K HTML URL (UNCHANGED LOGIC)
# # ============================================================
# def get_8k_html_link(index_url):
#     """
#     Fetch an SEC filing index page and extract the direct link
#     to the main 8-K HTML document.
#     """
#     try:
#         resp = requests.get(index_url, headers=HEADERS, timeout=15)
#         if resp.status_code != 200:
#             return None

#         soup = BeautifulSoup(resp.text, "html.parser")

#         table = soup.find("table", {"class": "tableFile"})
#         if not table:
#             return None

#         for row in table.find_all("tr"):
#             cols = row.find_all("td")
#             if len(cols) < 4:
#                 continue

#             doc_link_tag = cols[2].find("a")
#             doc_type = cols[3].get_text(strip=True).upper()

#             if doc_type == "8-K" and doc_link_tag:
#                 rel_url = doc_link_tag.get("href")
#                 return urljoin("https://www.sec.gov", rel_url)

#         return None

#     except Exception:
#         return None


# # ============================================================
# # 3️⃣ DataFrame helper: Filename → Index URL → 8-K URL
# # ============================================================
# def add_sec_urls_to_df(df, filename_col="Filename"):
#     """
#     Adds two columns to the DataFrame:
#       - Filing_Index_URL
#       - EightK_HTML_URL
#     """
#     df = df.copy()

#     # Step 1: build index URLs
#     df["Filing_Index_URL"] = df[filename_col].apply(build_sec_url)

#     # Drop rows where index URL could not be built
#     df = df.dropna(subset=["Filing_Index_URL"]).reset_index(drop=True)

#     # Step 2: resolve index → 8-K HTML
#     df["EightK_HTML_URL"] = df["Filing_Index_URL"].apply(get_8k_html_link)

#     return df


# # ============================================================
# # Example usage
# # ============================================================
# if __name__ == "__main__":
#     #file_path = "/Users/nareshchethala/Library/CloudStorage/GoogleDrive-nareshchethala99@gmail.com/My Drive/capstone_project/data/sec_idx_files/all_8k_filings.csv"

#     df = df_sample  # pd.read_csv(file_path)

#     df = add_sec_urls_to_df(df)

#     print("✅ Sample resolved URLs:")
#     print(df[["Filename", "Filing_Index_URL", "EightK_HTML_URL"]].head())

In [12]:
print(df["Filing_Index_URL"][0])

https://www.sec.gov/Archives/edgar/data/60512/000006051294000008/0000060512-94-000008-index.html


In [11]:
import pandas as pd
import re

def build_sec_url(filename):
    """
    Converts the Filename path from master.idx into a valid SEC filing URL.
    Example input: 'edgar/data/320187/0000320187-25-000060.txt'
    Output: 'https://www.sec.gov/Archives/edgar/data/320187/000032018725000060/0000320187-25-000060-index.html'
    """
    try:
        match = re.search(r"edgar/data/(\d+)/(\d{10}-\d{2}-\d{6})", filename)
        if not match:
            return None
        cik = match.group(1)
        accession = match.group(2)
        accession_no_dash = accession.replace("-", "")
        url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_no_dash}/{accession}-index.html"
        return url
    except Exception:
        return None


# Example usage:
#file_path = "/Users/nareshchethala/Library/CloudStorage/GoogleDrive-nareshchethala99@gmail.com/My Drive/capstone_project/data/sec_idx_files/all_8k_filings.csv"   # adjust to your path
df = df_sample  # pd.read_csv(file_path)

# Build new Filing URL column using the corrected pattern
df["Filing URL"] = df["Filename"].apply(build_sec_url)

# Remove invalid ones
df = df.dropna(subset=["Filing URL"])

print("✅ Example URLs:")
print(df["Filing URL"].head(5))

✅ Example URLs:
936626    https://www.sec.gov/Archives/edgar/data/81018/...
940649    https://www.sec.gov/Archives/edgar/data/921590...
924893    https://www.sec.gov/Archives/edgar/data/145008...
991644    https://www.sec.gov/Archives/edgar/data/886128...
954323    https://www.sec.gov/Archives/edgar/data/94845/...
Name: Filing URL, dtype: object


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 936626 to 973523
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company Name  1000 non-null   string
 1   Form Type     1000 non-null   string
 2   CIK           1000 non-null   string
 3   Date Filed    1000 non-null   object
 4   Filename      1000 non-null   string
 5   Year          1000 non-null   int32 
 6   Quarter       1000 non-null   string
 7   SourceFile    1000 non-null   string
 8   Filing URL    1000 non-null   object
dtypes: int32(1), object(2), string(6)
memory usage: 168.7+ KB


In [None]:
HEADERS = {"User-Agent": "Naresh Chethala Research (nchethala@wne.edu)"}

def get_8k_html_link(index_url):
    """
    Fetch an SEC filing index page and extract the direct link
    to the main 8-K HTML document.
    """
    try:
        # 1. Request the index page
        resp = requests.get(index_url, headers=HEADERS, timeout=15)
        if resp.status_code != 200:
            print(f"⚠️ {resp.status_code} for {index_url}")
            return None

        # 2. Parse with BeautifulSoup
        soup = BeautifulSoup(resp.text, "html.parser")

        # 3. Locate the table listing documents
        table = soup.find("table", {"class": "tableFile"})
        if not table:
            print(f"⚠️ No table found on page: {index_url}")
            return None

        # 4. Iterate through rows
        for row in table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) < 4:
                continue

            doc_link_tag = cols[2].find("a")
            doc_type = cols[3].get_text(strip=True).upper()

            # 5. Look for 8-K document type
            if doc_type == "8-K" and doc_link_tag:
                rel_url = doc_link_tag.get("href")
                full_url = urljoin("https://www.sec.gov", rel_url)
                return full_url

        print(f"⚠️ No 8-K link found in: {index_url}")
        return None

    except Exception as e:
        print(f"⚠️ Error processing {index_url}: {e}")
        return None

In [21]:
df.head()

Unnamed: 0,Company Name,Form Type,CIK,Date Filed,Filename,Year,Quarter,SourceFile,Filing URL
0,LOUISIANA LAND & EXPLORATION CO,8-K,60512,1993-10-29,edgar/data/60512/0000060512-94-000008.txt,1993,QTR4,1993_QTR4_company.idx,https://www.sec.gov/Archives/edgar/data/60512/...
1,3COM CORP,8-K,738076,1994-01-31,edgar/data/738076/0000738076-94-000004.txt,1994,QTR1,1994_QTR1_company.idx,https://www.sec.gov/Archives/edgar/data/738076...
2,3COM CORP,8-K,738076,1994-02-11,edgar/data/738076/0000738076-94-000005.txt,1994,QTR1,1994_QTR1_company.idx,https://www.sec.gov/Archives/edgar/data/738076...
3,ACME METALS INC,8-K,883702,1994-03-07,edgar/data/883702/0000912057-94-000799.txt,1994,QTR1,1994_QTR1_company.idx,https://www.sec.gov/Archives/edgar/data/883702...
4,ADDINGTON RESOURCES INC,8-K,810665,1994-01-28,edgar/data/810665/0000950131-94-000054.txt,1994,QTR1,1994_QTR1_company.idx,https://www.sec.gov/Archives/edgar/data/810665...


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company Name  50 non-null     string
 1   Form Type     50 non-null     string
 2   CIK           50 non-null     string
 3   Date Filed    50 non-null     object
 4   Filename      50 non-null     string
 5   Year          50 non-null     int32 
 6   Quarter       50 non-null     string
 7   SourceFile    50 non-null     string
 8   Filing URL    50 non-null     object
dtypes: int32(1), object(2), string(6)
memory usage: 8.1+ KB


In [None]:
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

HEADERS = {"User-Agent": "Naresh Chethala Research (nchethala@wne.edu)"}

def get_8k_html_link(index_url):
    """
    Fetch an SEC filing index page and extract the direct link
    to the filing content, handling both:
      A) Rows where Type is '8-K'
      B) Rows where 'Complete submission text file' contains the .txt link
    """

    try:
        # ✅ SEC-friendly delay BEFORE request
        time.sleep(2)

        resp = requests.get(index_url, headers=HEADERS, timeout=15)
        if resp.status_code != 200:
            print(f"⚠️ {resp.status_code} for {index_url}")
            return None

        soup = BeautifulSoup(resp.text, "html.parser")
        table = soup.find("table", {"class": "tableFile"})
        if not table:
            print(f"⚠️ No table found on page: {index_url}")
            return None

        best_txt_link = None
        best_8k_link = None

        for tr in table.find_all("tr"):
            cols = tr.find_all("td")
            if not cols:
                continue  # skip header row

            # Expected structure:
            # [Seq, Description, Document, Type, Size]
            desc = cols[1].get_text(" ", strip=True).upper() if len(cols) > 1 else ""
            doc_a = cols[2].find("a") if len(cols) > 2 else None
            doc_href = doc_a.get("href") if doc_a else None
            doc_type = cols[3].get_text(" ", strip=True).upper() if len(cols) > 3 else ""

            if not doc_href:
                continue

            full_url = urljoin("https://www.sec.gov", doc_href)

            # Preferred: complete submission text file (.txt)
            if "COMPLETE SUBMISSION TEXT FILE" in desc and doc_href.lower().endswith(".txt"):
                print(f"✅ Found complete submission .txt for {index_url}")
                return full_url  # strongest match

            # Fallback: any 8-K typed document
            if doc_type.startswith("8-K") and best_8k_link is None:
                best_8k_link = full_url

        if best_8k_link:
            print(f"✅ Found 8-K link for {index_url}")
            return best_8k_link

        print(f"⚠️ No 8-K link found in: {index_url}")
        return None

    except Exception as e:
        print(f"⚠️ Error processing {index_url}: {e}")
        return None

In [14]:
# apply get_8k_html_link to the Filing URL column
df["8K_HTML_Link"] = df["Filing URL"].apply(get_8k_html_link)

✅ Found complete submission .txt for https://www.sec.gov/Archives/edgar/data/81018/000110465910017591/0001104659-10-017591-index.html
✅ Found complete submission .txt for https://www.sec.gov/Archives/edgar/data/921590/000100210510000068/0001002105-10-000068-index.html
✅ Found complete submission .txt for https://www.sec.gov/Archives/edgar/data/1450088/000147793210000096/0001477932-10-000096-index.html
✅ Found complete submission .txt for https://www.sec.gov/Archives/edgar/data/886128/000095012310113709/0000950123-10-113709-index.html
✅ Found complete submission .txt for https://www.sec.gov/Archives/edgar/data/94845/000095012310051938/0000950123-10-051938-index.html
✅ Found complete submission .txt for https://www.sec.gov/Archives/edgar/data/731653/000095012310070242/0000950123-10-070242-index.html
✅ Found complete submission .txt for https://www.sec.gov/Archives/edgar/data/1362988/000095012310089217/0000950123-10-089217-index.html
✅ Found complete submission .txt for https://www.sec.go

In [15]:
df.head()

Unnamed: 0,Company Name,Form Type,CIK,Date Filed,Filename,Year,Quarter,SourceFile,Filing URL,8K_HTML_Link
936626,PUBLIC SERVICE CO OF COLORADO,8-K,81018,2010-03-31,edgar/data/81018/0001104659-10-017591.txt,2010,QTR1,2010_QTR1_company.idx,https://www.sec.gov/Archives/edgar/data/81018/...,https://www.sec.gov/Archives/edgar/data/72903/...
940649,VALLEY FINANCIAL CORP /VA/,8-K,921590,2010-03-26,edgar/data/921590/0001002105-10-000068.txt,2010,QTR1,2010_QTR1_company.idx,https://www.sec.gov/Archives/edgar/data/921590...,https://www.sec.gov/Archives/edgar/data/921590...
924893,"CHARTER CORPORATE SERVICES, INC.",8-K,1450088,2010-02-25,edgar/data/1450088/0001477932-10-000096.txt,2010,QTR1,2010_QTR1_company.idx,https://www.sec.gov/Archives/edgar/data/145008...,https://www.sec.gov/Archives/edgar/data/145008...
991644,FUELCELL ENERGY INC,8-K,886128,2010-12-15,edgar/data/886128/0000950123-10-113709.txt,2010,QTR4,2010_QTR4_company.idx,https://www.sec.gov/Archives/edgar/data/886128...,https://www.sec.gov/Archives/edgar/data/886128...
954323,LEVI STRAUSS & CO,8-K,94845,2010-05-21,edgar/data/94845/0000950123-10-051938.txt,2010,QTR2,2010_QTR2_company.idx,https://www.sec.gov/Archives/edgar/data/94845/...,https://www.sec.gov/Archives/edgar/data/94845/...


In [22]:
# print one random 8-K HTML link
print(df["8K_HTML_Link"].sample(n=1, random_state=12).iloc[0])

https://www.sec.gov/Archives/edgar/data/100122/000095012310096305/0000950123-10-096305.txt


In [31]:
print(df["8K_HTML_Link"][0])

https://www.sec.gov/Archives/edgar/data/60512/0000060512-94-000008.txt
