In [1]:
import random
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as FirefoxService
from bs4 import BeautifulSoup

def get_edgar_data_selenium(cik):
    """
    Load the classic SEC EDGAR company page for ``cik`` in headless Firefox.

    A fresh WebDriver session is started for every call and is always shut
    down before returning, even when navigation fails, so that failed
    requests do not leave orphaned Firefox/geckodriver processes behind.

    Args:
        cik: Company identifier; it is interpolated verbatim into the
            ``CIK=`` query parameter of the EDGAR URL.

    Returns:
        The page source (HTML string) reported by the driver after a fixed
        3 second wait.

    Raises:
        Whatever Selenium raises when the driver cannot be started or the
        page cannot be loaded; the exception is propagated after cleanup.
    """
    # Set up Firefox options for headless mode
    firefox_options = FirefoxOptions()
    firefox_options.add_argument("--headless")
    firefox_options.add_argument("--no-sandbox")
    firefox_options.add_argument("--disable-dev-shm-usage")
    # Set the binary location (adjust if necessary)
    firefox_options.binary_location = "/Applications/Firefox.app/Contents/MacOS/firefox"

    # Create a Service object pointing to geckodriver (adjust the path if needed)
    service = FirefoxService(executable_path="/opt/homebrew/bin/geckodriver")

    # Initialize the Firefox WebDriver with the service and options
    driver = webdriver.Firefox(service=service, options=firefox_options)
    try:
        # Construct the SEC EDGAR URL for the given CIK (using the classic endpoint)
        url = f"https://www.sec.gov/cgi-bin/browse-edgar?CIK={cik}&owner=exclude&action=getcompany"
        driver.get(url)

        # Allow time for the page to load completely
        time.sleep(3)

        # Get the rendered HTML
        return driver.page_source
    finally:
        # Always release the browser, including when get()/page_source raise.
        driver.quit()

In [2]:

def get_sic_code_from_html(html):
    """
    Extract the SIC code from an EDGAR company page.

    The HTML is parsed with BeautifulSoup's built-in ``html.parser`` and the
    first ``<span id="SIC">`` element is looked up.

    Returns:
        The whitespace-stripped text of that element, or ``None`` when the
        page contains no such element.
    """
    document = BeautifulSoup(html, 'html.parser')
    first_match = document.find("span", id="SIC")
    if first_match is None:
        return None
    return first_match.get_text(strip=True)



In [3]:
def process_company(cik):
    """
    Fetch the EDGAR page for one company and pull out its SIC code.

    Any exception raised while fetching or parsing is reported on stdout and
    turned into a ``None`` SIC code, so one bad company does not abort the
    caller. The function then pauses for a random 1-3 seconds to keep the
    request rate against the server low.

    Returns:
        A ``(cik, sic_code)`` tuple; ``sic_code`` is ``None`` on failure or
        when the page carries no SIC code.
    """
    sic_code = None
    try:
        page_html = get_edgar_data_selenium(cik)
        sic_code = get_sic_code_from_html(page_html)
    except Exception as e:
        print(f"Error processing CIK {cik}: {e}")
    # Random politeness delay between 1 and 3 seconds
    pause_seconds = random.uniform(1, 3)
    time.sleep(pause_seconds)
    return cik, sic_code


In [4]:
# Load the company table and normalise SIC_code: missing values become 0 and
# the column is stored as integers.
companyDataall = pd.read_csv('/Users/mayankbambal/Desktop/SIC Scraper/data/final/CIK_SIC.csv')
companyDataall['SIC_code'] = companyDataall['SIC_code'].fillna(0).astype(int)
# Keep only the companies that still have no SIC code (0). Take an explicit
# copy: process_single_batch assigns into this frame with .loc, and writing
# into a filtered slice of companyDataall raises SettingWithCopyWarning.
companyData = companyDataall.loc[companyDataall['SIC_code'] == 0].copy()


In [5]:
def process_batch(batch_df, max_workers=3):
    """
    Run ``process_company`` concurrently for every company in a batch.

    Args:
        batch_df: DataFrame with a ``cik_str`` column; one task is submitted
            per row.
        max_workers: Size of the thread pool.

    Returns:
        A list of ``(cik, sic_code)`` tuples, in the order the tasks finished
        (not the row order of ``batch_df``).
    """
    # Read the CIKs straight from the column. DataFrame.iterrows() builds one
    # Series per row and upcasts to a common dtype, so an integer CIK sitting
    # next to float columns would come out as e.g. 320193.0 and end up in the
    # request URL in that form.
    ciks = batch_df['cik_str'].tolist()

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_company, cik) for cik in ciks]
        for future in as_completed(futures):
            cik, sic_code = future.result()
            results.append((cik, sic_code))
    return results

In [6]:
def process_single_batch(companyData, batch_num, batch_size=1000, max_workers=3):
    """
    Scrape SIC codes for one positional slice of ``companyData``.

    The slice ``[batch_num * batch_size, (batch_num + 1) * batch_size)`` is
    processed with ``process_batch``; the resulting codes are written into the
    ``SIC_code`` column of those rows, and the whole frame is saved to
    ``companyData_with_SIC_codes.csv`` in the current working directory.

    Args:
        companyData: The full DataFrame containing company data. It needs
            ``cik_str`` and ``SIC_code`` columns and is modified in place.
        batch_num: The batch number to process (starting from 0).
        batch_size: Number of companies to process in one batch.
        max_workers: Maximum number of threads for parallel processing.

    Returns:
        The same ``companyData`` object, updated for this batch.
    """
    start = batch_num * batch_size
    end = start + batch_size
    batch_df = companyData.iloc[start:end]
    print(f"Processing batch {batch_num} (index {start} to {end})...")

    batch_results = process_batch(batch_df, max_workers=max_workers)

    # Create a mapping dictionary from the results
    batch_map = dict(batch_results)

    # Scraped codes arrive as text (or None when nothing was found). Convert
    # them to integers before writing so SIC_code stays a numeric column
    # instead of becoming a mix of strings and ints; anything missing or
    # non-numeric is stored as 0, the "no SIC code" marker.
    scraped = batch_df['cik_str'].map(batch_map)
    sic_codes = pd.to_numeric(scraped, errors='coerce').fillna(0).astype(int)
    companyData.loc[batch_df.index, 'SIC_code'] = sic_codes

    # Save the updated DataFrame to CSV
    companyData.to_csv("companyData_with_SIC_codes.csv", index=False)
    print(f"Batch {batch_num} processed and saved.")
    return companyData

In [None]:
# Zero-based number of the batch to scrape in this run; edit before re-running.
batch_number = 0
companyData = process_single_batch(
    companyData,
    batch_number,
    batch_size=500,
    max_workers=8,
)

Processing batch 0 (index 0 to 500)...


In [None]:
# Write the current state of the frame to the final output file, without the index.
companyData.to_csv(
    path_or_buf='/Users/mayankbambal/Desktop/SIC Scraper/data/final/second.csv',
    index=False,
)