In [2]:
import random
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as FirefoxService
from bs4 import BeautifulSoup

def get_edgar_data_selenium(cik):
    """
    Uses Selenium with Firefox to load the SEC EDGAR page for the given CIK.
    Returns the rendered HTML.
    """
    # Set up Firefox options for headless mode
    firefox_options = FirefoxOptions()
    firefox_options.add_argument("--headless")
    firefox_options.add_argument("--no-sandbox")
    firefox_options.add_argument("--disable-dev-shm-usage")
    # Set the binary location (adjust if necessary)
    firefox_options.binary_location = "/Applications/Firefox.app/Contents/MacOS/firefox"
    
    # Create a Service object pointing to geckodriver (adjust the path if needed)
    service = FirefoxService(executable_path="/opt/homebrew/bin/geckodriver")
    
    # Initialize the Firefox WebDriver with the service and options
    driver = webdriver.Firefox(service=service, options=firefox_options)
    
    # Construct the SEC EDGAR URL for the given CIK (using the classic endpoint)
    url = f"https://www.sec.gov/cgi-bin/browse-edgar?CIK={cik}&owner=exclude&action=getcompany"
    driver.get(url)
    
    # Allow time for the page to load completely
    time.sleep(3)
    
    # Get the rendered HTML
    html = driver.page_source
    driver.quit()
    return html

In [3]:

def get_sic_code_from_html(html):
    """
    Parses the provided HTML using BeautifulSoup and extracts the SIC code.
    It looks for a <span> element with id="SIC" and returns its text.
    """
    soup = BeautifulSoup(html, 'html.parser')
    sic_spans = soup.find_all("span", id="SIC")
    if sic_spans:
        return sic_spans[0].get_text(strip=True)
    return None



In [4]:
def process_company(cik):
    """
    Processes a single company: fetches its EDGAR page using Firefox,
    extracts the SIC code, and returns a tuple (cik, sic_code).
    A random delay is added to avoid overwhelming the server.
    """
    try:
        html_data = get_edgar_data_selenium(cik)
        sic_code = get_sic_code_from_html(html_data)
    except Exception as e:
        print(f"Error processing CIK {cik}: {e}")
        sic_code = None
    # Add a random delay between 1 and 3 seconds
    time.sleep(random.uniform(1, 3))
    return cik, sic_code


In [5]:
companyDataall = pd.read_csv('/Users/mayankbambal/Desktop/SIC Scraper/data/final/companyData.csv')
companyDatasecond = pd.read_csv('/Users/mayankbambal/Desktop/SIC Scraper/data/final/second.csv')


In [10]:
data1 = companyDataall.iloc[0:647,:]
datanew = pd.concat([data1,companyDatasecond], axis=0)

In [11]:
datanew.to_csv('/Users/mayankbambal/Desktop/SIC Scraper/data/final/CIK_SIC.csv',index=False)

In [7]:
companyDatasecond.tail()

Unnamed: 0,cik_str,ticker,title,SIC_code
9080,2029769,OACCU,Oaktree Acquisition Corp. III Life Sciences,
9081,2029769,OACCW,Oaktree Acquisition Corp. III Life Sciences,
9082,1555812,CPXTF,Siam Makro Public Co Limited/ADR,
9083,1546538,BTGRF,BTS Group Holdings Public Co Limited/ADR,8880.0
9084,1546538,BTLWF,BTS Group Holdings Public Co Limited/ADR,8880.0


In [5]:
# Use ThreadPoolExecutor to process companies concurrently
results = []
max_workers = 16  # Adjust number of workers based on your system
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_cik = {executor.submit(process_company, cik): cik for cik in companyData['cik_str']}
    for future in as_completed(future_to_cik):
        cik, sic_code = future.result()
        results.append((cik, sic_code))
        print(f"Processed CIK {cik} -> SIC code: {sic_code}")

# Map the extracted SIC codes back to the DataFrame
sic_map = {cik: sic for cik, sic in results}
companyData["SIC_code"] = companyData["cik_str"].map(sic_map)

Processed CIK 14693 -> SIC code: 2080
Processed CIK 1552033 -> SIC code: 7320
Processed CIK 1564708 -> SIC code: 2711
Processed CIK 352541 -> SIC code: 4931
Processed CIK 48465 -> SIC code: 2011
Processed CIK 1403568 -> SIC code: 5990
Processed CIK 1027664 -> SIC code: 3728
Processed CIK 1567094 -> SIC code: 3531
Processed CIK 74208 -> SIC code: 6798
Processed CIK 1474432 -> SIC code: 3572
Processed CIK 1677250 -> SIC code: 4210
Processed CIK 105770 -> SIC code: 3841
Processed CIK 1013237 -> SIC code: 7370
Processed CIK 904851 -> SIC code: 2911
Processed CIK 1754581 -> SIC code: 6211
Processed CIK 1263043 -> SIC code: 6021
Processed CIK 711404 -> SIC code: 3851
Processed CIK 1089063 -> SIC code: 5940
Processed CIK 746515 -> SIC code: 4731
Processed CIK 1333986 -> SIC code: 6411
Processed CIK 1528396 -> SIC code: 7372
Processed CIK 889132 -> SIC code: 3312
Processed CIK 1439124 -> SIC code: 4911
Processed CIK 1001474 -> SIC code: 4813
Processed CIK 106040 -> SIC code: 3572
Processed CIK

KeyboardInterrupt: 

Error processing CIK 885740: HTTPConnectionPool(host='localhost', port=65497): Max retries exceeded with url: /session/bce532cd-e327-4b31-9fad-b19e60fa53ac/source (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x12781ea50>: Failed to establish a new connection: [Errno 61] Connection refused'))


In [6]:
sic_map = {cik: sic for cik, sic in results}
companyData["SIC_code"] = companyData["cik_str"].map(sic_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  companyData["SIC_code"] = companyData["cik_str"].map(sic_map)


In [9]:
companyData.to_csv('/Users/mayankbambal/Desktop/SIC Scraper/data/final/second.csv',index=False)