In [37]:
import pandas as pd
# Load the company ticker table from a machine-specific absolute path.
df = pd.read_csv('/Users/mayankbambal/Desktop/SIC Scraper/data/final/company_tickers.csv')

In [38]:
# Quick size check: (rows, columns) of the loaded table.
df.shape

(9732, 4)

In [39]:
# Missing SIC codes become 0, then the column is coerced to int by way of float
# (presumably so values such as 3571.0 convert cleanly — TODO confirm).
df['SIC_code'] = df['SIC_code'].fillna(0).astype(float).astype(int)

In [40]:
# Rows whose SIC_code equals 0.
dfzero = df[df['SIC_code'].eq(0)]

In [41]:
# (rows, columns) of the subset with SIC_code == 0.
dfzero.shape

(858, 4)

In [3]:
# Write the current df to company_tickers2.csv without the index column.
# NOTE(review): this cell's execution count (In [3]) is out of sequence with
# the cells above, so what was actually saved depends on kernel state at the time.
df.to_csv('/Users/mayankbambal/Desktop/SIC Scraper/data/final/company_tickers2.csv', index=False)

In [2]:
# Rows whose SIC_code is missing (NaN), followed by their (rows, columns) count.
nosic = df.loc[df['SIC_code'].isna()]
nosic.shape

(1563, 4)

In [2]:
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as FirefoxService
from getcikstoparse import get_missing_sic_rows

# Setup user-agent header
# NOTE(review): `headers` is not referenced by any function visible in this
# notebook; the Selenium-based fetch below does not use it — confirm whether
# it is still needed.
headers = {'User-Agent': "mbambal@purdue.edu"}

def get_edgar_data_selenium(cik):
    """Fetch the EDGAR company page for ``cik`` using headless Firefox.

    A new Firefox/geckodriver session is started on every call and is shut
    down again before the function returns, whether or not the fetch worked.

    Args:
        cik: Value substituted into the ``CIK=`` query parameter of the
            EDGAR ``browse-edgar`` URL.

    Returns:
        The page source after a fixed 3-second wait, or ``None`` if any
        exception was raised (the error is printed, not re-raised).
    """
    browser = None
    page_html = None
    try:
        firefox_opts = FirefoxOptions()
        firefox_opts.add_argument("--headless")
        firefox_opts.binary_location = "/Applications/Firefox.app/Contents/MacOS/firefox"

        browser = webdriver.Firefox(
            service=FirefoxService(executable_path="/opt/homebrew/bin/geckodriver"),
            options=firefox_opts,
        )
        browser.get(
            f"https://www.sec.gov/cgi-bin/browse-edgar?CIK={cik}&owner=exclude&action=getcompany"
        )
        time.sleep(3)  # fixed pause before reading the page source
        page_html = browser.page_source
    except Exception as e:
        print(f"Error processing CIK {cik}: {e}")
        page_html = None
    finally:
        # Best-effort shutdown: a failure to quit must not mask the result.
        if browser:
            try:
                browser.quit()
            except Exception:
                pass
    return page_html

def get_sic_code_from_html(html):
    """Extract the SIC code text from an EDGAR company page.

    Args:
        html: Page source as a string. ``None`` or an empty string is
            accepted and treated as "nothing to parse".

    Returns:
        The stripped text of the first ``<span id="SIC">`` element, or
        ``None`` when the input is empty or no such element exists.
    """
    # Guard first so a failed fetch (None / "") never reaches the parser.
    if not html:
        return None

    sic_span = BeautifulSoup(html, 'html.parser').find("span", id="SIC")
    if sic_span is None:
        return None
    return sic_span.get_text(strip=True)

def process_company(cik):
    """Fetch and parse the SIC code for one company, then pause briefly.

    Args:
        cik: Company identifier passed to ``get_edgar_data_selenium``.

    Returns:
        A ``(cik, sic_code)`` tuple; ``sic_code`` is ``None`` when the page
        could not be fetched or no SIC code was found in it.
    """
    page = get_edgar_data_selenium(cik)
    if page:
        code = get_sic_code_from_html(page)
    else:
        code = None

    # Random 1-3 s delay between companies to avoid hitting SEC too fast.
    pause_seconds = random.uniform(1, 3)
    time.sleep(pause_seconds)
    return cik, code

def process_single_batch(companyData, batch_num, batch_size=1000,
                         output_dir="/Users/mayankbambal/Desktop/SIC Scraper/data/stagging"):
    """Scrape SIC codes for one positional slice of ``companyData`` and save a snapshot.

    Rows ``batch_num * batch_size`` up to (not including)
    ``(batch_num + 1) * batch_size`` are selected by position. For each row,
    ``process_company`` is called with the row's ``cik_str`` and the result is
    written into ``companyData['SIC_code']`` (0 when no code was obtained).
    After the batch, the whole ``companyData`` frame (not just the batch) is
    written to ``<output_dir>/Data_with_SIC_codes<batch_num>.csv``.

    Args:
        companyData: DataFrame with a ``cik_str`` column; mutated in place.
        batch_num: Zero-based batch number.
        batch_size: Number of rows per batch.
        output_dir: Directory that receives the snapshot CSV.

    Returns:
        The same ``companyData`` object that was passed in.
    """
    start = batch_num * batch_size
    end = start + batch_size
    batch_df = companyData.iloc[start:end].copy()
    print(f"\nProcessing batch {batch_num} (index {start} to {end})...\n")

    total = len(batch_df)
    correct = 0

    # `done` counts rows by position; the index label `idx` is only used to
    # write back, because labels need not be 0..n-1 (e.g. a filtered frame).
    for done, (idx, row) in enumerate(batch_df.iterrows(), start=1):
        cik, sic_code = process_company(row['cik_str'])

        # If parsing failed, store 0
        if not sic_code:
            sic_code = 0
        else:
            correct += 1

        companyData.at[idx, 'SIC_code'] = sic_code

        # Show progress
        percent_complete = (done / total) * 100
        percent_correct = (correct / done) * 100
        print(f"Progress: {percent_complete:.2f}% | Correct: {percent_correct:.2f}%", end="\r")

    # Join with "/": a backslash is not a path separator on POSIX and would
    # end up inside the file name instead of selecting the directory.
    save_path = f"{output_dir}/Data_with_SIC_codes{batch_num}.csv"
    companyData.to_csv(save_path, index=False)
    print(f"\n\nBatch {batch_num} completed. Saved to:\n{save_path}\n")
    return companyData


In [3]:
# Rows still needing a SIC lookup. get_missing_sic_rows is imported from
# getcikstoparse and is not shown here — presumably it returns the rows of the
# CSV that lack a SIC code; verify against that module.
find_sic = get_missing_sic_rows('/Users/mayankbambal/Desktop/SIC Scraper/data/final/company_tickers.csv')
find_sic.shape

(2093, 4)

In [4]:
# Run as many batches as needed to cover every row of find_sic. A fixed list of
# four 500-row batches only reaches the first 2000 rows and silently skips
# anything beyond that.
BATCH_SIZE = 500
n_batches = -(-len(find_sic) // BATCH_SIZE)  # ceiling division
for batch_number in range(n_batches):
    companyData = process_single_batch(find_sic, batch_number, batch_size=BATCH_SIZE)


Processing batch 0 (index 0 to 500)...

Progress: 6.20% | Correct: 0.00%

KeyboardInterrupt: 

In [27]:
# Reload the company ticker table from disk, replacing any earlier in-memory df.
df = pd.read_csv('/Users/mayankbambal/Desktop/SIC Scraper/data/final/company_tickers.csv')

In [28]:
# Normalise SIC_code in one pass: missing -> 0, then integer, then string.
df['SIC_code'] = (
    df['SIC_code']
    .fillna(0)
    .astype(int)
    .astype(str)
)


In [29]:
# Rows whose SIC_code is the string '0'.
nosci = df.loc[df['SIC_code'].eq('0')]

In [30]:
# (rows, columns) of the subset whose SIC_code is '0'.
nosci.shape

(3157, 4)

In [None]:
# Write df to CIK_SIC_mapping.csv without the index column.
# NOTE(review): this cell shows no execution count (In [None]), so it
# apparently was not run in the saved state of the notebook.
df.to_csv('/Users/mayankbambal/Desktop/SIC Scraper/data/final/CIK_SIC_mapping.csv', index=False)

In [23]:
import pandas as pd

def update_sic_from_file(file1, file2):
    """
    Updates SIC_code in file1 using usable SIC_code values from file2,
    matching on cik_str.

    A value from file2 is applied only when it is neither 0 nor missing (NaN),
    so a blank entry in file2 can never overwrite an existing code in file1.
    If file2 lists the same cik_str more than once, the last usable row wins.
    Rows of file1 without a match keep their current SIC_code (0 if file1 has
    no SIC_code column). file1 is updated in-place; its cik_str column is
    written back as zero-padded 10-character strings.

    Args:
        file1 (str): Path to the main CSV file to update (will be modified).
        file2 (str): Path to the CSV file containing updated SIC_codes.
    """
    # Read both files
    df_main = pd.read_csv(file1)
    df_sic = pd.read_csv(file2)

    # Normalise cik_str on both sides so the keys compare equal
    df_main['cik_str'] = df_main['cik_str'].astype(str).str.zfill(10)
    df_sic['cik_str'] = df_sic['cik_str'].astype(str).str.zfill(10)

    # Keep only usable rows from file2. NaN != 0 is True, so missing values
    # must be excluded explicitly or they would be treated as updates.
    usable = df_sic['SIC_code'].notna() & (df_sic['SIC_code'] != 0)
    df_sic_nonzero = df_sic.loc[usable, ['cik_str', 'SIC_code']]

    # Plain dict lookup: one scalar per cik_str even when file2 has duplicates
    # (later rows overwrite earlier ones).
    sic_by_cik = dict(zip(df_sic_nonzero['cik_str'], df_sic_nonzero['SIC_code']))

    if 'SIC_code' in df_main.columns:
        current_sic = df_main['SIC_code']
    else:
        current_sic = [0] * len(df_main)

    # Take the file2 value when there is one, otherwise keep the current value
    df_main['SIC_code'] = [
        sic_by_cik.get(cik, old)
        for cik, old in zip(df_main['cik_str'], current_sic)
    ]

    # Save back to file1
    df_main.to_csv(file1, index=False)
    print(f"Updated SIC_code values saved to: {file1}")

In [26]:
# Merge SIC codes from master.csv into company_tickers.csv; the latter is
# overwritten in place by update_sic_from_file.
update_sic_from_file('/Users/mayankbambal/Desktop/SIC Scraper/data/final/company_tickers.csv', '/Users/mayankbambal/Desktop/SIC Scraper/data/final/master.csv')

Updated SIC_code values saved to: /Users/mayankbambal/Desktop/SIC Scraper/data/final/company_tickers.csv
