In [1]:
import requests
import pandas as pd
import json
import time
import random

def _sic_key(value):
    """Return a canonical string for a SIC code so 1311, 1311.0 and ' 1311 ' compare equal."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return str(value).strip()
    # NaN / inf / non-integral values are not integer codes; fall back to the raw text.
    if number.is_integer():
        return str(int(number))
    return str(value).strip()


def _flatten_company_facts(data_dict):
    """Flatten one companyfacts JSON payload into a list of row dicts (one per reported record)."""
    cik_val = data_dict.get('cik')
    entity_name = data_dict.get('entityName')

    rows = []
    for taxonomy, fact_group in data_dict.get('facts', {}).items():
        for fact_name, fact_data in fact_group.items():
            label = fact_data.get('label')
            description = fact_data.get('description')
            for unit, records in fact_data.get('units', {}).items():
                for record in records:
                    row = {
                        'cik': cik_val,
                        'entityName': entity_name,
                        'taxonomy': taxonomy,
                        'fact_name': fact_name,
                        'label': label,
                        'description': description,
                        'unit': unit
                    }
                    # Record-level keys (e.g. start/end/fy) are merged in last,
                    # so they win on any name clash with the metadata above.
                    row.update(record)
                    rows.append(row)
    return rows


def fetch_sec_facts_by_sic(company_data, sic_code, headers, user_start, user_end, limit=None, timeout=30):
    """
    Fetch SEC XBRL company facts for companies with a specific SIC code.

    Each selected CIK is requested from the data.sec.gov ``companyfacts``
    endpoint, flattened to one row per reported record, and the combined
    table is then reduced to rows that look like full-year values:
    ``user_start <= fy <= user_end``, ``end - start`` longer than 350 days,
    and the calendar year of ``end`` equal to ``fy``. Records without a
    ``start`` date are therefore dropped.

    Args:
        company_data (pd.DataFrame): DataFrame with company info including
            'SIC_code' and 'cik_str' columns.
        sic_code (int | str): The SIC code to filter companies by. Ints and
            numeric strings are treated as the same code (1311 == '1311').
        headers (dict): Headers to use for the SEC API request (must include User-Agent).
        user_start (int): Starting fiscal year to filter data (inclusive).
        user_end (int): Ending fiscal year to filter data (inclusive).
        limit (int, optional): Limit the number of CIKs to fetch. Default is None (fetch all).
        timeout (float, optional): Per-request timeout in seconds. Default is 30.

    Returns:
        pd.DataFrame: Filtered DataFrame containing SEC facts. Empty when no
        company matched, nothing could be fetched, or no record passed the filter.

    Raises:
        KeyError: If ``company_data`` lacks the 'SIC_code' or 'cik_str' column.
    """
    missing = [col for col in ('SIC_code', 'cik_str') if col not in company_data.columns]
    if missing:
        raise KeyError(f"company_data is missing required column(s): {', '.join(missing)}")

    # Compare canonical forms so an int argument still matches a string column (and vice versa).
    wanted = _sic_key(sic_code)
    matches = company_data['SIC_code'].map(_sic_key) == wanted
    filtered_cik_list = company_data.loc[matches, 'cik_str'].tolist()
    ciks = filtered_cik_list if limit is None else filtered_cik_list[:limit]

    all_data = []

    for position, raw_cik in enumerate(ciks):
        if position:
            time.sleep(random.uniform(1, 2))  # Polite delay between consecutive requests

        # The URL template takes the CIK zero-padded to 10 digits; pad here so
        # integer or unpadded CIKs also work.
        cik = str(raw_cik).strip().zfill(10)
        try:
            response = requests.get(
                f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json',
                headers=headers,
                timeout=timeout
            )
            response.raise_for_status()
            # Flatten first, extend afterwards: a payload that fails midway
            # contributes no partial rows.
            all_data.extend(_flatten_company_facts(response.json()))
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for CIK {cik}: {e}")
        except (KeyError, ValueError, AttributeError, TypeError) as e:
            # ValueError covers JSON decoding errors; the others cover payloads
            # whose structure is not the expected nesting of dicts/lists.
            print(f"Error processing data for CIK {cik}: {e}")

    df = pd.DataFrame(all_data)
    if df.empty:
        return df

    # Convert 'start' and 'end' columns to datetime
    for col in ['start', 'end']:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    # Without all three columns no row can satisfy the filter below.
    if not {'fy', 'start', 'end'}.issubset(df.columns):
        return df.iloc[0:0]

    min_period_days = 350  # periods longer than this are treated as full-year
    fiscal_year = pd.to_numeric(df['fy'], errors='coerce')
    period_days = (df['end'] - df['start']).dt.days

    keep = (
        (fiscal_year >= user_start) &
        (fiscal_year <= user_end) &
        (period_days > min_period_days) &
        (df['end'].dt.year == fiscal_year)
    )
    return df[keep]

In [2]:
import requests
import pandas as pd

def get_company_data_with_sic(sic_csv_path, headers=None, timeout=30):
    """
    Fetch SEC company ticker data and merge it with SIC codes from a local CSV.

    The ticker list is downloaded from sec.gov, its 'cik_str' is rendered as a
    10-character zero-padded string, and the local mapping is left-joined onto
    it. In the mapping, a missing 'SIC_code' becomes the string '0' and numeric
    codes are rendered as integer strings (3571.0 -> '3571'). Tickers whose CIK
    is absent from the mapping keep a missing 'SIC_code' after the left join.

    Args:
        sic_csv_path (str): Path to the CSV file that contains 'cik_str' and 'SIC_code'.
        headers (dict, optional): Headers for SEC API requests. Defaults to a
            placeholder User-Agent; pass your own contact address.
        timeout (float, optional): Request timeout in seconds. Default is 30.

    Returns:
        pd.DataFrame: Merged DataFrame with SEC ticker data and SIC codes.
        An empty DataFrame (no columns) is returned, after printing the error,
        if the download, the CSV read, or the processing fails.
    """
    if headers is None:
        headers = {'User-Agent': 'your.email@example.com'}

    try:
        # Fetch company tickers from SEC; the timeout keeps a stalled
        # connection from blocking the notebook indefinitely.
        response = requests.get(
            "https://www.sec.gov/files/company_tickers.json",
            headers=headers,
            timeout=timeout
        )
        response.raise_for_status()

        # Load into DataFrame
        company_data = pd.DataFrame.from_dict(response.json(), orient='index')
        company_data['cik_str'] = company_data['cik_str'].astype(str).str.zfill(10)

        # Load and clean your local SIC mapping
        sic_mapping = pd.read_csv(sic_csv_path)
        sic_mapping['cik_str'] = sic_mapping['cik_str'].astype(str).str.zfill(10)
        # float -> int -> str strips a trailing '.0' introduced when the CSV
        # column contains blanks and is therefore parsed as float.
        sic_mapping['SIC_code'] = (
            sic_mapping['SIC_code']
            .fillna(0)
            .astype(float)
            .astype(int)
            .astype(str)
        )

        # Merge on cik_str
        merged_data = pd.merge(company_data, sic_mapping, on='cik_str', how='left')
        return merged_data

    except (requests.RequestException, OSError, ValueError, KeyError) as e:
        # OSError covers a missing or unreadable CSV, so file problems are
        # reported the same way as network and parsing problems.
        print(f"Error retrieving or processing company data: {e}")
        return pd.DataFrame()


In [4]:
# Driver cell: load the company/SIC table, ask for a SIC code and a fiscal-year
# range, download matching company facts, and write them to a CSV.
headers = {'User-Agent': "mbambal@purdue.edu"}

# NOTE(review): absolute, machine-specific path; this cell only runs where this file exists.
sic_mapping_file = '/Users/mayankbambal/Desktop/10K_API/data/mapping/company_tickers.csv'

company_data = get_company_data_with_sic(sic_mapping_file, headers)

# get_company_data_with_sic signals failure by printing the error and returning
# an empty frame; stop here with a clear message instead of failing later with
# a KeyError on the missing 'SIC_code' column.
if company_data.empty:
    raise RuntimeError("Company/SIC data could not be loaded; see the error message above.")

# Get user input

# Strip stray whitespace so ' 1311 ' still matches the string-typed 'SIC_code' column.
sic_code = input("Enter SIC code: ").strip()
user_start = int(input("Enter start period (YYYY): "))
user_end = int(input("Enter end period (YYYY): "))

# A reversed range would silently produce an empty result and an empty CSV.
if user_start > user_end:
    raise ValueError(f"Start period {user_start} is after end period {user_end}.")

#Download data

# limit=5 caps the download at the first five matching companies.
data_10k = fetch_sec_facts_by_sic(company_data, sic_code, headers, user_start, user_end, limit=5)


# Save data to csv
data_10k.to_csv(f'{sic_code}_{user_start}_{user_end}data_10k.csv', index=False)