In [8]:
import getpass
import os

# Never hardcode the key in a cell: anything typed here is saved with the
# notebook (and its outputs) and ends up in version control.
# NOTE: a key that has already been committed should be treated as
# compromised and rotated with the provider.
#
# Reuse a key that is already present in the environment; otherwise prompt
# for it without echoing the input.
if not os.environ.get("FMP_API_KEY"):
    entered_key = getpass.getpass("FMP API key: ").strip()
    if not entered_key:
        raise EnvironmentError("No FMP API key was provided.")
    os.environ["FMP_API_KEY"] = entered_key

# Confirm presence only -- do not print the secret into the cell output.
print("FMP_API_KEY is set (value hidden).")


[output redacted: API key removed]


In [9]:
import os
import time
import requests
import pandas as pd

# ------------------------------------------------------------------ #
# 0.  Configuration
# ------------------------------------------------------------------ #
API_KEY = os.getenv("FMP_API_KEY")
if not API_KEY:
    raise EnvironmentError(
        "Environment variable FMP_API_KEY is not set. "
        "Run:  export FMP_API_KEY='YOUR_KEY_HERE'"
    )

WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
PROFILE_URL = "https://financialmodelingprep.com/api/v3/profile/{symbol}?apikey={key}"
# NOTE(review): absolute, machine-specific path. Later cells read the CSV copy
# through a relative file name, so this presumably equals the notebook's
# working directory -- confirm before running elsewhere.
OUTPUT_FILE = "/workspaces/Stock-Market-Prediction/FunSig/sp500_company_profiles.xlsx"

# Pause between requests. The statements-download cell of this notebook cites
# a 750 requests/minute limit and uses 0.10 s; use the same pause here
# (the previous 0.01 s allowed up to 100 req/s on fast responses).
DELAY_SEC = 0.10


def _mask_key(text) -> str:
    """Return ``text`` as a string with the API key replaced by ``***``."""
    return str(text).replace(API_KEY, "***")


# ------------------------------------------------------------------ #
# 1.  Load the ticker list
# ------------------------------------------------------------------ #
# pandas.read_html returns every table on the page; the first one is the constituents table
sp500_table = pd.read_html(WIKI_URL, flavor="lxml")[0]
tickers = sp500_table["Symbol"].tolist()

# ------------------------------------------------------------------ #
# 2.  Fetch one profile per ticker
# ------------------------------------------------------------------ #
records = []
skipped = []  # symbols for which no profile row could be stored

# One session for the whole loop so the connection is reused across requests.
with requests.Session() as session:
    for symbol in tickers:
        # URL-encode the dot for class-B style symbols (BRK.B, BF.B).
        # NOTE(review): if these symbols show up in `skipped`, the API may
        # expect a different spelling for them -- confirm against the FMP docs.
        symbol_encoded = symbol.replace(".", "%2E")
        url = PROFILE_URL.format(symbol=symbol_encoded, key=API_KEY)

        try:
            resp = session.get(url, timeout=10)
            resp.raise_for_status()
            data = resp.json()

            # A usable payload is a non-empty list whose first item is a dict.
            # Anything else (empty list, error object, ...) is recorded as
            # skipped instead of failing on data[0] or vanishing silently.
            if isinstance(data, list) and data and isinstance(data[0], dict):
                d = data[0]
                records.append(
                    {
                        "ticker": d.get("symbol"),
                        "description": d.get("description"),
                        "industry": d.get("industry"),
                        "sector": d.get("sector"),
                        "beta": d.get("beta"),
                        "market_cap": d.get("mktCap"),
                    }
                )
            else:
                skipped.append(symbol)
                if data:
                    print(f"{symbol}: unexpected response {_mask_key(data)[:200]}")
        except Exception as exc:
            # Deliberate best-effort: report and keep going. The exception text
            # produced by requests contains the request URL (and therefore the
            # key), so it is masked before printing.
            skipped.append(symbol)
            print(f"{symbol}: {_mask_key(exc)}")
        finally:
            time.sleep(DELAY_SEC)

# ------------------------------------------------------------------ #
# 3.  Save to Excel
# ------------------------------------------------------------------ #
df = pd.DataFrame(records)
df.to_excel(OUTPUT_FILE, index=False)
print(f"Saved {len(df):,} rows to {OUTPUT_FILE}")
if skipped:
    print(f"No profile stored for {len(skipped)} symbol(s): {', '.join(map(str, skipped))}")


Saved 501 rows to /workspaces/Stock-Market-Prediction/FunSig/sp500_company_profiles.xlsx


In [None]:
# Data Wrangler could not be installed in this environment, so also write a
# CSV copy next to the workbook; it is easy to open and inspect directly.
csv_path = OUTPUT_FILE.replace(".xlsx", ".csv")
df.to_csv(csv_path, index=False)
print(f"Saved {len(df):,} rows to {csv_path}")

Saved 501 rows to /workspaces/Stock-Market-Prediction/FunSig/sp500_company_profiles.csv


In [None]:
import pandas as pd

CSV_FILE = "sp500_company_profiles.csv"

df = pd.read_csv(CSV_FILE)


def _distinct_sorted(column: str) -> list:
    """Return the sorted distinct non-null values of ``df[column]``."""
    return sorted(df[column].dropna().unique())


# Blank / NaN entries are dropped before de-duplicating.
unique_sectors = _distinct_sorted("sector")
unique_industries = _distinct_sorted("industry")

# Print both listings with the same layout. The second heading carries a
# leading newline and space so the two lists are separated in the output.
for heading, values in (
    ("Unique Sectors", unique_sectors),
    ("\n Unique Industries", unique_industries),
):
    print(f"{heading} ({len(values)}):")
    for value in values:
        print("  •", value)


Unique Sectors (11):
  • Basic Materials
  • Communication Services
  • Consumer Cyclical
  • Consumer Defensive
  • Energy
  • Financial Services
  • Healthcare
  • Industrials
  • Real Estate
  • Technology
  • Utilities

 Unique Industries (117):
  • Advertising Agencies
  • Aerospace & Defense
  • Agricultural - Machinery
  • Agricultural Farm Products
  • Agricultural Inputs
  • Airlines, Airports & Air Services
  • Apparel - Footwear & Accessories
  • Apparel - Manufacturers
  • Apparel - Retail
  • Asset Management
  • Asset Management - Global
  • Auto - Dealerships
  • Auto - Manufacturers
  • Auto - Parts
  • Banks - Diversified
  • Banks - Regional
  • Beverages - Alcoholic
  • Beverages - Non-Alcoholic
  • Beverages - Wineries & Distilleries
  • Biotechnology
  • Business Equipment & Supplies
  • Chemicals
  • Chemicals - Specialty
  • Communication Equipment
  • Computer Hardware
  • Conglomerates
  • Construction
  • Construction Materials
  • Consulting Services
  • Cons

In [15]:
pip install pyarrow fastparquet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
import os
import time
import requests
import pandas as pd
from pathlib import Path

# -------------------------------------------------------------------
# 0.  Configuration
# -------------------------------------------------------------------
# Fail fast when the key is missing (or empty) in the environment.
API_KEY = os.environ.get("FMP_API_KEY")
if not API_KEY:
    raise EnvironmentError("FMP_API_KEY environment variable not found.")

# CSV written earlier in this notebook; its "ticker" column drives the download.
TICKER_CSV = "sp500_company_profiles.csv"

# All three statement endpoints share the same base URL and the same
# quarterly query string; only the path segment differs.
_FMP_BASE = "https://financialmodelingprep.com/api/v3"
_QUARTERLY_QUERY = "{ticker}?period=quarter&limit=400&apikey={key}"

# Output folder name -> URL template with {ticker} and {key} placeholders.
ENDPOINTS = {
    folder: f"{_FMP_BASE}/{path_segment}/{_QUARTERLY_QUERY}"
    for folder, path_segment in (
        ("income_statements", "income-statement"),
        ("balance_sheet_statements", "balance-sheet-statement"),
        ("cash_flow_statements", "cash-flow-statement"),
    )
}

# Stay under the 750-requests-per-minute limit (about 12.5 req/s):
# a 0.1 s pause caps the loop at 600 requests per minute.
DELAY_SEC = 0.1

# -------------------------------------------------------------------
# 1.  Create output folders
# -------------------------------------------------------------------
# One output directory per statement type, created relative to the current
# working directory; directories that already exist are left untouched.
for folder in ENDPOINTS.keys():
    target_dir = Path(folder)
    target_dir.mkdir(exist_ok=True)

# -------------------------------------------------------------------
# 2.  Load tickers
# -------------------------------------------------------------------
# Distinct, non-null values of the "ticker" column, in file order.
ticker_column = pd.read_csv(TICKER_CSV)["ticker"]
tickers = ticker_column.dropna().unique()

# -------------------------------------------------------------------
# 3.  Helper – fetch JSON safely
# -------------------------------------------------------------------
def fetch_statement(url: str) -> list[dict]:
    """Fetch ``url`` and return its JSON payload when it is a list.

    Parameters
    ----------
    url : str
        Fully formatted request URL (may contain an ``apikey=`` query value).

    Returns
    -------
    list[dict]
        The decoded JSON payload if it is a list; otherwise an empty list.
        An empty list is also returned when the request, the HTTP status
        check, or the JSON decoding fails.

    Notes
    -----
    Never raises for request/decoding problems; they are printed instead.
    Every printed message has the ``apikey`` value masked, because both the
    URL and the exception text can contain it.
    """
    import re  # local import: this cell's import list does not include `re`

    def _redact(text) -> str:
        # Mask whatever follows "apikey=" up to the next "&" or whitespace.
        return re.sub(r"(apikey=)[^&\s]+", r"\1***", str(text), flags=re.IGNORECASE)

    try:
        r = requests.get(url, timeout=15)
        r.raise_for_status()
        data = r.json()
    except Exception as err:
        print(f"Request failed: {_redact(url)}  ({_redact(err)})")
        return []

    if isinstance(data, list):
        return data

    # Non-list payloads (e.g. an error object) used to be dropped silently;
    # report a short, redacted excerpt so the cause is visible.
    print(f"Unexpected payload from {_redact(url)}: {_redact(data)[:200]}")
    return []

# -------------------------------------------------------------------
# 4.  Main loop
# -------------------------------------------------------------------
for ticker in tickers:
    # Encode "BRK.B", "BF.B", etc. for the URL
    encoded = ticker.replace(".", "%2E")

    for folder, template in ENDPOINTS.items():
        url = template.format(ticker=encoded, key=API_KEY)

        try:
            # Statement-specific names: the notebook-level `records` / `df`
            # from the profile cells are not overwritten by this loop.
            statement_rows = fetch_statement(url)

            # Skip if FMP returned nothing (some tickers have partial coverage)
            if not statement_rows:
                continue

            # Convert to DataFrame, sort oldest->newest when a "date" column
            # is present, and write Parquet (file named after the raw ticker).
            statement_df = pd.DataFrame(statement_rows)
            if "date" in statement_df.columns:
                statement_df = statement_df.sort_values("date")
            outfile = Path(folder, f"{ticker}.parquet")
            statement_df.to_parquet(outfile, index=False)
        finally:
            # Pause after EVERY request, including empty or failed ones.
            # Previously `continue` bypassed the sleep, so a run of failures
            # hit the API with no delay at all.
            time.sleep(DELAY_SEC)

print("Done. Parquet files are in:")
for folder in ENDPOINTS:
    print(" ", folder)


Done. Parquet files are in:
  income_statements
  balance_sheet_statements
  cash_flow_statements
