In [8]:
import os

os.environ["FMP_API_KEY"] = "XokeN0fDHyRtoZaB9HbVXCNd3uv5Piw4"  

print(os.environ["FMP_API_KEY"])


XokeN0fDHyRtoZaB9HbVXCNd3uv5Piw4


In [9]:
import os
import time
import requests
import pandas as pd

API_KEY = os.getenv("FMP_API_KEY")
if not API_KEY:
    raise EnvironmentError(
        "Environment variable FMP_API_KEY is not set. "
        "Run:  export FMP_API_KEY='YOUR_KEY_HERE'"
    )

WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
PROFILE_URL = "https://financialmodelingprep.com/api/v3/profile/{symbol}?apikey={key}"
OUTPUT_FILE = "/workspaces/Stock-Market-Prediction/FunSig/sp500_company_profiles.xlsx"

DELAY_SEC = 0.01

# pandas.read_html returns every table on the page; the first one is the constituents table
sp500_table = pd.read_html(WIKI_URL, flavor="lxml")[0]  
tickers = sp500_table["Symbol"].tolist()

records = []

for symbol in tickers:
    # URL-encode the dot just in case for B class (BRK.B) instances
    symbol_encoded = symbol.replace(".", "%2E")
    url = PROFILE_URL.format(symbol=symbol_encoded, key=API_KEY)

    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        if data:                         
            d = data[0]
            records.append(
                {
                    "ticker": d.get("symbol"),
                    "description": d.get("description"),
                    "industry": d.get("industry"),
                    "sector": d.get("sector"),
                    "beta": d.get("beta"),
                    "market_cap": d.get("mktCap"),
                }
            )
    except Exception as exc:
        # print to console but keep going
        print(f"{symbol}: {exc}")
    finally:
        time.sleep(DELAY_SEC)

# ------------------------------------------------------------------ #
# 3.  Save to Excel
# ------------------------------------------------------------------ #
df = pd.DataFrame(records)
df.to_excel(OUTPUT_FILE, index=False)
print(f"Saved {len(df):,} rows to {OUTPUT_FILE}")


Saved 501 rows to /workspaces/Stock-Market-Prediction/FunSig/sp500_company_profiles.xlsx


In [None]:
# Can't install data wrangler for some reason so converting to csv so I can view easily
df.to_csv(OUTPUT_FILE.replace(".xlsx", ".csv"), index=False)
print(f"Saved {len(df):,} rows to {OUTPUT_FILE.replace('.xlsx', '.csv')}")

Saved 501 rows to /workspaces/Stock-Market-Prediction/FunSig/sp500_company_profiles.csv


In [None]:
import pandas as pd

CSV_FILE = "sp500_company_profiles.csv"   

df = pd.read_csv(CSV_FILE)

# ------------------------------------------------------------------
# Get the unique values (dropna() discards blanks or NaNs)
# ------------------------------------------------------------------
unique_sectors    = sorted(df["sector"].dropna().unique())
unique_industries = sorted(df["industry"].dropna().unique())

# ------------------------------------------------------------------
# Results
# ------------------------------------------------------------------
print("Unique Sectors (" + str(len(unique_sectors)) + "):")
for s in unique_sectors:
    print("  •", s)

print("\n Unique Industries (" + str(len(unique_industries)) + "):")
for ind in unique_industries:
    print("  •", ind)


Unique Sectors (11):
  • Basic Materials
  • Communication Services
  • Consumer Cyclical
  • Consumer Defensive
  • Energy
  • Financial Services
  • Healthcare
  • Industrials
  • Real Estate
  • Technology
  • Utilities

 Unique Industries (117):
  • Advertising Agencies
  • Aerospace & Defense
  • Agricultural - Machinery
  • Agricultural Farm Products
  • Agricultural Inputs
  • Airlines, Airports & Air Services
  • Apparel - Footwear & Accessories
  • Apparel - Manufacturers
  • Apparel - Retail
  • Asset Management
  • Asset Management - Global
  • Auto - Dealerships
  • Auto - Manufacturers
  • Auto - Parts
  • Banks - Diversified
  • Banks - Regional
  • Beverages - Alcoholic
  • Beverages - Non-Alcoholic
  • Beverages - Wineries & Distilleries
  • Biotechnology
  • Business Equipment & Supplies
  • Chemicals
  • Chemicals - Specialty
  • Communication Equipment
  • Computer Hardware
  • Conglomerates
  • Construction
  • Construction Materials
  • Consulting Services
  • Cons

In [15]:
pip install pyarrow fastparquet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
import os
import time
import requests
import pandas as pd
from pathlib import Path

# -------------------------------------------------------------------
# 0.  Configuration
# -------------------------------------------------------------------
API_KEY = os.getenv("FMP_API_KEY")
if not API_KEY:
    raise EnvironmentError("FMP_API_KEY environment variable not found.")

TICKER_CSV = "sp500_company_profiles.csv"

# Base URLs for the three statement endpoints, quarterly frequency
ENDPOINTS = {
    "income_statements": (
        "https://financialmodelingprep.com/api/v3/income-statement/"
        "{ticker}?period=quarter&limit=400&apikey={key}"
    ),
    "balance_sheet_statements": (
        "https://financialmodelingprep.com/api/v3/balance-sheet-statement/"
        "{ticker}?period=quarter&limit=400&apikey={key}"
    ),
    "cash_flow_statements": (
        "https://financialmodelingprep.com/api/v3/cash-flow-statement/"
        "{ticker}?period=quarter&limit=400&apikey={key}"
    ),
}

# Respect the 750-requests-per-minute limit (≈12.5 req/s); 0.1 s keeps us at 600 req/min
DELAY_SEC = 0.10

# -------------------------------------------------------------------
# 1.  Create output folders
# -------------------------------------------------------------------
for folder in ENDPOINTS:
    Path(folder).mkdir(exist_ok=True)

# -------------------------------------------------------------------
# 2.  Load tickers
# -------------------------------------------------------------------
tickers = pd.read_csv(TICKER_CSV)["ticker"].dropna().unique()

# -------------------------------------------------------------------
# 3.  Helper – fetch JSON safely
# -------------------------------------------------------------------
def fetch_statement(url: str) -> list[dict]:
    """Return JSON payload (list of dicts) or an empty list on error."""
    try:
        r = requests.get(url, timeout=15)
        r.raise_for_status()
        data = r.json()
        if isinstance(data, list):
            return data
    except Exception as err:
        print(f"Request failed: {url}  ({err})")
    return []

# -------------------------------------------------------------------
# 4.  Main loop
# -------------------------------------------------------------------
for ticker in tickers:
    # Encode “BRK.B”, “BF.B”, etc. for the URL
    encoded = ticker.replace(".", "%2E")

    for folder, template in ENDPOINTS.items():
        url = template.format(ticker=encoded, key=API_KEY)
        records = fetch_statement(url)

        # Skip if FMP returned nothing (some tickers have partial coverage)
        if not records:
            continue

        # Convert to DataFrame, sort oldest→newest, and write Parquet
        df = pd.DataFrame(records).sort_values("date")
        outfile = Path(folder, f"{ticker}.parquet")
        df.to_parquet(outfile, index=False)

        time.sleep(DELAY_SEC)  # stay within rate limit

print("Done. Parquet files are in:")
for folder in ENDPOINTS:
    print(" ", folder)


Done. Parquet files are in:
  income_statements
  balance_sheet_statements
  cash_flow_statements


In [None]:
"""
Convert the three statement Parquets for every ticker into a single Excel
workbook (three sheets) and save it in a folder named after the ticker’s sector.

Folder layout:
    excel_by_sector/
        Basic Materials/
        Communication Services/
        Consumer Cyclical/
        Consumer Defensive/
        Energy/
        Financial Services/
        Healthcare/
        Industrials/
        Real Estate/
        Technology/
        Utilities/
"""
#!pip install xlsxwriter
import pandas as pd
from pathlib import Path

# -------------------------------------------------------------------
# 0.  Locations
# -------------------------------------------------------------------
ROOT       = Path("excel_by_sector")          # top-level output directory
PROFILES   = Path("/workspaces/Stock-Market-Prediction/FunSig/sp500_company_profiles.csv")

INCOME_DIR   = Path("income_statements")
BALANCE_DIR  = Path("balance_sheet_statements")
CASHFLOW_DIR = Path("cash_flow_statements")

# -------------------------------------------------------------------
# 1.  Sector lookup table
# -------------------------------------------------------------------
profiles_df = pd.read_csv(PROFILES, usecols=["ticker", "sector"])
sector_map  = dict(zip(profiles_df["ticker"], profiles_df["sector"]))

# Create the eleven sector folders up front
for sector in profiles_df["sector"].dropna().unique():
    (ROOT / sector).mkdir(parents=True, exist_ok=True)

# -------------------------------------------------------------------
# 2.  Determine which tickers have all three Parquet files
# -------------------------------------------------------------------
income_files   = {p.stem for p in INCOME_DIR.glob("*.parquet")}
balance_files  = {p.stem for p in BALANCE_DIR.glob("*.parquet")}
cashflow_files = {p.stem for p in CASHFLOW_DIR.glob("*.parquet")}

tickers = sorted(income_files & balance_files & cashflow_files)

# -------------------------------------------------------------------
# 3.  One Excel workbook per ticker, saved in sector folder
# -------------------------------------------------------------------
for ticker in tickers:
    sector = sector_map.get(ticker, "Unknown")
    outdir = ROOT / sector
    outdir.mkdir(parents=True, exist_ok=True)

    income_df   = pd.read_parquet(INCOME_DIR   / f"{ticker}.parquet")
    balance_df  = pd.read_parquet(BALANCE_DIR  / f"{ticker}.parquet")
    cashflow_df = pd.read_parquet(CASHFLOW_DIR / f"{ticker}.parquet")

    outfile = outdir / f"{ticker}.xlsx"

    with pd.ExcelWriter(outfile, engine="xlsxwriter") as writer:
        income_df.to_excel(writer,   sheet_name="Income Statement", index=False)
        balance_df.to_excel(writer,  sheet_name="Balance Sheet",    index=False)
        cashflow_df.to_excel(writer, sheet_name="Cash Flow",        index=False)

print(f"Finished. Workbooks are in {ROOT.resolve()}")


Collecting xlsxwriter
  Downloading XlsxWriter-3.2.3-py3-none-any.whl.metadata (2.7 kB)
Downloading XlsxWriter-3.2.3-py3-none-any.whl (169 kB)
Installing collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Finished. Workbooks are in /workspaces/Stock-Market-Prediction/FunSig/excel_by_sector


Collect Prices for each company over time and also p/e on each day incorporating the price

In [23]:
"""
download_fmp_prices_and_pe.py
-----------------------------
For every ticker in excel_by_sector/<sector>/<ticker>.xlsx:
    • Download the full daily OHLCV+VWAP history from FMP
      (in 5-year slices to avoid the default limit).
    • Build an EPS timeline from the saved income-statement
      Parquet, shifting after-close releases to the next trading day.
    • Merge the two datasets and compute Close / EPS as daily P/E.
    • Save to <sector>/<ticker>_daily.csv alongside the workbook.
"""

from __future__ import annotations

import os
import time
import json
import requests
from datetime import datetime, date, timedelta

import pandas as pd
from pandas.tseries.offsets import BDay
from pathlib import Path

# ------------------------------------------------------------------
# 0.  Configuration
# ------------------------------------------------------------------
API_KEY = os.getenv("FMP_API_KEY")
if not API_KEY:
    raise EnvironmentError("FMP_API_KEY environment variable not set.")

ROOT              = Path("excel_by_sector")            # sector folders created earlier
INCOME_DIR        = Path("income_statements")          # ticker.parquet files
PRICE_ENDPOINT    = "https://financialmodelingprep.com/api/v3/historical-price-full/{sym}"
SEGMENT_DAYS      = 1826                               # ≤ 5 years per FMP docs
PAUSE_SEC         = 0.10                               # stay under 750 calls/min

eastern = pd.Timestamp.now(tz="US/Eastern").tz


# ------------------------------------------------------------------
# 1.  Collect all (ticker, sector_dir) pairs
# ------------------------------------------------------------------
pairs: list[tuple[str, Path]] = []
for sector_dir in ROOT.iterdir():
    if sector_dir.is_dir():
        for wb in sector_dir.glob("*.xlsx"):
            pairs.append((wb.stem, sector_dir))

# ------------------------------------------------------------------
# 2.  Utility – fetch one 5-year slice of prices
# ------------------------------------------------------------------
def fetch_slice(symbol: str, frm: date, to: date) -> list[dict]:
    url = (
        f"{PRICE_ENDPOINT.format(sym=symbol)}"
        f"?from={frm.isoformat()}&to={to.isoformat()}&apikey={API_KEY}"
    )
    try:
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        data = r.json()
        return data.get("historical", [])
    except Exception as err:
        print(f"{symbol}: price slice {frm}/{to} failed – {err}")
        return []


# ------------------------------------------------------------------
# 3.  Build full price history (loops back in 5-year blocks)
# ------------------------------------------------------------------
def full_history(symbol: str) -> pd.DataFrame:
    print(f"{symbol}: downloading daily prices from FMP")
    today = date.today()
    to_dt = today
    all_rows: list[dict] = []

    while True:
        frm_dt = to_dt - timedelta(days=SEGMENT_DAYS)
        rows = fetch_slice(symbol, frm_dt, to_dt)
        if not rows:
            break
        all_rows.extend(rows)
        oldest_dt = datetime.strptime(rows[-1]["date"], "%Y-%m-%d").date()
        if oldest_dt >= to_dt:          # should never happen, but guard
            break
        if oldest_dt <= frm_dt:
            # we’ve spanned the whole 5-year window; ask for the 5 yrs before that
            to_dt = oldest_dt - timedelta(days=1)
        else:
            # less than 5 yrs returned – we reached the beginning
            break
        time.sleep(PAUSE_SEC)

    if not all_rows:
        return pd.DataFrame()

    df = pd.DataFrame(all_rows)
    df.rename(
        columns={
            "open": "Open",
            "high": "High",
            "low": "Low",
            "close": "Close",
            "volume": "Volume",
            "vwap": "VWAP",
        },
        inplace=True,
    )
    df["Date"] = pd.to_datetime(df["date"])
    df = df[["Date", "Open", "High", "Low", "Close", "Volume", "VWAP"]]
    return df.sort_values("Date").reset_index(drop=True)
# ------------------------------------------------------------------
# 4.  Build EPS timeline from income-statement Parquet
# ------------------------------------------------------------------
def eps_timeline(ticker: str) -> pd.DataFrame:
    """
    Return a DataFrame with columns
        Date  (datetime64[ns], trading day when EPS became public)
        EPS   (float)
    or an empty DF if no income-statement file exists.
    """
    fpath = INCOME_DIR / f"{ticker}.parquet"
    if not fpath.exists():
        return pd.DataFrame(columns=["Date", "EPS"])

    stm = pd.read_parquet(fpath, columns=["acceptedDate", "eps"])
    if stm.empty:
        return pd.DataFrame(columns=["Date", "EPS"])

    # Parse SEC acceptedDate → timezone-aware US/Eastern timestamp
    ts = pd.to_datetime(stm["acceptedDate"], utc=True, errors="coerce")
    ts = ts.dt.tz_convert("US/Eastern")

    # If the filing hit after 16:00, use next trading day; else same day
    after_close = ts.dt.hour >= 16
    eff_ts = ts.where(~after_close, ts + BDay(1))            # shift next day if after close
    eff_ts = eff_ts.dt.normalize()                           # midnight local time
    eff_ts = eff_ts.dt.tz_localize(None)                     # drop tz → naive

    tl = (
        pd.DataFrame({"Date": eff_ts, "EPS": stm["eps"]})
        .dropna(subset=["Date", "EPS"])
        .sort_values("Date")
        .drop_duplicates(subset=["Date"], keep="last")
        .reset_index(drop=True)
    )
    return tl


# ------------------------------------------------------------------
# 5.  Main per-ticker routine
# ------------------------------------------------------------------
def process(ticker: str, folder: Path):
    out_csv = folder / f"{ticker}_daily.csv"
    if out_csv.exists():
        print(f"{ticker}: CSV already present – skipping")
        return

    price_df = full_history(ticker)
    if price_df.empty:
        print(f"{ticker}: no price data – skipped")
        return

    eps_df = eps_timeline(ticker)

    if eps_df.empty:
        price_df["EPS"] = pd.NA
    else:
        # merge_asof requires both Date columns to be datetime64 and sorted
        price_df = price_df.sort_values("Date")
        eps_df   = eps_df.sort_values("Date")

        price_df = pd.merge_asof(
            price_df,
            eps_df,
            on="Date",
            direction="backward"
        )

    price_df["PE"] = price_df["Close"] / price_df["EPS"]
    price_df.to_csv(out_csv, index=False)
    print(f"{ticker}: wrote {len(price_df)} rows")

    time.sleep(PAUSE_SEC)



# ------------------------------------------------------------------
# 6.  Run
# ------------------------------------------------------------------
for tk, fld in pairs:
    process(tk, fld)

print("All tickers processed – daily CSV files sit alongside the Excel workbooks.")


OXY: downloading daily prices from FMP
OXY: wrote 14442 rows
EQT: downloading daily prices from FMP
EQT: wrote 14538 rows
CTRA: downloading daily prices from FMP
CTRA: wrote 8884 rows
HAL: downloading daily prices from FMP
HAL: wrote 14441 rows
COP: downloading daily prices from FMP
COP: wrote 14441 rows
EOG: downloading daily prices from FMP
EOG: wrote 8972 rows
WMB: downloading daily prices from FMP
WMB: wrote 14441 rows
SLB: downloading daily prices from FMP
SLB: wrote 14442 rows
FSLR: downloading daily prices from FMP
FSLR: wrote 4653 rows
ENPH: downloading daily prices from FMP
ENPH: wrote 3303 rows
HES: downloading daily prices from FMP
HES: wrote 14094 rows
VLO: downloading daily prices from FMP
VLO: wrote 11439 rows
BKR: downloading daily prices from FMP
BKR: wrote 9604 rows
XOM: downloading daily prices from FMP
XOM: wrote 16075 rows
TPL: downloading daily prices from FMP
TPL: wrote 14206 rows
DVN: downloading daily prices from FMP
DVN: wrote 10036 rows
TRGP: downloading daily