In [1]:
import os
import pandas as pd
import requests
import time
import json
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from pygam import LinearGAM, s
from sklearn.metrics import mean_squared_error



In [5]:
# --- CONFIG ---
# The Financial Modeling Prep key is read from the environment so the secret
# is never stored in the notebook. Export FMP_API_KEY before launching Jupyter.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
API_KEY = os.environ.get("FMP_API_KEY", "")
if not API_KEY:
    print("Warning: FMP_API_KEY is not set; cached data will load, but API fetches will fail.")

data_folder = os.path.join(os.path.expanduser("~/Desktop/Trading"), "Data")
os.makedirs(data_folder, exist_ok=True)
tickers_csv_file = os.path.join(data_folder, "sp500_tickers.csv")

# --- LOAD FROM CACHE OR FETCH ---
if os.path.exists(tickers_csv_file):
    print("Loading tickers from CSV cache...")
    df_sp500 = pd.read_csv(tickers_csv_file)
else:
    if not API_KEY:
        raise RuntimeError(
            "Set the FMP_API_KEY environment variable to fetch the S&P 500 constituents."
        )
    print("Fetching tickers from API...")
    url = f"https://financialmodelingprep.com/api/v3/sp500_constituent?apikey={API_KEY}"
    # A timeout keeps a stalled connection from hanging the kernel forever,
    # and raise_for_status() turns HTTP errors into exceptions instead of
    # letting an error body flow into the cache.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    constituents = response.json()
    # Anything other than a non-empty list cannot be turned into a ticker
    # table, so refuse to build (and cache) a DataFrame from it.
    if not isinstance(constituents, list) or not constituents:
        raise ValueError(f"Unexpected S&P 500 constituent response: {constituents!r}")
    df_sp500 = pd.DataFrame(constituents)

    # Save to CSV
    df_sp500.to_csv(tickers_csv_file, index=False)
    print(f"Saved {len(df_sp500)} tickers to CSV cache.")

# --- PREVIEW ---
print(df_sp500.shape)
print(df_sp500.columns)

# Unique, non-null symbols; later cells iterate over this list.
tickers = df_sp500["symbol"].dropna().unique().tolist()

Loading tickers from CSV cache...
(503, 8)
Index(['symbol', 'name', 'sector', 'subSector', 'headQuarter',
       'dateFirstAdded', 'cik', 'founded'],
      dtype='object')


In [6]:
# Set up data folder and file paths
data_folder = os.path.join(os.path.expanduser("~/Desktop/Trading"), "Data")
csv_file = os.path.join(data_folder, "sp500_tickers.csv")
json_file = os.path.join(data_folder, "income_statements.json")

# Define which fields to keep
desired_fields = [
    'symbol',
    'date',
    'revenue',
    'grossProfit',
    'operatingIncome',
    'netIncome',
    'eps',
    'ebitda',
    'costOfRevenue',
    'operatingExpenses'
]

# Try loading from cached JSON if it exists
if os.path.exists(json_file):
    print("Loading data from JSON cache...")
    with open(json_file, 'r') as f:
        records = json.load(f)
else:
    # Load tickers from the CSV this cell just read, rather than from a
    # DataFrame that happens to be left over from another cell.
    df = pd.read_csv(csv_file)
    tickers = df['symbol'].dropna().unique().tolist()

    # The key comes from the environment so it is never stored in the notebook.
    API_KEY = os.environ.get("FMP_API_KEY", "")
    if not API_KEY:
        raise RuntimeError(
            "Set the FMP_API_KEY environment variable to fetch income statements."
        )
    records = []

    # Fetch most recent income statement per ticker
    for ticker in tickers:
        url = f'https://financialmodelingprep.com/api/v3/income-statement/{ticker}?limit=1&apikey={API_KEY}'
        try:
            # The timeout keeps one stalled request from blocking the whole loop.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
            if isinstance(data, list) and data:
                row = data[0]
                row['symbol'] = ticker
                filtered_row = {k: row.get(k, None) for k in desired_fields}
                records.append(filtered_row)
            elif data:
                # A non-list payload has no data[0]; report it instead of
                # crashing the loop and losing everything fetched so far.
                print(f"Unexpected response for {ticker}: {data!r}")
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers an undecodable JSON body. The exception text
            # of an HTTP error includes the request URL, so mask the key.
            print(f"Error fetching {ticker}: {str(e).replace(API_KEY, '***')}")
        time.sleep(.2)  # Rate limiting

    # Save to JSON
    with open(json_file, 'w') as f:
        json.dump(records, f, indent=2)
    print("Saved data to JSON cache.")

# Convert to DataFrame
income_df = pd.DataFrame(records)

# Show sample
print(income_df.shape)
print(income_df.columns)
income_df.head(2)

Loading data from JSON cache...
(503, 10)
Index(['symbol', 'date', 'revenue', 'grossProfit', 'operatingIncome',
       'netIncome', 'eps', 'ebitda', 'costOfRevenue', 'operatingExpenses'],
      dtype='object')


Unnamed: 0,symbol,date,revenue,grossProfit,operatingIncome,netIncome,eps,ebitda,costOfRevenue,operatingExpenses
0,XYZ,2024-12-31,24121053000,8889036000,892327000,2897047000,4.7,1345598000,15232017000,7996709000
1,TTD,2024-12-31,2444831000,1972819000,427167000,393076000,0.8,514657000,472012000,1545652000


In [None]:
def fetch_statement(endpoint, tickers, period, limit, data_folder, api_key=None, timeout=30):
    """Fetch statements for every ticker, cached in a file named after the query.

    The cache file is ``{endpoint}_{period}_limit{limit}.json`` inside
    ``data_folder``. If it already exists, its contents are returned and no
    request is made.

    Args:
        endpoint: FMP v3 endpoint path segment, e.g. ``"income-statement"``.
        tickers: Iterable of ticker symbols to query.
        period: Value sent as the ``period`` query parameter.
        limit: Value sent as the ``limit`` query parameter.
        data_folder: Directory holding the cache file.
        api_key: API key to use. Defaults to the notebook-level ``API_KEY``,
            which is only looked up when a fetch is actually needed.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of row dicts, each tagged with its ``"symbol"``.
    """
    output_file = os.path.join(
        data_folder,
        f"{endpoint}_{period}_limit{limit}.json"
    )

    if os.path.exists(output_file):
        print(f"Loading from cache: {output_file}")
        with open(output_file, "r") as f:
            return json.load(f)

    key = API_KEY if api_key is None else api_key

    records = []
    failed = []
    for ticker in tickers:
        url = f"https://financialmodelingprep.com/api/v3/{endpoint}/{ticker}?period={period}&limit={limit}&apikey={key}"
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            if isinstance(data, list):
                for row in data:
                    row["symbol"] = ticker
                records.extend(data)
            elif data:
                raise ValueError(f"unexpected response: {data!r}")
        except Exception as e:
            # HTTP error messages include the request URL; mask the key so it
            # does not end up in the notebook output.
            reason = str(e).replace(key, "***") if key else str(e)
            print(f"Error fetching {ticker} ({endpoint}): {reason}")
            failed.append(ticker)
        time.sleep(.2)  # API polite rate limit

    with open(output_file, "w") as f:
        json.dump(records, f, indent=2)
    print(f"Saved {len(records)} records to {output_file}")
    if failed:
        # The cache is written anyway, so make the gap visible: these tickers
        # stay missing until the cache file is deleted and refetched.
        print(f"Warning: {len(failed)} tickers failed and are missing from the cache: {failed[:5]}...")
    return records


# Two most recent annual filings per ticker, one cache file per statement type.
income_data = fetch_statement(
    "income-statement", tickers, period="annual", limit=2, data_folder=data_folder
)
balance_data = fetch_statement(
    "balance-sheet-statement", tickers, period="annual", limit=2, data_folder=data_folder
)
cashflow_data = fetch_statement(
    "cash-flow-statement", tickers, period="annual", limit=2, data_folder=data_folder
)


Loading from cache: /Users/nicholassanso/Desktop/Trading/Data/income-statement_annual_limit2.json
Saved 1006 records to /Users/nicholassanso/Desktop/Trading/Data/balance-sheet-statement_annual_limit2.json


In [None]:
data_folder = os.path.join(os.path.expanduser("~/Desktop/Trading"), "Data")
tickers_csv_path = os.path.join(data_folder, "sp500_tickers.csv")

# Output JSONs
# "income_statements.json" is already used by an earlier cell for a trimmed,
# 10-field income statement. Reusing that name here would make the cache check
# load the trimmed file instead of the full statement, so the full income
# statement gets its own file.
income_json = os.path.join(data_folder, "income_statements_full.json")
balance_json = os.path.join(data_folder, "balance_sheets.json")
cashflow_json = os.path.join(data_folder, "cash_flows.json")

# The key is read from the environment so it is never stored in the notebook.
API_KEY = os.environ.get("FMP_API_KEY", "")
if not API_KEY:
    print("Warning: FMP_API_KEY is not set; cached data will load, but API fetches will fail.")

# -------------------------
# Helper function
# -------------------------
def fetch_statement(endpoint, tickers, output_file, api_key=None, timeout=30):
    """Fetch the latest financial statement for all tickers from FMP.

    Each ticker is requested with ``limit=1`` and the first returned row is
    kept. If ``output_file`` already exists, its contents are returned and no
    request is made.

    NOTE(review): an earlier cell defines a function with the same name but a
    different signature (endpoint, tickers, period, limit, data_folder); this
    definition replaces it once the cell runs.

    Args:
        endpoint: FMP v3 endpoint path segment, e.g. ``"income-statement"``.
        tickers: Iterable of ticker symbols to query.
        output_file: Path of the JSON cache file to read or write.
        api_key: API key to use. Defaults to the notebook-level ``API_KEY``,
            which is only looked up when a fetch is actually needed.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list with one row dict per successfully fetched ticker, each tagged
        with its ``"symbol"``.
    """
    if os.path.exists(output_file):
        print(f"Loading from cache: {output_file}")
        with open(output_file, "r") as f:
            return json.load(f)

    key = API_KEY if api_key is None else api_key

    records = []
    failed = []
    for ticker in tickers:
        url = f"https://financialmodelingprep.com/api/v3/{endpoint}/{ticker}?limit=1&apikey={key}"
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            if isinstance(data, list):
                if data:
                    row = data[0]
                    row["symbol"] = ticker
                    records.append(row)
            elif data:
                raise ValueError(f"unexpected response: {data!r}")
        except Exception as e:
            # HTTP error messages include the request URL; mask the key so it
            # does not end up in the notebook output.
            reason = str(e).replace(key, "***") if key else str(e)
            print(f"Error fetching {ticker} ({endpoint}): {reason}")
            failed.append(ticker)
        time.sleep(.2)  # API polite rate limit

    with open(output_file, "w") as f:
        json.dump(records, f, indent=2)
    print(f"Saved {len(records)} records to {output_file}")
    if failed:
        # The cache is written anyway, so make the gap visible: these tickers
        # stay missing until the cache file is deleted and refetched.
        print(f"Warning: {len(failed)} tickers failed and are missing from the cache: {failed[:5]}...")
    return records

# -------------------------
# Main
# -------------------------
# Ticker universe: unique, non-null symbols from the cached constituents CSV.
tickers_df = pd.read_csv(tickers_csv_path)
tickers = tickers_df["symbol"].dropna().unique().tolist()

# One cached pull per statement type.
income_data = fetch_statement(
    endpoint="income-statement", tickers=tickers, output_file=income_json
)
balance_data = fetch_statement(
    endpoint="balance-sheet-statement", tickers=tickers, output_file=balance_json
)
cashflow_data = fetch_statement(
    endpoint="cash-flow-statement", tickers=tickers, output_file=cashflow_json
)

# Tabulate the raw record lists.
income_df, balance_df, cashflow_df = (
    pd.DataFrame(statement_rows)
    for statement_rows in (income_data, balance_data, cashflow_data)
)

# Quick size check, then the column inventory of each frame.
print("Income shape:", income_df.shape)
print("Balance shape:", balance_df.shape)
print("Cash flow shape:", cashflow_df.shape)

print(income_df.columns, balance_df.columns, cashflow_df.columns, sep="\n")

In [None]:
# Define paths
data_folder = os.path.join(os.path.expanduser("~/Desktop/Trading"), "Data")
tickers_csv_path = os.path.join(data_folder, "sp500_tickers.csv")
output_json_path = os.path.join(data_folder, "ratios.json")

# Try loading from cache
if os.path.exists(output_json_path):
    print("Loading ratios from JSON cache...")
    with open(output_json_path, "r") as f:
        all_ratios = json.load(f)
else:
    print("No cache found — fetching ratios from API...")
    tickers_df = pd.read_csv(tickers_csv_path)
    tickers = tickers_df['symbol'].dropna().unique().tolist()

    api_key = "YwnbHRjcJvf6Md2OPoKbSRGHlzZ7hjR6"
    base_url = "https://financialmodelingprep.com/api/v3/ratios/{}?limit=1&apikey={}"

    all_ratios = []

    for ticker in tickers:
        try:
            url = base_url.format(ticker, api_key)
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()
            if data:
                all_ratios.append(data[0])
            time.sleep(.2)
        except Exception as e:
            print(f"Error fetching {ticker}: {e}")

    # Save to cache
    with open(output_json_path, "w") as f:
        json.dump(all_ratios, f, indent=2)
    print(f"Saved {len(all_ratios)} records to JSON cache.")

# Convert to DataFrame
df_ratios = pd.DataFrame(all_ratios)
print(df_ratios.columns)
print(df_ratios.shape)

In [None]:
# 1. Set up paths
data_folder = os.path.join(os.path.expanduser("~/Desktop/Trading"), "Data")
csv_file    = os.path.join(data_folder, "sp500_tickers.csv")
json_file   = os.path.join(data_folder, "company_info.json")

# 2. Define the fields you want to keep
desired_fields = [
    "symbol", "companyName", "marketCap", "sector", "industry", "beta",
    "price", "lastAnnualDividend", "volume", "exchange", "exchangeShortName",
    "country", "isEtf", "isFund"
]

# 3. Load or fetch company info
records = []

if os.path.exists(json_file):
    print("Loading company info from JSON cache...")
    with open(json_file, "r") as f:
        records = json.load(f)

else:
    print("No cache found — fetching company info from API...")
    df      = pd.read_csv(csv_file)
    tickers = df['symbol'].dropna().unique().tolist()
    print(f"Found {len(tickers)} unique tickers.")

    API_KEY  = "YwnbHRjcJvf6Md2OPoKbSRGHlzZ7hjR6"
    BASE_URL = "https://financialmodelingprep.com/api/v3/profile/{}?apikey={}"

    failed = []

    for i, ticker in enumerate(tickers, 1):
        url = BASE_URL.format(ticker, API_KEY)
        try:
            resp = requests.get(url)
            resp.raise_for_status()
            data = resp.json()
            if isinstance(data, list) and data:
                row = data[0]
                filtered = {k: row.get(k, None) for k in desired_fields}
                records.append(filtered)
            else:
                print(f"  • No profile data for {ticker}")
                failed.append(ticker)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {ticker}: {e}")
            failed.append(ticker)
        time.sleep(.2)

    os.makedirs(data_folder, exist_ok=True)
    with open(json_file, "w") as f:
        json.dump(records, f, indent=2)
    print(f"Saved {len(records)} company records to cache.")
    if failed:
        print(f"{len(failed)} tickers failed to fetch: {failed[:5]}...")

# 4. Convert to DataFrame and inspect
company_df = pd.DataFrame(records)
company_df.drop_duplicates(subset="symbol", inplace=True)

print("Columns returned:", company_df.columns.tolist())
print(company_df.columns)
print(company_df.shape)


In [None]:
## --- PREP DATA ---
# NOTE(review): df_growth is not created in any cell visible above this one;
# confirm it is loaded before this cell runs on a fresh kernel.
# Convert to numeric (non-parsable entries become NaN)
df_ratios['priceEarningsRatio'] = pd.to_numeric(df_ratios['priceEarningsRatio'], errors='coerce')
df_growth['growthEPSDiluted'] = pd.to_numeric(df_growth['growthEPSDiluted'], errors='coerce')

# Merge: one row per symbol present in both tables
merged_df = pd.merge(
    df_ratios[['symbol', 'priceEarningsRatio']],
    df_growth[['symbol', 'growthEPSDiluted']],
    on='symbol',
    how='inner'
)

# Drop missing values, then keep strictly positive P/E and growth
# (the log below needs P/E > 0).
reg_df = merged_df.dropna(subset=['priceEarningsRatio', 'growthEPSDiluted'])
positive_rows = (reg_df['growthEPSDiluted'] > 0) & (reg_df['priceEarningsRatio'] > 0)
# Copy AFTER filtering so the column assignment below writes to an
# independent frame instead of a filtered selection of another one.
reg_df = reg_df[positive_rows].copy()

# Log-transform P/E
reg_df['ln_PE'] = np.log(reg_df['priceEarningsRatio'])

# --- RUN REGRESSIONS AT DIFFERENT CAPS ---
MIN_OBSERVATIONS = 5  # skip caps that leave fewer rows than this
# 5% increments from 0.05 to 1.00. linspace + round gives clean cap values;
# a float-step arange accumulates rounding error in the generated caps.
growth_caps = np.round(np.linspace(0.05, 1.0, 20), 2)

results_table = []

for cap in growth_caps:
    temp_df = reg_df[reg_df['growthEPSDiluted'] <= cap]
    if len(temp_df) < MIN_OBSERVATIONS:  # skip if not enough data
        continue

    X = sm.add_constant(temp_df['growthEPSDiluted'])
    y = temp_df['ln_PE']
    # After the loop, `model` stays bound to the fit for the last cap tried;
    # use best_model_row['model'] for the selected fit.
    model = sm.OLS(y, X).fit()

    results_table.append({
        'cap': cap,
        'r2': model.rsquared,
        't_stat': model.tvalues['growthEPSDiluted'],
        'mse': mean_squared_error(y, model.fittedvalues),
        'model': model
    })

if not results_table:
    # Without this guard the sort below fails with an opaque KeyError.
    raise ValueError(
        f"No growth cap left at least {MIN_OBSERVATIONS} observations; cannot fit any regression."
    )

# Convert to DataFrame, lowest in-sample MSE first.
# NOTE(review): each cap is fitted on a different subsample, so these MSE
# values are not measured on the same data — confirm this is the intended
# selection criterion.
results_df = pd.DataFrame(results_table).sort_values('mse')

# --- PRINT RESULTS ---
print(results_df[['cap', 'r2', 't_stat', 'mse']])

best_model_row = results_df.iloc[0]
print("\nBest growth cap:", best_model_row['cap'])
print(best_model_row['model'].summary())


In [None]:
# Draw the line from the fit selected in the regression cell. The bare name
# `model` is only whatever the last loop iteration of that cell left behind,
# which is not necessarily the selected fit.
best_fit = best_model_row['model']

# Scatterplot of every observation in the regression sample
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(reg_df['growthEPSDiluted'], reg_df['ln_PE'], alpha=0.6, edgecolors='k')

# Regression line: the fit is linear, so a handful of x points is enough
x_vals = np.linspace(reg_df['growthEPSDiluted'].min(), reg_df['growthEPSDiluted'].max(), 5)
y_vals = best_fit.params['const'] + best_fit.params['growthEPSDiluted'] * x_vals
ax.plot(x_vals, y_vals, color='red', linewidth=2, label='OLS fit')

# Zoom in on the 0-30% growth range
ax.set_xlim(0, .3)
ax.set_ylim(2, 5)

# Labels & title
ax.set_xlabel("EPS Growth (decimal form, e.g., 0.30 = 30%)", fontsize=12)
ax.set_ylabel("ln(P/E)", fontsize=12)
ax.set_title("Log(P/E) vs EPS Growth", fontsize=14)
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Non‐linear threshold model: allow different slopes below/above gamma
def threshold_model(gamma, data=None):
    """Fit ln_PE on EPS growth with separate slopes below and above ``gamma``.

    Two regressors are built from ``growthEPSDiluted``: ``below`` equals the
    growth value where growth <= gamma and 0 elsewhere; ``above`` equals the
    growth value where growth > gamma and 0 elsewhere. The model
    ``ln_PE ~ below + above`` is then fitted by OLS.

    The regressors are added to a copy, so the input frame (by default the
    notebook-level ``reg_df``) is not modified by the call.

    Args:
        gamma: Growth threshold separating the two slope regimes.
        data: DataFrame with ``growthEPSDiluted`` and ``ln_PE`` columns.
            Defaults to the notebook-level ``reg_df``.

    Returns:
        The fitted statsmodels OLS results object.
    """
    frame = reg_df if data is None else data
    growth = frame['growthEPSDiluted']
    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns.
    design = frame.assign(
        below=(growth <= gamma) * growth,
        above=(growth > gamma) * growth,
    )
    formula = 'ln_PE ~ below + above'
    return smf.ols(formula, data=design).fit()

# Grid‐search gamma in [0.05,1.0]
# 20 evenly spaced candidate thresholds, each scored by its residual sum of squares.
gammas = list(np.linspace(0.05, 1, 20))
rss = [sum(threshold_model(candidate).resid**2) for candidate in gammas]

# Keep the threshold with the smallest RSS and refit the model at that value.
best_gamma = gammas[np.argmin(rss)]
best_model = threshold_model(best_gamma)
