In [1]:
import os
import pandas as pd
import requests
import time
import json
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import seaborn as sns 

from pygam import LinearGAM, s
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# --- CONFIG ---
# The API key is read from the environment so the secret never lives in the
# notebook or its version history. Set it before starting Jupyter, e.g.
#   export FMP_API_KEY="<your key>"
# NOTE(review): a key was previously hard-coded in this cell; treat it as leaked
# and rotate it.
API_KEY = os.environ.get("FMP_API_KEY")
data_folder = os.path.join(os.path.expanduser("~/Desktop/Trading"), "Data")
os.makedirs(data_folder, exist_ok=True)
tickers_csv_file = os.path.join(data_folder, "sp500_tickers.csv")

# --- LOAD FROM CACHE OR FETCH ---
if os.path.exists(tickers_csv_file):
    print("Loading tickers from CSV cache...")
    df_sp500 = pd.read_csv(tickers_csv_file)
else:
    if not API_KEY:
        raise RuntimeError(
            "No ticker cache found and FMP_API_KEY is not set; "
            "export FMP_API_KEY before running this cell."
        )
    print("Fetching tickers from API...")
    response = requests.get(
        "https://financialmodelingprep.com/api/v3/sp500_constituent",
        params={"apikey": API_KEY},
        timeout=30,  # never hang the kernel on a stalled connection
    )
    response.raise_for_status()
    payload = response.json()

    # Only a non-empty list of constituent records is worth caching; anything
    # else (e.g. an error object) would silently poison every later run.
    if not isinstance(payload, list) or not payload:
        raise RuntimeError(f"Unexpected response from constituent endpoint: {payload!r}")
    df_sp500 = pd.DataFrame(payload)

    # Save to CSV
    df_sp500.to_csv(tickers_csv_file, index=False)
    print(f"Saved {len(df_sp500)} tickers to CSV cache.")


tickers = df_sp500["symbol"].dropna().unique().tolist()
# --- PREVIEW ---
print(df_sp500.shape)
print(df_sp500.columns)
print(df_sp500.head(5))

Loading tickers from CSV cache...
(503, 8)
Index(['symbol', 'name', 'sector', 'subSector', 'headQuarter',
       'dateFirstAdded', 'cik', 'founded'],
      dtype='object')
  symbol                  name                  sector  \
0    XYZ           Block, Inc.              Technology   
1    TTD  The Trade Desk, Inc.              Technology   
2   DDOG               Datadog              Technology   
3   COIN       Coinbase Global      Financial Services   
4   DASH              DoorDash  Communication Services   

                            subSector              headQuarter dateFirstAdded  \
0           Software - Infrastructure      Oakland, California     2025-07-23   
1              Software - Application      Ventura, California     2025-07-18   
2              Software - Application  New York City, New York     2025-07-09   
3  Financial - Data & Stock Exchanges     Wilmington, Delaware     2025-05-19   
4      Internet Content & Information        San Francisco, CA     2025-03

In [3]:
def fetch_statement(endpoint, tickers, period, limit, data_folder, api_key=None, timeout=30):
    """Fetch one statement type for many tickers, backed by a JSON file cache.

    The cache file is ``<data_folder>/<endpoint>_<period>_limit<limit>.json``.
    When it exists, its contents are returned and no request is made.

    Parameters
    ----------
    endpoint : str
        API path segment, e.g. ``"income-statement"``.
    tickers : iterable of str
        Symbols to request; each returned row is tagged with its ``"symbol"``.
    period, limit
        Forwarded as the ``period`` and ``limit`` query parameters.
    data_folder : str
        Directory holding the cache file.
    api_key : str, optional
        Key sent as ``apikey``. Defaults to the module-level ``API_KEY``.
    timeout : float, optional
        Per-request timeout in seconds.

    Returns
    -------
    list of dict
        All rows collected across tickers (or the cached rows).

    Notes
    -----
    Per-ticker failures are reported and skipped. If failures occurred and no
    rows at all were collected (for example an invalid key), the cache file is
    NOT written, so a later run retries instead of loading an empty cache.
    """
    output_file = os.path.join(
        data_folder,
        f"{endpoint}_{period}_limit{limit}.json"
    )

    if os.path.exists(output_file):
        print(f"Loading from cache: {output_file}")
        with open(output_file, "r") as f:
            return json.load(f)

    if api_key is None:
        api_key = API_KEY  # key defined in the notebook's config cell

    records = []
    failed = []
    for ticker in tickers:
        url = f"https://financialmodelingprep.com/api/v3/{endpoint}/{ticker}"
        params = {"period": period, "limit": limit, "apikey": api_key}
        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            # A successful payload is a list of row dicts; anything else
            # (such as an error object) is treated as a failure for this ticker.
            if not isinstance(data, list):
                raise ValueError(f"unexpected payload: {data!r}")
            for row in data:
                row["symbol"] = ticker
            records.extend(data)
        except Exception as e:
            # Exception text can embed the request URL; keep the key out of output.
            message = str(e)
            if api_key:
                message = message.replace(str(api_key), "***")
            print(f"Error fetching {ticker} ({endpoint}): {message}")
            failed.append(ticker)
        time.sleep(.2)  # API polite rate limit

    if failed:
        print(f"{len(failed)} tickers failed for {endpoint}: {failed[:10]}")
        if not records:
            print(f"Nothing fetched for {endpoint}; cache not written so the next run retries.")
            return records

    with open(output_file, "w") as f:
        json.dump(records, f, indent=2)
    print(f"Saved {len(records)} records to {output_file}")
    return records


# Pull (or load from cache) two annual statements per ticker for each statement type.
income_data_2_years, balance_data_2_years, cashflow_data_2_years = [
    fetch_statement(statement_endpoint, tickers, "annual", 2, data_folder)
    for statement_endpoint in (
        "income-statement",
        "balance-sheet-statement",
        "cash-flow-statement",
    )
]

# Wrap each list of row dicts in a DataFrame (same names, now tabular).
income_data_2_years, balance_data_2_years, cashflow_data_2_years = map(
    pd.DataFrame,
    (income_data_2_years, balance_data_2_years, cashflow_data_2_years),
)

# Quick size check of each table.
print("Income shape:", income_data_2_years.shape)
print("Balance shape:", balance_data_2_years.shape)
print("Cash flow shape:", cashflow_data_2_years.shape)

# Column listing of each table, one per line of output.
print(
    income_data_2_years.columns,
    balance_data_2_years.columns,
    cashflow_data_2_years.columns,
    sep="\n",
)

Loading from cache: /Users/nicholassanso/Desktop/Trading/Data/income-statement_annual_limit2.json
Loading from cache: /Users/nicholassanso/Desktop/Trading/Data/balance-sheet-statement_annual_limit2.json
Loading from cache: /Users/nicholassanso/Desktop/Trading/Data/cash-flow-statement_annual_limit2.json
Income shape: (1006, 38)
Balance shape: (1006, 54)
Cash flow shape: (1006, 40)
Index(['date', 'symbol', 'reportedCurrency', 'cik', 'fillingDate',
       'acceptedDate', 'calendarYear', 'period', 'revenue', 'costOfRevenue',
       'grossProfit', 'grossProfitRatio', 'researchAndDevelopmentExpenses',
       'generalAndAdministrativeExpenses', 'sellingAndMarketingExpenses',
       'sellingGeneralAndAdministrativeExpenses', 'otherExpenses',
       'operatingExpenses', 'costAndExpenses', 'interestIncome',
       'interestExpense', 'depreciationAndAmortization', 'ebitda',
       'ebitdaratio', 'operatingIncome', 'operatingIncomeRatio',
       'totalOtherIncomeExpensesNet', 'incomeBeforeTax',
  

In [7]:
def sort_by_symbol_date(df):
    """Return a new frame with rows ordered by ``symbol`` and then ``date``."""
    ordering_keys = ["symbol", "date"]
    ordered = df.sort_values(by=ordering_keys)
    return ordered


def compute_yoy_growth(df, exclude_cols=("symbol", "date", "link", "finalLink")):
    """Append a ``<col>_yoy`` percentage-change column for each numeric column.

    The change is computed within each ``symbol`` group between consecutive
    rows, so the caller must pass rows already ordered chronologically within
    each symbol (e.g. via ``sort_by_symbol_date``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``symbol`` column.
    exclude_cols : collection of str
        Column names that never get a growth column even if numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the ``_yoy`` columns.
        ``df`` itself is not modified.

    Notes
    -----
    ``fill_method=None`` is passed explicitly: missing values are NOT
    forward-filled before differencing, so a missing current-period value
    yields NaN growth instead of a spurious 0.0.
    """
    numeric_cols = [
        c for c in df.select_dtypes(include=[float, int]).columns
        if c not in exclude_cols
    ]
    if not numeric_cols:
        return df.copy()

    # One grouped pass over all columns, then a single concat, instead of
    # inserting columns one by one (which fragments wide frames).
    growth = (
        df.groupby("symbol")[numeric_cols]
        .pct_change(fill_method=None)
        .add_suffix("_yoy")
    )
    return pd.concat([df, growth], axis=1)


# Order each statement table by (symbol, date) before differencing rows.
income_sorted, balance_sorted, cashflow_sorted = [
    sort_by_symbol_date(statement_frame)
    for statement_frame in (
        income_data_2_years,
        balance_data_2_years,
        cashflow_data_2_years,
    )
]

# Add the *_yoy columns to each ordered table.
income_growth, balance_growth, cashflow_growth = [
    compute_yoy_growth(ordered_frame)
    for ordered_frame in (income_sorted, balance_sorted, cashflow_sorted)
]

In [8]:
def tickers_no_nan_or_zero(df, exclude_cols=["symbol", "date"]):
    """Flag symbols that have at least one fully populated row.

    A row counts as fully populated when every float/int column (other than
    those in ``exclude_cols``) is both non-NaN and non-zero. A symbol is
    flagged True when any of its rows qualifies.

    Returns
    -------
    (count, flags)
        ``flags`` is a boolean Series indexed by symbol; ``count`` is the
        number of True entries.
    """
    candidate_cols = df.select_dtypes(include=[float, int]).columns
    value_cols = [name for name in candidate_cols if name not in exclude_cols]

    def _has_complete_row(group):
        # True where the cell holds a real, non-zero number.
        populated = (group != 0) & (~group.isna())
        return populated.all(axis=1).any()

    valid_tickers = df.groupby("symbol")[value_cols].apply(_has_complete_row)
    return valid_tickers.sum(), valid_tickers

# Per statement type: how many tickers have at least one row with every
# numeric field populated and non-zero.
income_valid_count, income_valid_tickers = tickers_no_nan_or_zero(df=income_growth)
print("Income tickers with no NaN/0:", income_valid_count)

balance_valid_count, balance_valid_tickers = tickers_no_nan_or_zero(df=balance_growth)
print("Balance tickers with no NaN/0:", balance_valid_count)

cashflow_valid_count, cashflow_valid_tickers = tickers_no_nan_or_zero(df=cashflow_growth)
print("Cashflow tickers with no NaN/0:", cashflow_valid_count)


Income tickers with no NaN/0: 26
Balance tickers with no NaN/0: 0
Cashflow tickers with no NaN/0: 7


In [9]:
import pandas as pd

def count_zeros_nans_yoy(df):
    """Print and return per-column zero and NaN counts for the ``_yoy`` columns.

    Only float/int columns whose name ends with ``"_yoy"`` are considered.
    The result has one row per such column with ``zeros`` and ``nans`` counts,
    sorted ascending by ``zeros`` and then ``nans``. The full table is printed
    without row/column truncation.
    """
    float_int_cols = df.select_dtypes(include=[float, int]).columns
    yoy_frame = df[[name for name in float_int_cols if name.endswith("_yoy")]]

    summary = pd.DataFrame({
        "zeros": yoy_frame.eq(0).sum(),
        "nans": yoy_frame.isna().sum(),
    })
    summary = summary.sort_values(by=["zeros", "nans"], ascending=True)

    # Temporarily lift pandas' display limits so every column is listed.
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        print(summary)

    return summary

# Report zero/NaN counts for the YoY columns of each statement type.

# -- income statement
print("Income 0/NaN counts per YoY column:")
income_summary = count_zeros_nans_yoy(df=income_growth)

# -- balance sheet
print("\nBalance 0/NaN counts per YoY column:")
balance_summary = count_zeros_nans_yoy(df=balance_growth)

# -- cash flow statement
print("\nCashflow 0/NaN counts per YoY column:")
cashflow_summary = count_zeros_nans_yoy(df=cashflow_growth)


Income 0/NaN counts per YoY column:
                                             zeros  nans
revenue_yoy                                      0   503
grossProfit_yoy                                  0   503
costAndExpenses_yoy                              0   503
ebitdaratio_yoy                                  0   503
operatingIncome_yoy                              0   503
operatingIncomeRatio_yoy                         0   503
incomeBeforeTax_yoy                              0   503
incomeBeforeTaxRatio_yoy                         0   503
netIncomeRatio_yoy                               0   503
ebitda_yoy                                       0   504
operatingExpenses_yoy                            0   507
costOfRevenue_yoy                                0   510
totalOtherIncomeExpensesNet_yoy                  0   535
generalAndAdministrativeExpenses_yoy             0   756
netIncome_yoy                                    1   503
eps_yoy                                          1  

In [None]:
def clean_yoy_columns(df):
    """Return the ``_yoy`` columns of ``df`` restricted to rows free of NaN/±inf.

    Columns are selected when ``"_yoy"`` appears anywhere in the name. Infinite
    values are treated as missing, and any row with a missing value in any
    selected column is dropped. ``df`` is left unchanged.
    """
    growth_only = df[[name for name in df.columns if "_yoy" in name]]
    # ±inf becomes NaN so a single dropna removes both kinds of bad value.
    without_inf = growth_only.replace([np.inf, -np.inf], np.nan)
    return without_inf.dropna()

def run_pca_on_yoy(df_yoy, n_components=5):
    """Standardize ``df_yoy`` and project it onto ``n_components`` principal components.

    Returns
    -------
    (pca_df, explained_variance_ratio)
        ``pca_df`` has columns ``PCA1`` .. ``PCA<n_components>`` and a fresh
        default index (the index of ``df_yoy`` is not carried over);
        the second item is the fitted PCA's ``explained_variance_ratio_``.
    """
    standardized = StandardScaler().fit_transform(df_yoy)

    pca = PCA(n_components=n_components)
    scores = pca.fit_transform(standardized)

    component_labels = [f"PCA{number}" for number in range(1, n_components + 1)]
    pca_df = pd.DataFrame(scores, columns=component_labels)

    return pca_df, pca.explained_variance_ratio_

# ------------------------------
# Reduce each growth table to its finite *_yoy values
# ------------------------------
income_yoy_clean, balance_yoy_clean, cashflow_yoy_clean = [
    clean_yoy_columns(growth_frame)
    for growth_frame in (income_growth, balance_growth, cashflow_growth)
]

# ------------------------------
# PCA per statement type
# ------------------------------
income_pca, income_var = run_pca_on_yoy(df_yoy=income_yoy_clean)
balance_pca, balance_var = run_pca_on_yoy(df_yoy=balance_yoy_clean)
cashflow_pca, cashflow_var = run_pca_on_yoy(df_yoy=cashflow_yoy_clean)

# Share of variance captured by each retained component.
print("Income PCA explained variance:", income_var)
print("Balance PCA explained variance:", balance_var)
print("Cashflow PCA explained variance:", cashflow_var)


# Build the per-year income table used by the valuation cells below.
# `income_data_yoy` was previously referenced without ever being assigned, so
# this cell failed on a fresh kernel. It is now derived from the loaded income
# statements; sort_values returns a new frame, so `income_data_2_years` is
# left untouched and re-running this cell always starts from the same input.
income_data_yoy = income_data_2_years.sort_values(['symbol', 'calendarYear'])

# Define the metrics you want YoY growth for
metrics_to_grow = ['revenue', 'grossProfit', 'operatingIncome', 'netIncome', 'epsdiluted', 'ebitda']

# Loop through metrics and create YoY growth columns
for metric in metrics_to_grow:
    if metric in income_data_yoy.columns:
        # fill_method=None: do not forward-fill gaps before differencing, so a
        # missing value gives NaN growth rather than a spurious 0.0.
        income_data_yoy[f'{metric}_yoy'] = (
            income_data_yoy.groupby('symbol')[metric].pct_change(fill_method=None)
        )

# Check the results
print(income_data_yoy.head(20))


In [None]:
# Every column carrying a YoY growth figure (picked up by its "_yoy" suffix).
growth_cols = [name for name in income_data_yoy.columns if name.endswith('_yoy')]

# New frame holding only the rows where all growth columns are present;
# dropna returns a separate object, so income_data_yoy is unaffected.
income_growth_clean = income_data_yoy.dropna(subset=growth_cols)

print("Original shape:", income_data_yoy.shape)
print("Cleaned shape:", income_growth_clean.shape)

# income_growth_clean: rows with a non-missing value in every growth column


In [None]:
# 1. Set up paths
data_folder = os.path.join(os.path.expanduser("~/Desktop/Trading"), "Data")
csv_file    = os.path.join(data_folder, "sp500_tickers.csv")
json_file   = os.path.join(data_folder, "company_info.json")

# 2. Define the fields you want to keep
desired_fields = [
    "symbol", "companyName", "marketCap", "sector", "industry", "beta",
    "price", "lastAnnualDividend", "volume", "exchange", "exchangeShortName",
    "country", "isEtf", "isFund"
]

# 3. Load or fetch company info
records = []

if os.path.exists(json_file):
    print("Loading company info from JSON cache...")
    with open(json_file, "r") as f:
        records = json.load(f)

else:
    print("No cache found — fetching company info from API...")
    df      = pd.read_csv(csv_file)
    tickers = df['symbol'].dropna().unique().tolist()
    print(f"Found {len(tickers)} unique tickers.")

    # API_KEY comes from the config cell; the key is deliberately not repeated here.
    BASE_URL = "https://financialmodelingprep.com/api/v3/profile/{}?apikey={}"

    failed = []

    for ticker in tickers:
        url = BASE_URL.format(ticker, API_KEY)
        try:
            resp = requests.get(url, timeout=30)  # never hang on a stalled connection
            resp.raise_for_status()
            data = resp.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # Exception text can include the URL; keep the key out of the output.
            message = str(e)
            if API_KEY:
                message = message.replace(str(API_KEY), "***")
            print(f"Error fetching {ticker}: {message}")
            failed.append(ticker)
        else:
            if isinstance(data, list) and data:
                row = data[0]
                filtered = {k: row.get(k, None) for k in desired_fields}
                records.append(filtered)
            else:
                print(f"  • No profile data for {ticker}")
                failed.append(ticker)
        time.sleep(.2)

    if records:
        os.makedirs(data_folder, exist_ok=True)
        with open(json_file, "w") as f:
            json.dump(records, f, indent=2)
        print(f"Saved {len(records)} company records to cache.")
    else:
        # An empty cache file would be loaded forever after; skip it so the
        # next run retries the download.
        print("No company records fetched — cache not written.")
    if failed:
        print(f"{len(failed)} tickers failed to fetch: {failed[:5]}...")

# 4. Convert to DataFrame and inspect
company_df = pd.DataFrame(records)
if "symbol" in company_df.columns:  # absent when no records were loaded
    company_df = company_df.drop_duplicates(subset="symbol")

print("Columns returned:", company_df.columns.tolist())
print(company_df.columns)
print(company_df.shape)
company_df.head(10)

In [None]:
# Merge price from company_df into income_data_yoy based on 'symbol'.
# Any 'price' column left by an earlier run of this cell is dropped first;
# otherwise the merge would emit price_x/price_y and the lookup below would fail.
income_data_yoy = (
    income_data_yoy
    .drop(columns=['price'], errors='ignore')
    .merge(
        company_df[['symbol', 'price']],
        on='symbol',
        how='left'
    )
)

# Compute P/E
income_data_yoy['pe'] = income_data_yoy['price'] / income_data_yoy['epsdiluted']

# Compute natural log of P/E. The log is undefined for non-positive P/E
# (negative or zero EPS/price), so those rows get NaN explicitly instead of
# relying on numpy's invalid-value/divide warnings.
positive_pe = income_data_yoy['pe'].where(income_data_yoy['pe'] > 0)
income_data_yoy['log_pe'] = np.log(positive_pe)

# Optional: inspect
print(income_data_yoy[['symbol', 'epsdiluted', 'price', 'log_pe', 'pe']].head())


In [None]:
# Pair each row's log P/E with the net income growth of the SAME statement.
# Joining on 'symbol' alone matched every fiscal-year row of a ticker to its
# growth row, so each growth figure appeared once per year with a different
# log P/E. Joining on (symbol, date) keeps one observation per statement.
reg_data = pd.merge(
    income_data_yoy[['symbol', 'date', 'log_pe']],
    income_growth_clean[['symbol', 'date', 'netIncome_yoy']],
    on=['symbol', 'date'],
    how='inner'
)[['symbol', 'log_pe', 'netIncome_yoy']]

In [None]:
# Regression sample: 0 < netIncome_yoy < 0.3, with log_pe finite and < 5.

# Coerce to numeric first so every comparison below runs on numeric data;
# anything unparseable becomes NaN and is rejected by the mask (comparisons
# with NaN are False, and np.isfinite(NaN) is False).
reg_data = reg_data.assign(
    netIncome_yoy=pd.to_numeric(reg_data['netIncome_yoy'], errors='coerce'),
    log_pe=pd.to_numeric(reg_data['log_pe'], errors='coerce'),
)

in_sample = (
    (reg_data['netIncome_yoy'] > 0)
    & (reg_data['netIncome_yoy'] < .3)
    & np.isfinite(reg_data['log_pe'])
    & (reg_data['log_pe'] < 5)
)

# .copy() makes the filtered frame independent of the unfiltered one, so later
# column assignments cannot trigger SettingWithCopyWarning.
reg_data = reg_data.loc[in_sample].copy()

# Define dependent and independent variables
y = reg_data['log_pe']           # dependent
X = sm.add_constant(reg_data['netIncome_yoy'])    # independent with constant

# Fit OLS regression
model = sm.OLS(y, X).fit()

print(model.summary())

In [None]:
# Scatter + regression line
sns.scatterplot(x='netIncome_yoy', y='log_pe', alpha= 0.5, data=reg_data)
sns.lineplot(x=reg_data['netIncome_yoy'], y=model.predict(X), color='red')
plt.xlabel('Net Income Growth')
plt.ylabel('Log P/E')
# The title states the window actually applied when building reg_data
# (netIncome_yoy < .3); it previously advertised an upper bound of 3.
plt.title('Log P/E vs Net Income Growth (0 < Net Income Growth < 0.3)')
plt.show()

# Density plot of residuals
sns.kdeplot(model.resid, fill=True)
plt.xlabel('Residuals')
plt.title('Density Plot of Regression Residuals')
plt.show()


In [None]:
# Pair each row's log P/E with the revenue growth of the SAME statement.
# Joining on (symbol, date) rather than 'symbol' alone prevents each growth
# figure from being repeated once per fiscal-year row of the ticker.
reg_data = pd.merge(
    income_data_yoy[['symbol', 'date', 'log_pe']],          # log P/E from income_data_yoy
    income_growth_clean[['symbol', 'date', 'revenue_yoy']], # YoY growth from income_growth_clean
    on=['symbol', 'date'],                                   # same ticker, same statement
    how='inner'                                              # keep only rows present in both
)[['symbol', 'log_pe', 'revenue_yoy']]

# Coerce to numeric before comparing; unparseable entries become NaN and are
# rejected by the masks below (comparisons with NaN are False).
reg_data = reg_data.assign(
    revenue_yoy=pd.to_numeric(reg_data['revenue_yoy'], errors='coerce'),
    log_pe=pd.to_numeric(reg_data['log_pe'], errors='coerce'),
)

# Broad window: revenue_yoy between 0 and 3 (exclusive)
reg_data = reg_data[(reg_data['revenue_yoy'] > 0) & (reg_data['revenue_yoy'] < 3)]

# Regression sample: 0 < revenue_yoy < .15 with finite log_pe.
# .copy() makes reg_data_rev independent of reg_data, so it can be modified
# later without SettingWithCopyWarning.
reg_data_rev = reg_data.loc[
    (reg_data['revenue_yoy'] > 0)
    & (reg_data['revenue_yoy'] < .15)
    & np.isfinite(reg_data['log_pe'])
].copy()

# Define dependent and independent variables
y_rev = reg_data_rev['log_pe']           # dependent
X_rev = sm.add_constant(reg_data_rev['revenue_yoy'])    # independent with constant

# Fit OLS regression
model_rev = sm.OLS(y_rev, X_rev).fit()

# Regression summary
print(model_rev.summary())

# Figure 1: observations with the fitted OLS line drawn on the same axes.
fig, ax = plt.subplots()
sns.scatterplot(data=reg_data_rev, x='revenue_yoy', y='log_pe', alpha=0.5, ax=ax)
sns.lineplot(
    x=reg_data_rev['revenue_yoy'],
    y=model_rev.predict(X_rev),
    color='red',
    label='Regression line',
    ax=ax,
)
ax.set_xlabel('Revenue Growth')
ax.set_ylabel('Log P/E')
ax.set_title('Log P/E vs Revenue Growth (0 < Revenue Growth < .15)')
ax.legend()
plt.show()

# Figure 2: kernel density estimate of the regression residuals.
fig, ax = plt.subplots()
sns.kdeplot(model_rev.resid, fill=True, ax=ax)
ax.set_xlabel('Residuals')
ax.set_title('Density Plot of Regression Residuals (Revenue Growth)')
plt.show()
