In [16]:
import os
import glob
from datetime import datetime, timezone
from dateutil import tz

import pandas as pd
import yfinance as yf
import requests
import numpy as np

ET = tz.gettz("American/New_York")


In [17]:
folder_path = "./Raw Historical Data/."

# get all tickers
all_files = glob.glob(os.path.join(folder_path, "*.csv"))

tickers = []

for file in all_files:
    ticker = os.path.splitext(os.path.basename(file))[0]
    tickers.append(ticker)

print(tickers)

['AAPL', 'AMD', 'AMZN', 'BA', 'BAC', 'CAT', 'GOOGL', 'GS', 'HD', 'JNJ', 'JPM', 'KO', 'META', 'MSFT', 'NFLX', 'NVDA', 'PFE', 'PG', 'SPY', 'TSLA', 'XOM']


In [18]:
def to_utc_naive(ts):
    """accepts pandas/py datetime (tz-aware or not) and returns UTC naive"""
    if ts is None or pd.isna(ts):
        return None
    ts = pd.to_datetime(ts, utc=True)
    return ts.tz_convert("UTC").tz_localize(None)

In [19]:
def infer_amc_bmo_from_ts(utc_ts):
    """Classy if BMO/AMC from a UTC timestamp by converting to ET with DST."""
    if utc_ts is None or pd.isna(utc_ts):
        return "UNKNOWN"
    # Convert to ET
    et_ts = utc_ts.replace(tzinfo=timezone.utc).astimezone(ET)
    hhmm = et_ts.hour * 60 + et_ts.minute

    if hhmm < 570:
        return "BMO"
    elif hhmm >= 960:
        return "AMC"
    else:
        return "INTRADAY"
    

In [20]:
def surprise_pct(actual, est):
    if pd.isna(actual) or pd.isna(est) or est == 0:
        return np.nan
    return (actual - est) / abs(est)

In [21]:
def et_calendar_date(utc_ts):
    """Date the market would consider 'that earnings day in ET'"""
    if utc_ts is None or pd.isna(utc_ts):
        return pd.NaT
    et_ts = utc_ts.replace(tzinfo=timezone.utc).astimezone(ET)
    return pd.Timestamp(et_ts.date())

In [29]:
def add_eps_surprise(df):
    df["eps_surprise_pct"] = np.where(
        (df["eps_consensus"].notna()) & (df["eps_consensus"] != 0),
        (df["eps_actual"] - df["eps_consensus"]) / df["eps_consensus"].abs(),
        np.nan
    )
    return df

In [23]:
def fetch_yf_events(tickers, limit=24):
    records = []
    for t in tickers:
        try:
            df = yf.Ticker(t).get_earnings_dates(limit=limit)
            if df is None or df.empty:
                continue
            df = df.reset_index()
            df["ticker"] = t

            # Drop rows that are not Earnings
            if "Event Type" in df.columns:
                df = df[df["Event Type"].str.contains("Earnings", case=False, na=False)]

            df.rename(columns={
                "Earnings Date":"report_ts",
                "Reported EPS":"eps_actual",
                "EPS Estimate":"eps_consensus"
            }, inplace=True)
            # Normalize report_ts to UTC naive
            df["report_ts"] = pd.to_datetime(df["report_ts"], utc=True).dt.tz_convert("UTC").dt.tz_localize(None)

            # Apply amc_bmo classification
            df["amc_bmo"] = df["report_ts"].apply(infer_amc_bmo_from_ts)
            
            records.append(df[["ticker","report_ts", "amc_bmo", "eps_actual","eps_consensus"]])
        except Exception as e:
            print(f"Failed {t}: {e}")
    return pd.concat(records, ignore_index=True if records else pd.DataFrame())

        

In [24]:
def fetch_nasdaq(tickers):
    records = []
    headers = {"User-Agent": "Mozilla/5.0"}
    for t in tickers:
        try:
            url = f"https://api.nasdaq.com/api/company/{t}/earnings-surprise"
            r = requests.get(url, headers=headers, timeout=10)
            data = r.json()
            if not data or "data" not in data:
                continue
            table = data["data"].get("earningsSurpriseTable", {}).get("rows", [])
            for record in table:
                records.append({
                    "ticker": t,
                    "report_ts": record.get("dateReported"),  # usually "YYYY-MM-DD"
                    "eps_actual": record.get("eps"),
                    "eps_consensus": record.get("consensusForecast"),
                    "amc_bmo": "UNKNOWN"
                })
        except Exception as e:
            print(f"Failed {t}: {e}")
    
    return pd.DataFrame(records)

In [39]:
def merge_events(yf_df, nz_df):
    # Make sure both have the same column set
    common_cols = ["ticker","report_ts","amc_bmo","eps_actual","eps_consensus"]
    yf = yf_df[common_cols].copy()
    nz = nz_df[common_cols].copy()

    # Concat with yfinance first
    merged = pd.concat([yf, nz], ignore_index=True)

    # Drop duplicates, keep first
    merged = merged.sort_values(["ticker", "report_ts"], kind="mergesort")
    merged = merged.drop_duplicates(subset=["ticker", "report_ts"], keep="first")

    # sort for readability
    merged = merged.sort_values(["ticker", "report_ts"]).reset_index(drop=True)

    # Make sure everything is the right type

    merged = merged.astype({
        "ticker": "string",
        "report_ts": "string",
        "amc_bmo": "category",
        "eps_actual": float,
        "eps_consensus": float
    })

    # clean
    merged = merged.dropna(subset=["eps_actual", "eps_consensus"]).reset_index(drop=True)

    return merged

In [40]:
test_df = fetch_nasdaq(tickers=tickers)
test_df.head()

Failed SPY: 'NoneType' object has no attribute 'get'


Unnamed: 0,ticker,report_ts,eps_actual,eps_consensus,amc_bmo
0,AAPL,7/31/2025,1.57,1.42,UNKNOWN
1,AAPL,5/1/2025,1.65,1.61,UNKNOWN
2,AAPL,1/30/2025,2.4,2.36,UNKNOWN
3,AAPL,10/31/2024,1.64,1.49,UNKNOWN
4,AMD,8/5/2025,0.27,0.28,UNKNOWN


In [45]:
test2_df = fetch_yf_events(tickers=tickers, limit=50)
test2_df.head()

Unnamed: 0,ticker,report_ts,amc_bmo,eps_actual,eps_consensus
0,AAPL,2025-05-01 20:30:00,AMC,1.65,1.63
1,AAPL,2025-01-30 21:31:00,AMC,2.4,2.35
2,AAPL,2024-10-31 20:31:00,AMC,1.64,1.6
3,AAPL,2024-08-01 20:30:00,AMC,1.4,1.35
4,AAPL,2024-05-02 20:31:00,AMC,1.53,1.5


In [46]:
test3 = merge_events(test2_df, test_df)
test3 = add_eps_surprise(test3)
print(test3.head())

print(test3.info())
test3.to_csv("events.csv")

  ticker            report_ts amc_bmo  eps_actual  eps_consensus  \
0   AAPL  2013-04-23 04:00:00     BMO        0.36           0.36   
1   AAPL  2013-07-23 04:00:00     BMO        0.27           0.26   
2   AAPL  2013-10-28 04:00:00     BMO        0.30           0.28   
3   AAPL  2014-01-27 05:00:00     BMO        0.52           0.50   
4   AAPL  2014-04-23 04:00:00     BMO        0.42           0.36   

   eps_surprise_pct  
0          0.000000  
1          0.038462  
2          0.071429  
3          0.040000  
4          0.166667  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1029 entries, 0 to 1028
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   ticker            1029 non-null   string  
 1   report_ts         1029 non-null   string  
 2   amc_bmo           1029 non-null   category
 3   eps_actual        1029 non-null   float64 
 4   eps_consensus     1029 non-null   float64 
 5   eps_surprise

In [43]:
null_rows =test3[test3["eps_consensus"].isna()]
print(null_rows)

Empty DataFrame
Columns: [ticker, report_ts, amc_bmo, eps_actual, eps_consensus, eps_surprise_pct]
Index: []
