In [1]:
import pandas as pd

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

In [3]:
tables = pd.read_html(url)

In [4]:
sp500_df = tables[0]

In [5]:
sp500_df['year_added'] = pd.to_datetime(sp500_df['Date added']).dt.year

In [6]:
sp500_df['year_added'].value_counts()

year_added
1957    53
2017    23
2016    23
2019    22
2008    17
2024    16
2022    16
2021    15
2023    15
2018    14
2015    14
1997    14
2012    14
2002    13
2007    12
2020    12
2009    12
1976    11
1998    11
2006    10
2011    10
2013    10
2010     9
1999     9
2000     9
2001     8
2014     8
1985     7
1994     7
1995     7
2005     7
2004     6
1989     5
2025     5
1982     5
2003     5
1984     5
1988     4
1993     3
1980     3
1992     3
1986     3
1983     3
1981     3
1996     2
1973     2
1987     2
1972     2
1969     2
1979     2
1965     2
1975     2
1970     2
1991     1
1974     1
1964     1
1978     1
Name: count, dtype: int64

In [7]:
import yfinance as yf

In [8]:
def calculate_ytd_performance(ticker, start_date, end_date):
    try:
        data = yf.download(ticker, start=start_date, end=end_date)

        if data.empty:
            print(f"No data found for {ticker} between {start_date} and {end_date}. Skipping.")
            return None
        start_price = data['Close'].iloc[0].item()
        end_price = data['Close'].iloc[-1].item()

        performance = ((end_price - start_price) / start_price) * 100
        return performance

    except Exception as e:
        print(f"Error processing {ticker}: {e}")
        return None

In [9]:
calculate_ytd_performance('^GSPC', '2025-01-01', '2025-05-02')

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


-4.505536747043588

In [10]:
start_date='2025-01-01'
end_date='2025-05-01'

In [11]:
major_indices = {
    'S&P 500 (US)': '^GSPC',
    'Shanghai Composite (China)': '000001.SS',
    'Hang Seng Index (Hong Kong)': '^HSI',
    'S&P/ASX 200 (Australia)': '^AXJO',
    'BSE SENSEX (India)': '^BSESN',
    'TSX Composite (Canada)': '^GSPTSE',
    'DAX (Germany)': '^GDAXI',
    'FTSE 100 (UK)': '^FTSE',
    'Nikkei 225 (Japan)': '^N225',
    'IPC Mexico': '^MXX',
    'Ibovespa (Brazil)': '^BVSP'
}

In [12]:
results = {index_name: calculate_ytd_performance(ticker, start_date, end_date) for index_name, ticker in major_indices.items()}

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [13]:
performance_df = pd.DataFrame.from_dict(results, orient='index', columns=['YTD Performance (%)'])
performance_df = performance_df.sort_values(by='YTD Performance (%)', ascending=False)

In [14]:
performance_df

Unnamed: 0,YTD Performance (%)
IPC Mexico,13.049444
Hang Seng Index (Hong Kong),12.720018
Ibovespa (Brazil),12.43871
DAX (Germany),12.346378
FTSE 100 (UK),2.84259
BSE SENSEX (India),2.209774
Shanghai Composite (China),0.504817
TSX Composite (Canada),-0.226126
S&P/ASX 200 (Australia),-0.9145
S&P 500 (US),-5.103301


In [15]:
results_3y = {index_name: calculate_ytd_performance(ticker, '2022-05-01', '2025-05-01') for index_name, ticker in major_indices.items()}
performance_df_3y = pd.DataFrame.from_dict(results_3y, orient='index', columns=['YTD Performance (%)'])
performance_df_3y = performance_df_3y.sort_values(by='YTD Performance (%)', ascending=False)
performance_df_3y

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,YTD Performance (%)
DAX (Germany),61.395129
BSE SENSEX (India),40.835191
Nikkei 225 (Japan),34.404756
S&P 500 (US),34.02048
Ibovespa (Brazil),26.658164
TSX Composite (Canada),20.053451
FTSE 100 (UK),12.347091
S&P/ASX 200 (Australia),10.605692
IPC Mexico,8.425565
Shanghai Composite (China),6.886816


In [16]:
results_5y = {index_name: calculate_ytd_performance(ticker, '2020-05-01', '2025-05-01') for index_name, ticker in major_indices.items()}
performance_df_5y = pd.DataFrame.from_dict(results_5y, orient='index', columns=['YTD Performance (%)'])
performance_df_5y = performance_df_5y.sort_values(by='YTD Performance (%)', ascending=False)
performance_df_5y

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,YTD Performance (%)
BSE SENSEX (India),153.007592
DAX (Germany),114.93657
S&P 500 (US),96.737219
Nikkei 225 (Japan),83.723618
Ibovespa (Brazil),71.239667
TSX Composite (Canada),69.912379
S&P/ASX 200 (Australia),54.905743
IPC Mexico,54.684126
FTSE 100 (UK),47.401576
Shanghai Composite (China),13.928827


In [17]:
from datetime import datetime

In [18]:
sp500_df = yf.download('^GSPC', start='1950-01-01', end=datetime.now().strftime('%Y-%m-%d'), interval='1d')

[*********************100%***********************]  1 of 1 completed


In [19]:
sp500_df.columns = sp500_df.columns.get_level_values(0)

In [20]:
correction_threshold = 0.05

In [43]:
prices = sp500_df['Close'].dropna()

all_time_highs = prices[prices.cummax() == prices]

In [52]:
high_dates = all_time_highs.index
results = []

for i in range(1, len(high_dates)):
    start_date = high_dates[i - 1]
    end_date = high_dates[i]
    segment = prices[start_date:end_date]
    
    if segment.empty:
        continue

    # Step 4: Minimum price in this segment
    low_price = segment.min()
    low_date = segment.idxmin()
    high_price = prices[start_date]

    # Step 5: Calculate drawdown %
    drawdown_pct = (high_price - low_price) / high_price * 100

    # Step 6: Filter for corrections >= 5%
    if drawdown_pct >= 5:
        duration = (low_date - start_date).days
        results.append({
            'start_date': start_date,
            'end_date': end_date,
            'lowest_date': low_date,
            'duration_days': duration,
            'drawdonw_pct': drawdown_pct,
            'high_price': high_price,
            'low_price': low_price
        })

result_df = pd.DataFrame(results)

In [53]:
result_df.sort_values(by='drawdonw_pct', ascending=False)

Unnamed: 0,start_date,end_date,lowest_date,duration_days,drawdonw_pct,high_price,low_price
56,2007-10-09,2013-03-28,2009-03-09,517,56.775388,1565.150024,676.530029
54,2000-03-24,2007-05-30,2002-10-09,929,49.146948,1527.459961,776.760010
24,1973-01-11,1980-07-17,1974-10-03,630,48.203593,120.239998,62.279999
22,1968-11-29,1972-03-06,1970-05-26,543,36.061641,108.370003,69.290001
65,2020-02-19,2020-08-18,2020-03-23,33,33.924960,3386.149902,2237.399902
...,...,...,...,...,...,...,...
25,1980-09-22,1980-10-06,1980-09-29,7,5.260731,130.399994,123.540001
67,2021-09-02,2021-10-21,2021-10-04,32,5.212538,4536.950195,4300.459961
23,1972-08-14,1972-11-01,1972-10-16,63,5.135501,112.550003,106.769997
50,1999-04-12,1999-04-22,1999-04-19,7,5.089688,1358.630005,1289.479980


In [54]:
result_df['duration_days'].quantile(0.50)

np.float64(39.0)

In [55]:
result_df['duration_days'].quantile(0.25)

np.float64(21.5)

In [56]:
result_df['duration_days'].quantile(0.75)

np.float64(89.0)

In [104]:
from dateutil import parser, tz

In [141]:
earnings_df = pd.read_csv('data/ha1_Amazon.csv', delimiter=';')
earnings_df

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise (%)
0,AMZN,Amazon.com Inc,"April 29, 2026 at 6 AM EDT",-,-,-
1,AMZN,Amazon.com Inc,"February 4, 2026 at 4 PM EST",-,-,-
2,AMZN,Amazon.com Inc,"October 29, 2025 at 6 AM EDT",-,-,-
3,AMZN,Amazon.com Inc,"July 30, 2025 at 4 PM EDT",-,-,-
4,AMZN,"Amazon.com, Inc.","May 1, 2025 at 4 PM EDT",???.36,???.59,+16.74
...,...,...,...,...,...,...
112,AMZN,"Amazon.com, Inc.","April 27, 1998 at 12 AM EDT",-,-,+13.92
113,AMZN,"Amazon.com, Inc.","January 22, 1998 at 12 AM EST",-,-,+11.41
114,AMZN,"Amazon.com, Inc.","October 27, 1997 at 12 AM EST",-,-,+13.29
115,AMZN,"Amazon.com, Inc.","July 10, 1997 at 12 AM EDT",-,-,+13.33


In [142]:
earnings_df['EPS Estimate'] = pd.to_numeric(earnings_df['EPS Estimate'], errors='coerce').astype(float)
earnings_df['Reported EPS'] = pd.to_numeric(earnings_df['Reported EPS'], errors='coerce').astype(float)
earnings_df['Surprise (%)'] = pd.to_numeric(earnings_df['Surprise (%)'], errors='coerce').astype(float)

In [143]:
tzinfos = {
    'EDT': tz.gettz('America/New_York'),
    'EST': tz.gettz('America/New_York'),
}

earnings_df['Earnings Date'] = earnings_df['Earnings Date'].apply(
    lambda x: parser.parse(str(x), tzinfos=tzinfos) if pd.notnull(x) else pd.NaT
)

In [144]:
ticker = yf.Ticker('AMZN')
price_df = ticker.history(period='max')
price_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1997-05-15 00:00:00-04:00,0.121875,0.125000,0.096354,0.097917,1443120000,0.0,0.0
1997-05-16 00:00:00-04:00,0.098438,0.098958,0.085417,0.086458,294000000,0.0,0.0
1997-05-19 00:00:00-04:00,0.088021,0.088542,0.081250,0.085417,122136000,0.0,0.0
1997-05-20 00:00:00-04:00,0.086458,0.087500,0.081771,0.081771,109344000,0.0,0.0
1997-05-21 00:00:00-04:00,0.081771,0.082292,0.068750,0.071354,377064000,0.0,0.0
...,...,...,...,...,...,...,...
2025-05-23 00:00:00-04:00,198.899994,202.369995,197.850006,200.990005,33393500,0.0,0.0
2025-05-27 00:00:00-04:00,203.089996,206.690002,202.190002,206.020004,34892000,0.0,0.0
2025-05-28 00:00:00-04:00,205.919998,207.660004,204.410004,204.720001,28549800,0.0,0.0
2025-05-29 00:00:00-04:00,208.029999,208.809998,204.229996,205.699997,34650000,0.0,0.0


In [145]:
price_df['2day_return'] = price_df['Close'].shift(-2) / price_df['Close'] - 1
price_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,2day_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1997-05-15 00:00:00-04:00,0.121875,0.125000,0.096354,0.097917,1443120000,0.0,0.0,-0.127659
1997-05-16 00:00:00-04:00,0.098438,0.098958,0.085417,0.086458,294000000,0.0,0.0,-0.054211
1997-05-19 00:00:00-04:00,0.088021,0.088542,0.081250,0.085417,122136000,0.0,0.0,-0.164639
1997-05-20 00:00:00-04:00,0.086458,0.087500,0.081771,0.081771,109344000,0.0,0.0,-0.146494
1997-05-21 00:00:00-04:00,0.081771,0.082292,0.068750,0.071354,377064000,0.0,0.0,0.051097
...,...,...,...,...,...,...,...,...
2025-05-23 00:00:00-04:00,198.899994,202.369995,197.850006,200.990005,33393500,0.0,0.0,0.018558
2025-05-27 00:00:00-04:00,203.089996,206.690002,202.190002,206.020004,34892000,0.0,0.0,-0.001553
2025-05-28 00:00:00-04:00,205.919998,207.660004,204.410004,204.720001,28549800,0.0,0.0,0.001417
2025-05-29 00:00:00-04:00,208.029999,208.809998,204.229996,205.699997,34650000,0.0,0.0,


In [146]:
earnings_df['Surprise_Positive'] = (
    (earnings_df['Reported EPS'] > earnings_df['EPS Estimate']) |
    (earnings_df.get('Surprise (%)', 0) > 0)
)
earnings_df

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise (%),Surprise_Positive
0,AMZN,Amazon.com Inc,2026-04-29 06:00:00-04:00,,,,False
1,AMZN,Amazon.com Inc,2026-02-04 16:00:00-05:00,,,,False
2,AMZN,Amazon.com Inc,2025-10-29 06:00:00-04:00,,,,False
3,AMZN,Amazon.com Inc,2025-07-30 16:00:00-04:00,,,,False
4,AMZN,"Amazon.com, Inc.",2025-05-01 16:00:00-04:00,,,16.74,True
...,...,...,...,...,...,...,...
112,AMZN,"Amazon.com, Inc.",1998-04-27 00:00:00-04:00,,,13.92,True
113,AMZN,"Amazon.com, Inc.",1998-01-22 00:00:00-05:00,,,11.41,True
114,AMZN,"Amazon.com, Inc.",1997-10-27 00:00:00-05:00,,,13.29,True
115,AMZN,"Amazon.com, Inc.",1997-07-10 00:00:00-04:00,,,13.33,True


In [148]:
returns_on_surprise = []
price_dates = price_df.index

for _, row in earnings_df[earnings_df['Surprise_Positive']].iterrows():
    earnings_date = row['Earnings Date']
    trading_days = price_dates[(price_dates >= earnings_date - timedelta(days=1)) &
        (price_dates <= earnings_date + timedelta(days=1))]
    if len(trading_days) == 0:
        continue
    trading_day = trading_days[0]

    return_value = price_df.loc[trading_day, '2day_return']
    if pd.notna(return_value):
        returns_on_surprise.append(return_value)


In [149]:
median_return_percent = np.median(returns_on_surprise) * 100
median_return_percent

np.float64(1.5743394070905303)

In [168]:
import pandas as pd
import yfinance as yf
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

def load_earnings_data(filename):
    """
    Load earnings data from CSV file with semicolon delimiter
    """
    print("Step 1: Loading earnings data...")
    try:
        # Load with semicolon delimiter as specified
        earnings_df = pd.read_csv(filename, delimiter=';')
        print(f"Loaded {len(earnings_df)} earnings records")
        print(f"Columns: {list(earnings_df.columns)}")
        
        # Convert Date column to datetime
        tzinfos = {
            'EDT': tz.gettz('America/New_York'),
            'EST': tz.gettz('America/New_York'),
        }
        
        earnings_df['Earnings Date'] = earnings_df['Earnings Date'].apply(
            lambda x: parser.parse(str(x), tzinfos=tzinfos) if pd.notnull(x) else pd.NaT
        )
        
        return earnings_df
    except Exception as e:
        print(f"Error loading earnings data: {e}")
        return None

def download_price_data(symbol='AMZN', start_date='2010-01-01'):
    """
    Download complete historical price data using yfinance
    """
    print("Step 2: Downloading historical price data...")
    try:
        # Download data from start_date to current date
        ticker = yf.Ticker(symbol)
        price_df = ticker.history(period='max')
        
        # Reset index to make Date a column
        price_df = price_df.reset_index()
        
        print(f"Downloaded {len(price_df)} price data points")
        print(f"Date range: {price_df['Date'].min()} to {price_df['Date'].max()}")
        
        return price_df
    except Exception as e:
        print(f"Error downloading price data: {e}")
        return None

def calculate_2day_returns(price_df):
    """
    Calculate 2-day percentage changes for all historical dates
    For each sequence of 3 consecutive trading days (Day 1, Day 2, Day 3),
    compute return as Close_Day3 / Close_Day1 - 1
    """
    print("Step 3: Calculating 2-day percentage changes...")
    
    # Sort by date to ensure proper order
    price_df = price_df.sort_values('Date').reset_index(drop=True)
    
    returns_data = []
    
    # Calculate 2-day returns for each sequence of 3 consecutive days
    for i in range(len(price_df) - 2):
        day1_close = price_df.loc[i, 'Close']
        day3_close = price_df.loc[i + 2, 'Close']
        
        # Calculate return as (Close_Day3 / Close_Day1) - 1
        return_2day = (day3_close / day1_close) - 1
        
        returns_data.append({
            'Day1_Date': price_df.loc[i, 'Date'],
            'Day2_Date': price_df.loc[i + 1, 'Date'],  # Earnings announcement day
            'Day3_Date': price_df.loc[i + 2, 'Date'],
            'Day1_Close': day1_close,
            'Day3_Close': day3_close,
            'Return_2Day': return_2day,
            'Return_2Day_Percent': return_2day * 100.0
        })
    
    returns_df = pd.DataFrame(returns_data)
    print(f"Calculated {len(returns_df)} 2-day returns")
    
    return returns_df

def identify_positive_surprises(earnings_df):
    """
    Identify positive earnings surprises where:
    - actual EPS > estimated EPS OR
    - Surprise (%) > 0
    """
    print("Step 4: Identifying positive earnings surprises...")
    
    # Convert relevant columns to numeric, handling any non-numeric values
    earnings_df['Reported EPS'] = pd.to_numeric(earnings_df['Reported EPS'], errors='coerce').astype(float)
    earnings_df['EPS Estimate'] = pd.to_numeric(earnings_df['EPS Estimate'], errors='coerce').astype(float)
    earnings_df['Surprise (%)'] = pd.to_numeric(earnings_df['Surprise (%)'], errors='coerce').astype(float)
    
    # Identify positive surprises
    condition1 = earnings_df['Reported EPS'] > earnings_df['EPS Estimate']
    condition2 = earnings_df['Surprise (%)'] > 0
    
    positive_surprises = earnings_df[condition1 | condition2].copy()
    
    print(f"Found {len(positive_surprises)} positive earnings surprises out of {len(earnings_df)} total")
    
    return positive_surprises

def calculate_returns_following_earnings(earnings_df, price_df):
    """
    Calculate 2-day percentage changes FOLLOWING positive earnings surprises
    For each earnings date, find the 2-day return starting from that date:
    Return = Close(earnings_date + 2 days) / Close(earnings_date) - 1
    """
    print("Step 5: Calculating 2-day returns following earnings announcements...")
    
    # Sort price data by date
    price_df = price_df.sort_values('Date').reset_index(drop=True)
    
    matched_data = []
    
    for _, earnings_row in earnings_df.iterrows():
        earnings_date = earnings_row['Earnings Date']
        
        # Find the price on earnings date (or closest trading day)
        earnings_price_idx = None
        min_diff = float('inf')
        
        for i, price_date in enumerate(price_df['Date']):
            diff = abs((price_date - earnings_date).days)
            if diff < min_diff:
                min_diff = diff
                earnings_price_idx = i
        
        # Check if we found a reasonable match (within 3 days)
        if earnings_price_idx is not None and min_diff <= 3:
            # Find price 2 trading days later
            if earnings_price_idx + 2 < len(price_df):
                start_price = price_df.loc[earnings_price_idx, 'Close']
                end_price = price_df.loc[earnings_price_idx + 2, 'Close']
                start_date = price_df.loc[earnings_price_idx, 'Date']
                end_date = price_df.loc[earnings_price_idx + 2, 'Date']
                
                # Calculate 2-day return following earnings
                return_2day = (end_price / start_price) - 1
                return_2day_percent = return_2day * 100.0
                
                matched_data.append({
                    'Earnings_Date': earnings_date,
                    'Actual_EPS': earnings_row['Reported EPS'],
                    'Estimated_EPS': earnings_row['EPS Estimate'],
                    'Surprise_Percent': earnings_row['Surprise (%)'],
                    'Start_Date': start_date,
                    'End_Date': end_date,
                    'Start_Price': start_price,
                    'End_Price': end_price,
                    'Return_2Day_Percent': return_2day_percent
                })
    
    matched_df = pd.DataFrame(matched_data)
    print(f"Successfully calculated 2-day returns following {len(matched_df)} earnings announcements")
    
    return matched_df


def calculate_median_return(matched_df):
    """
    Calculate median 2-day percentage change following positive earnings surprises
    """
    print("Step 6: Calculating median 2-day percentage change...")
    
    if matched_df.empty:
        print("No matched data available for median calculation")
        return None
    
    returns = matched_df['Return_2Day_Percent'].values
    median_return = np.median(returns)
    
    print(f"\n=== RESULTS ===")
    print(f"Number of positive earnings surprises with matched returns: {len(returns)}")
    print(f"Median 2-day percentage change: {median_return:.2f}%")
    
    # Additional statistics
    print(f"\nAdditional Statistics:")
    print(f"Mean return: {np.mean(returns):.2f}%")
    print(f"Standard deviation: {np.std(returns):.2f}%")
    print(f"Min return: {np.min(returns):.2f}%")
    print(f"Max return: {np.max(returns):.2f}%")
    print(f"25th percentile: {np.percentile(returns, 25):.2f}%")
    print(f"75th percentile: {np.percentile(returns, 75):.2f}%")
    
    return median_return, matched_df


In [170]:

# Step 1: Load earnings data
earnings_df = load_earnings_data("data/ha1_Amazon.csv")

# Step 2: Download price data
price_df = download_price_data('AMZN')

# Step 3: Calculate 2-day returns
returns_df = calculate_2day_returns(price_df)

# Step 4: Identify positive surprises
positive_surprises_df = identify_positive_surprises(earnings_df)

# Step 5: Match earnings with returns
matched_df = match_earnings_with_returns(positive_surprises_df, returns_df)

# Step 6: Calculate median return
if not matched_df.empty:
    median_return, final_df = calculate_median_return(matched_df)
    
    # Display sample of the matched data
    print(f"\nSample of matched positive surprises:")
    print(final_df[['Earnings_Date', 'Actual_EPS', 'Estimated_EPS', 
                   'Surprise_Percent', 'Return_2Day_Percent']].head(10))
    
    print(median_return)
else:
    print("No matching data found!")



Step 1: Loading earnings data...
Loaded 117 earnings records
Columns: ['Symbol', 'Company', 'Earnings Date', 'EPS Estimate', 'Reported EPS', 'Surprise (%)']
Step 2: Downloading historical price data...
Downloaded 7056 price data points
Date range: 1997-05-15 00:00:00-04:00 to 2025-06-02 00:00:00-04:00
Step 3: Calculating 2-day percentage changes...
Calculated 7054 2-day returns
Step 4: Identifying positive earnings surprises...
Found 86 positive earnings surprises out of 117 total
Step 5: Matching earnings with returns...
Successfully matched 86 earnings announcements with price returns
Step 6: Calculating median 2-day percentage change...

=== RESULTS ===
Number of positive earnings surprises with matched returns: 86
Median 2-day percentage change: 0.10%

Additional Statistics:
Mean return: 0.95%
Standard deviation: 5.42%
Min return: -6.99%
Max return: 29.36%
25th percentile: -1.83%
75th percentile: 2.14%

Sample of matched positive surprises:
              Earnings_Date  Actual_EPS  

In [155]:
returns_df

Unnamed: 0,Day1_Date,Day2_Date,Day3_Date,Day1_Close,Day3_Close,Return_2Day,Return_2Day_Percent
0,1997-05-15 00:00:00-04:00,1997-05-16 00:00:00-04:00,1997-05-19 00:00:00-04:00,0.097917,0.085417,-0.127659,-12.765910
1,1997-05-16 00:00:00-04:00,1997-05-19 00:00:00-04:00,1997-05-20 00:00:00-04:00,0.086458,0.081771,-0.054211,-5.421125
2,1997-05-19 00:00:00-04:00,1997-05-20 00:00:00-04:00,1997-05-21 00:00:00-04:00,0.085417,0.071354,-0.164639,-16.463936
3,1997-05-20 00:00:00-04:00,1997-05-21 00:00:00-04:00,1997-05-22 00:00:00-04:00,0.081771,0.069792,-0.146494,-14.649446
4,1997-05-21 00:00:00-04:00,1997-05-22 00:00:00-04:00,1997-05-23 00:00:00-04:00,0.071354,0.075000,0.051097,5.109736
...,...,...,...,...,...,...,...
7049,2025-05-22 00:00:00-04:00,2025-05-23 00:00:00-04:00,2025-05-27 00:00:00-04:00,203.100006,206.020004,0.014377,1.437714
7050,2025-05-23 00:00:00-04:00,2025-05-27 00:00:00-04:00,2025-05-28 00:00:00-04:00,200.990005,204.720001,0.018558,1.855812
7051,2025-05-27 00:00:00-04:00,2025-05-28 00:00:00-04:00,2025-05-29 00:00:00-04:00,206.020004,205.699997,-0.001553,-0.155328
7052,2025-05-28 00:00:00-04:00,2025-05-29 00:00:00-04:00,2025-05-30 00:00:00-04:00,204.720001,205.009995,0.001417,0.141654


In [156]:
price_df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1997-05-15 00:00:00-04:00,0.121875,0.125000,0.096354,0.097917,1443120000,0.0,0.0
1,1997-05-16 00:00:00-04:00,0.098438,0.098958,0.085417,0.086458,294000000,0.0,0.0
2,1997-05-19 00:00:00-04:00,0.088021,0.088542,0.081250,0.085417,122136000,0.0,0.0
3,1997-05-20 00:00:00-04:00,0.086458,0.087500,0.081771,0.081771,109344000,0.0,0.0
4,1997-05-21 00:00:00-04:00,0.081771,0.082292,0.068750,0.071354,377064000,0.0,0.0
...,...,...,...,...,...,...,...,...
7051,2025-05-27 00:00:00-04:00,203.089996,206.690002,202.190002,206.020004,34892000,0.0,0.0
7052,2025-05-28 00:00:00-04:00,205.919998,207.660004,204.410004,204.720001,28549800,0.0,0.0
7053,2025-05-29 00:00:00-04:00,208.029999,208.809998,204.229996,205.699997,34650000,0.0,0.0
7054,2025-05-30 00:00:00-04:00,204.839996,205.990005,201.699997,205.009995,51649600,0.0,0.0


In [157]:
earnings_df

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise (%)
0,AMZN,Amazon.com Inc,2026-04-29 06:00:00-04:00,,,
1,AMZN,Amazon.com Inc,2026-02-04 16:00:00-05:00,,,
2,AMZN,Amazon.com Inc,2025-10-29 06:00:00-04:00,,,
3,AMZN,Amazon.com Inc,2025-07-30 16:00:00-04:00,,,
4,AMZN,"Amazon.com, Inc.",2025-05-01 16:00:00-04:00,,,16.74
...,...,...,...,...,...,...
112,AMZN,"Amazon.com, Inc.",1998-04-27 00:00:00-04:00,,,13.92
113,AMZN,"Amazon.com, Inc.",1998-01-22 00:00:00-05:00,,,11.41
114,AMZN,"Amazon.com, Inc.",1997-10-27 00:00:00-05:00,,,13.29
115,AMZN,"Amazon.com, Inc.",1997-07-10 00:00:00-04:00,,,13.33


In [158]:
positive_surprises_df

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise (%)
4,AMZN,"Amazon.com, Inc.",2025-05-01 16:00:00-04:00,,,16.74
5,AMZN,"Amazon.com, Inc.",2025-02-06 16:00:00-05:00,,,24.47
6,AMZN,"Amazon.com, Inc.",2024-10-31 16:00:00-04:00,,,25.17
7,AMZN,"Amazon.com, Inc.",2024-08-01 16:00:00-04:00,,,22.58
8,AMZN,"Amazon.com, Inc.",2024-04-30 16:00:00-04:00,0.83,0.98,17.91
...,...,...,...,...,...,...
111,AMZN,"Amazon.com, Inc.",1998-07-22 00:00:00-04:00,,,1.34
112,AMZN,"Amazon.com, Inc.",1998-04-27 00:00:00-04:00,,,13.92
113,AMZN,"Amazon.com, Inc.",1998-01-22 00:00:00-05:00,,,11.41
114,AMZN,"Amazon.com, Inc.",1997-10-27 00:00:00-05:00,,,13.29


In [161]:
matched_df

Unnamed: 0,Earnings_Date,Actual_EPS,Estimated_EPS,Surprise_Percent,Day1_Date,Day2_Date,Day3_Date,Return_2Day_Percent
0,2025-05-01 16:00:00-04:00,,,16.74,2025-05-01 00:00:00-04:00,2025-05-02 00:00:00-04:00,2025-05-05 00:00:00-04:00,-2.024180
1,2025-02-06 16:00:00-05:00,,,24.47,2025-02-06 00:00:00-05:00,2025-02-07 00:00:00-05:00,2025-02-10 00:00:00-05:00,-2.382449
2,2024-10-31 16:00:00-04:00,,,25.17,2024-10-31 00:00:00-04:00,2024-11-01 00:00:00-04:00,2024-11-04 00:00:00-05:00,5.032192
3,2024-08-01 16:00:00-04:00,,,22.58,2024-08-01 00:00:00-04:00,2024-08-02 00:00:00-04:00,2024-08-05 00:00:00-04:00,-12.522411
4,2024-04-30 16:00:00-04:00,0.98,0.83,17.91,2024-04-30 00:00:00-04:00,2024-05-01 00:00:00-04:00,2024-05-02 00:00:00-04:00,5.554286
...,...,...,...,...,...,...,...,...
81,1998-07-22 00:00:00-04:00,,,1.34,1998-07-21 00:00:00-04:00,1998-07-22 00:00:00-04:00,1998-07-23 00:00:00-04:00,-3.360132
82,1998-04-27 00:00:00-04:00,,,13.92,1998-04-24 00:00:00-04:00,1998-04-27 00:00:00-04:00,1998-04-28 00:00:00-04:00,12.665772
83,1998-01-22 00:00:00-05:00,,,11.41,1998-01-21 00:00:00-05:00,1998-01-22 00:00:00-05:00,1998-01-23 00:00:00-05:00,-2.669462
84,1997-10-27 00:00:00-05:00,,,13.29,1997-10-24 00:00:00-04:00,1997-10-27 00:00:00-05:00,1997-10-28 00:00:00-05:00,-1.554304


In [None]:
import pandas as pd
import yfinance as yf
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

def load_earnings_data(filename):
    """
    Load earnings data from CSV file with semicolon delimiter
    """
    print("Step 1: Loading earnings data...")
    try:
        # Load with semicolon delimiter as specified
        earnings_df = pd.read_csv(filename, delimiter=';')
        print(f"Loaded {len(earnings_df)} earnings records")
        print(f"Columns: {list(earnings_df.columns)}")
        
        # Convert Date column to datetime
        earnings_df['Date'] = pd.to_datetime(earnings_df['Date'])
        
        return earnings_df
    except Exception as e:
        print(f"Error loading earnings data: {e}")
        return None

def download_price_data(symbol='AMZN', start_date='2010-01-01'):
    """
    Download complete historical price data using yfinance
    """
    print("Step 2: Downloading historical price data...")
    try:
        # Download data from start_date to current date
        ticker = yf.Ticker(symbol)
        price_df = ticker.history(start=start_date, end=datetime.now().strftime('%Y-%m-%d'))
        
        # Reset index to make Date a column
        price_df = price_df.reset_index()
        
        print(f"Downloaded {len(price_df)} price data points")
        print(f"Date range: {price_df['Date'].min()} to {price_df['Date'].max()}")
        
        return price_df
    except Exception as e:
        print(f"Error downloading price data: {e}")
        return None

def calculate_2day_returns(price_df):
    """
    Calculate 2-day percentage changes for all historical dates
    For each sequence of 3 consecutive trading days (Day 1, Day 2, Day 3),
    compute return as Close_Day3 / Close_Day1 - 1
    This creates a reference dataset of all possible 2-day returns
    """
    print("Step 3: Calculating 2-day percentage changes for all dates...")
    
    # Sort by date to ensure proper order
    price_df = price_df.sort_values('Date').reset_index(drop=True)
    
    returns_data = []
    
    # Calculate 2-day returns for each sequence of 3 consecutive days
    for i in range(len(price_df) - 2):
        day1_close = price_df.loc[i, 'Close']
        day3_close = price_df.loc[i + 2, 'Close']
        
        # Calculate return as (Close_Day3 / Close_Day1) - 1
        return_2day = (day3_close / day1_close) - 1
        
        returns_data.append({
            'Day1_Date': price_df.loc[i, 'Date'],
            'Day2_Date': price_df.loc[i + 1, 'Date'],
            'Day3_Date': price_df.loc[i + 2, 'Date'],
            'Day1_Close': day1_close,
            'Day3_Close': day3_close,
            'Return_2Day': return_2day,
            'Return_2Day_Percent': return_2day * 100.0
        })
    
    returns_df = pd.DataFrame(returns_data)
    print(f"Calculated {len(returns_df)} 2-day returns for reference")
    
    return returns_df

def identify_positive_surprises(earnings_df):
    """
    Identify positive earnings surprises where:
    - actual EPS > estimated EPS OR
    - Surprise (%) > 0
    """
    print("Step 4: Identifying positive earnings surprises...")
    
    # Convert relevant columns to numeric, handling any non-numeric values
    earnings_df['Actual EPS'] = pd.to_numeric(earnings_df['Actual EPS'], errors='coerce')
    earnings_df['Estimated EPS'] = pd.to_numeric(earnings_df['Estimated EPS'], errors='coerce')
    earnings_df['Surprise (%)'] = pd.to_numeric(earnings_df['Surprise (%)'], errors='coerce')
    
    # Identify positive surprises
    condition1 = earnings_df['Actual EPS'] > earnings_df['Estimated EPS']
    condition2 = earnings_df['Surprise (%)'] > 0
    
    positive_surprises = earnings_df[condition1 | condition2].copy()
    
    print(f"Found {len(positive_surprises)} positive earnings surprises out of {len(earnings_df)} total")
    
    return positive_surprises

def calculate_returns_following_earnings(earnings_df, price_df):
    """
    Calculate 2-day percentage changes FOLLOWING positive earnings surprises
    For each earnings date, find the 2-day return starting from that date:
    Return = Close(earnings_date + 2 days) / Close(earnings_date) - 1
    """
    print("Step 5: Calculating 2-day returns following earnings announcements...")
    
    # Sort price data by date
    price_df = price_df.sort_values('Date').reset_index(drop=True)
    
    matched_data = []
    
    for _, earnings_row in earnings_df.iterrows():
        earnings_date = earnings_row['Date']
        
        # Find the price on earnings date (or closest trading day)
        earnings_price_idx = None
        min_diff = float('inf')
        
        for i, price_date in enumerate(price_df['Date']):
            diff = abs((price_date - earnings_date).days)
            if diff < min_diff:
                min_diff = diff
                earnings_price_idx = i
        
        # Check if we found a reasonable match (within 3 days)
        if earnings_price_idx is not None and min_diff <= 3:
            # Find price 2 trading days later
            if earnings_price_idx + 2 < len(price_df):
                start_price = price_df.loc[earnings_price_idx, 'Close']
                end_price = price_df.loc[earnings_price_idx + 2, 'Close']
                start_date = price_df.loc[earnings_price_idx, 'Date']
                end_date = price_df.loc[earnings_price_idx + 2, 'Date']
                
                # Calculate 2-day return following earnings
                return_2day = (end_price / start_price) - 1
                return_2day_percent = return_2day * 100.0
                
                matched_data.append({
                    'Earnings_Date': earnings_date,
                    'Actual_EPS': earnings_row['Actual EPS'],
                    'Estimated_EPS': earnings_row['Estimated EPS'],
                    'Surprise_Percent': earnings_row['Surprise (%)'],
                    'Start_Date': start_date,
                    'End_Date': end_date,
                    'Start_Price': start_price,
                    'End_Price': end_price,
                    'Return_2Day_Percent': return_2day_percent
                })
    
    matched_df = pd.DataFrame(matched_data)
    print(f"Successfully calculated 2-day returns following {len(matched_df)} earnings announcements")
    
    return matched_df

def calculate_median_return(matched_df):
    """
    Calculate median 2-day percentage change following positive earnings surprises
    """
    print("Step 6: Calculating median 2-day percentage change...")
    
    if matched_df.empty:
        print("No matched data available for median calculation")
        return None
    
    returns = matched_df['Return_2Day_Percent'].values
    median_return = np.median(returns)
    
    print(f"\n=== RESULTS ===")
    print(f"Number of positive earnings surprises with matched returns: {len(returns)}")
    print(f"Median 2-day percentage change: {median_return:.2f}%")
    
    # Additional statistics
    print(f"\nAdditional Statistics:")
    print(f"Mean return: {np.mean(returns):.2f}%")
    print(f"Standard deviation: {np.std(returns):.2f}%")
    print(f"Min return: {np.min(returns):.2f}%")
    print(f"Max return: {np.max(returns):.2f}%")
    print(f"25th percentile: {np.percentile(returns, 25):.2f}%")
    print(f"75th percentile: {np.percentile(returns, 75):.2f}%")
    
    return median_return, matched_df

def main():
    """
    Main function to execute the complete analysis
    """
    print("=== Earnings Surprise Analysis ===\n")
    
    # Step 1: Load earnings data
    earnings_df = load_earnings_data("ha1_Amazon.csv")
    if earnings_df is None:
        return
    
    # Step 2: Download price data
    price_df = download_price_data('AMZN')
    if price_df is None:
        return
    
    # Step 3: Calculate 2-day returns (for reference)
    returns_df = calculate_2day_returns(price_df)
    
    # Step 4: Identify positive surprises
    positive_surprises_df = identify_positive_surprises(earnings_df)
    
    # Step 5: Calculate 2-day returns FOLLOWING positive earnings surprises
    matched_df = calculate_returns_following_earnings(positive_surprises_df, price_df)
    
    # Step 6: Calculate median return
    if not matched_df.empty:
        median_return, final_df = calculate_median_return(matched_df)
        
        # Display sample of the matched data
        print(f"\nSample of positive surprises with following 2-day returns:")
        print(final_df[['Earnings_Date', 'Actual_EPS', 'Estimated_EPS', 
                       'Surprise_Percent', 'Start_Date', 'End_Date', 'Return_2Day_Percent']].head(10))
        
        return median_return, final_df
    else:
        print("No matching data found!")
        return None, None

# Execute the analysis
if __name__ == "__main__":
    # Install required packages if not already installed
    # pip install pandas yfinance numpy
    
    median_return, results_df = main()
    
    if median_return is not None:
        print(f"\n🎯 FINAL ANSWER: {median_return:.2f}%")
    else:
        print("\n❌ Analysis failed - please check your data file and try again.")