In [3]:
import pandas as pd

# Get the S&P 500 company table from Wikipedia. pd.read_html returns every
# table found on the page as a list of DataFrames; the first one is used here.
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
tables = pd.read_html(url)
df = tables[0]

# Only proceed when the expected 'Date added' column is present.
if 'Date added' in df.columns:
    # Dates that cannot be parsed become NaT instead of raising.
    df['Date added'] = pd.to_datetime(df['Date added'], errors='coerce')
    # Nullable integer dtype keeps the years as integers (2017, not 2017.0)
    # even when some dates were coerced to NaT.
    df['Year added'] = df['Date added'].dt.year.astype('Int64')

    # value_counts() skips missing years; 1957 is excluded from the ranking.
    year_counts = df['Year added'].value_counts().drop(1957, errors='ignore')

    if year_counts.empty:
        # idxmax() raises on an empty Series, so report the situation instead.
        print("No usable 'Date added' values found.")
    else:
        # idxmax() returns the first label holding the maximum. Ordering the
        # counts from the most recent year down makes ties deterministic:
        # they resolve to the latest year.
        most_additions_year = year_counts.sort_index(ascending=False).idxmax()
        print("Year with most additions:", most_additions_year)
else:
    print("No 'Date added' column found.")


Year with most additions: 2017


In [5]:
!pip install yfinance

Collecting yfinance
  Downloading yfinance-0.2.61-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-py312-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.1.tar.gz (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting curl_cffi>=0.7 (from yfinance)
  Downloading curl_cffi-0.11.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting protobuf>=3.19.0 (from yfinance)
  Downloading protobuf-6.31.0-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting websockets>=13.0 (f

In [8]:
import yfinance as yf

# Yahoo Finance ticker of the stock index used for each market.
tickers = {
    'US': '^GSPC', 'China': '000001.SS', 'Hong Kong': '^HSI', 'Australia': '^AXJO',
    'India': '^NSEI', 'Canada': '^GSPTSE', 'Germany': '^GDAXI', 'UK': '^FTSE',
    'Japan': '^N225', 'Mexico': '^MXX', 'Brazil': '^BVSP'
}

# Date window passed to yf.download for every index.
PERIOD_START = '2025-01-01'
PERIOD_END = '2025-05-01'

# country -> simple return between the first and last available close.
returns = {}

for country, ticker in tickers.items():
    prices = yf.download(ticker, start=PERIOD_START, end=PERIOD_END, progress=False, auto_adjust=True)

    if prices.empty or 'Close' not in prices.columns:
        print(f"{country}: Data not available")
        continue

    close = prices['Close']
    # When the downloaded frame has two-level columns, prices['Close'] is a
    # one-column DataFrame and .iloc[0] would be a one-element Series; calling
    # float() on that is deprecated in pandas. Reduce to a plain Series first
    # so .iloc[...] yields real scalars.
    if close.ndim == 2:
        close = close.iloc[:, 0]
    # Ignore days without a close so the endpoints are actual prices.
    close = close.dropna()

    if len(close) < 2:
        print(f"{country}: Data not available")
        continue

    try:
        start_price = float(close.iloc[0])
        end_price = float(close.iloc[-1])
        returns[country] = (end_price - start_price) / start_price
    except (TypeError, ValueError, ZeroDivisionError) as e:
        print(f"{country}: Error computing return: {e}")

# Compare against the US only when its return could be computed.
us_return = returns.get('US')
if us_return is not None:
    better_than_us = [name for name, ret in returns.items() if name != 'US' and ret > us_return]

    print("\nS&P 500 YTD return:", round(us_return * 100, 2), "%")
    print("Indexes that performed better:", better_than_us)
    print("Count:", len(better_than_us))
else:
    print("Could not retrieve US return.")



S&P 500 YTD return: -5.1 %
Indexes that performed better: ['China', 'Hong Kong', 'Australia', 'India', 'Canada', 'Germany', 'UK', 'Mexico', 'Brazil']
Count: 9


  start_price = float(df['Close'].iloc[0])
  end_price = float(df['Close'].iloc[-1])
  start_price = float(df['Close'].iloc[0])
  end_price = float(df['Close'].iloc[-1])
  start_price = float(df['Close'].iloc[0])
  end_price = float(df['Close'].iloc[-1])
  start_price = float(df['Close'].iloc[0])
  end_price = float(df['Close'].iloc[-1])
  start_price = float(df['Close'].iloc[0])
  end_price = float(df['Close'].iloc[-1])
  start_price = float(df['Close'].iloc[0])
  end_price = float(df['Close'].iloc[-1])
  start_price = float(df['Close'].iloc[0])
  end_price = float(df['Close'].iloc[-1])
  start_price = float(df['Close'].iloc[0])
  end_price = float(df['Close'].iloc[-1])
  start_price = float(df['Close'].iloc[0])
  end_price = float(df['Close'].iloc[-1])
  start_price = float(df['Close'].iloc[0])
  end_price = float(df['Close'].iloc[-1])
  start_price = float(df['Close'].iloc[0])
  end_price = float(df['Close'].iloc[-1])


In [8]:
# Download the S&P 500 history from Stooq as CSV and cache it on disk.
import os

url = 'https://stooq.com/q/d/l/?s=^spx&i=d'
df = pd.read_csv(url)

# The analysis cell that reads this cache needs 'Date' and 'Close'. Refuse to
# overwrite the cache with a response that does not contain them.
missing = {'Date', 'Close'} - set(df.columns)
if missing:
    raise ValueError(f"Unexpected response from {url}: missing columns {sorted(missing)}")

# to_csv does not create parent directories, so make sure 'data/' exists.
os.makedirs('data', exist_ok=True)
df.to_csv('data/sp500_stooq.csv', index=False)


In [10]:
import pandas as pd

# Load the cached S&P 500 history and put it in chronological order.
df = pd.read_csv("data/sp500_stooq.csv", parse_dates=["Date"])
df = df.sort_values("Date")

# Keep only 'Date' and 'Close'. Coerce 'Close' to numeric *before* dropping
# missing rows so values that fail to parse are removed as well, then rebuild
# a gap-free 0..n-1 index: the loop below looks rows up with .loc[i] and
# .loc[i - 1], which only lines up with range(len(df)) on such an index.
df = df[["Date", "Close"]].copy()
df["Close"] = pd.to_numeric(df["Close"], errors="coerce")
df = df.dropna().reset_index(drop=True)

# Running all-time high (ATH) and drawdown relative to it
# (0 at a new high, negative below it).
df["ATH"] = df["Close"].cummax()
df["Drawdown"] = (df["Close"] - df["ATH"]) / df["ATH"]

# A correction opens once the close is more than this fraction below the ATH.
CORRECTION_THRESHOLD = 0.05

# Duration in calendar days of every completed correction.
corrections = []
in_correction = False
start_date = None

for i in range(1, len(df)):
    if not in_correction and df.loc[i, "Drawdown"] < -CORRECTION_THRESHOLD:
        in_correction = True
        # NOTE(review): the start is the row just before the threshold is
        # crossed, not the date of the all-time high itself - confirm this is
        # the intended definition of a correction's start.
        start_date = df.loc[i - 1, "Date"]
    elif in_correction and df.loc[i, "Close"] >= df.loc[i - 1, "ATH"]:
        # The close is back at (or above) the previous all-time high.
        end_date = df.loc[i, "Date"]
        duration = (end_date - start_date).days
        corrections.append(duration)
        in_correction = False
# A correction still open on the last row is never appended to the list.

# Analyze durations. An explicit float dtype keeps the Series numeric even
# when no correction was found, so the statistics below print nan.
durations = pd.Series(corrections, dtype="float64")
print(" Correction durations (in days):")
print("25th percentile:", durations.quantile(0.25))
print("Median:", durations.median())
print("75th percentile:", durations.quantile(0.75))


 Correction durations (in days):
25th percentile: 42.75
Median: 112.5
75th percentile: 537.0


In [14]:
import pandas as pd
import yfinance as yf

# Load the earnings history and rename the columns used below.
df = pd.read_csv("data/ha1_Amazon.csv", sep=";").rename(columns={
    "Earnings Date": "Date",
    "EPS Estimate": "Estimated EPS",
    "Reported EPS": "Actual EPS"
})

# Drop the trailing " at <time> ..." part so only the calendar date is parsed;
# values that still cannot be parsed become NaT.
df["Date"] = pd.to_datetime(df["Date"].str.replace(r"\s+at.*", "", regex=True), errors="coerce")

# Compare EPS as numbers. If a column was read as text, comparing it directly
# would order values lexicographically; non-numeric entries become NaN and
# therefore never count as a positive surprise.
for eps_col in ("Actual EPS", "Estimated EPS"):
    if not pd.api.types.is_numeric_dtype(df[eps_col]):
        # assumes a comma, if present, is a decimal separator - TODO confirm against the CSV
        as_text = df[eps_col].astype(str).str.replace(",", ".", regex=False)
        df[eps_col] = pd.to_numeric(as_text, errors="coerce")
df["Surprise"] = df["Actual EPS"] > df["Estimated EPS"]
df = df.reset_index(drop=True)

# Download Amazon stock data; auto_adjust is explicit, matching the index
# download elsewhere in this notebook.
amzn = yf.download("AMZN", start="2010-01-01", end="2025-05-02", progress=False, auto_adjust=True)

# Flatten two-level columns first so amzn["Close"] is a plain Series for the
# return calculation.
if isinstance(amzn.columns, pd.MultiIndex):
    amzn.columns = amzn.columns.get_level_values(0)

# Return from each day's close to the close two rows later.
amzn["2d_return"] = amzn["Close"].shift(-2) / amzn["Close"] - 1
amzn = amzn.reset_index()

# Keep only earnings dates that have a matching price row.
merged = pd.merge(df, amzn, on="Date", how="inner")

# Calculate medians
positive_median = merged.loc[merged["Surprise"], "2d_return"].median()
all_median = amzn["2d_return"].median()

print(f"Median 2-day return after positive earnings surprises: {positive_median:.4f}")
print(f" Median 2-day return across all days: {all_median:.4f}")


Median 2-day return after positive earnings surprises: 0.0201
 Median 2-day return across all days: 0.0022
