In [9]:
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 120)
START = "2022-01-01"
END = "2025-12-18"

In [12]:
pip install yfinance

Note: you may need to restart the kernel to use updated packages.


  DEPRECATION: Building 'multitasking' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'multitasking'. Discussion can be found at https://github.com/pypa/pip/issues/6334


Collecting yfinance
  Downloading yfinance-1.0-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.3.tar.gz (3.0 MB)
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     ------ --------------------------------- 0.5/3.0 MB 3.3 MB/s eta 0:00:01
     ----------------- ---------------------- 1.3/3.0 MB 3.6 MB/s eta 0:00:01
     --------------------------- ------------ 2.1/3.0 MB 3.6 MB/s eta 0:00:01
     -------------------------------------- - 2.9/3.0 MB 3.5 MB/s eta 0:00:01
     ---------------------------------------- 3.0/3.0 MB 3.3 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requiremen

In [13]:
# --- Yahoo Finance via yfinance: US tickers (real market yh_df) ---
tickers = ["SPY", "QQQ", "TLT", "GLD", "EEM"]
# SPY : S&P 500 index
# QQQ : Nasdaq-100 index
# TLT : U.S. Treasury bonds with 20+ year maturity
# GLD : Physical gold prices
# EEM : MSCI Emerging Markets index

try:
    import yfinance as yf
except Exception as e:
    yf = None
    print("Could not import yfinance:", type(e).__name__, str(e))

if yf is not None:
    try:
        yh_df = yf.download(tickers, start=START, end=END, auto_adjust=True, progress=False)
    except Exception as e:
        yh_df = pd.DataFrame()
        print("yfinance download failed:", type(e).__name__, str(e))
else:
    yh_df = pd.DataFrame()

# Convert to long format: date, ticker, close, volume
if isinstance(yh_df, pd.DataFrame) and yh_df.shape[0] > 0:
    if isinstance(yh_df.columns, pd.MultiIndex):
        close = yh_df["Close"].copy()
        vol = yh_df["Volume"].copy()
    else:
        close = yh_df[["Close"]].rename(columns={"Close": tickers[0]})
        vol = yh_df[["Volume"]].rename(columns={"Volume": tickers[0]})

    close.index.name = "date"
    vol.index.name = "date"

    us_close_long = close.reset_index().melt(id_vars="date", var_name="ticker", value_name="close")
    us_vol_long = vol.reset_index().melt(id_vars="date", var_name="ticker", value_name="volume")
    us_mkt = us_close_long.merge(us_vol_long, on=["date","ticker"], how="inner").dropna(subset=["close"])
else:
    us_mkt = pd.DataFrame(columns=["date","ticker","close","volume"])

us_mkt.head(), us_mkt.shape

(        date ticker      close    volume
 0 2022-01-03    EEM  44.624969  27572700
 1 2022-01-04    EEM  44.470776  24579500
 2 2022-01-05    EEM  43.745167  46425100
 3 2022-01-06    EEM  43.944710  34288700
 4 2022-01-07    EEM  44.343792  32640900,
 (4970, 4))

In [49]:
#3.2.4
spy_series = (
    us_mkt[us_mkt["ticker"] == "SPY"]
    .set_index("date")["close"]
    .rename("SPY_close")
)
spy_series

date
2022-01-03    451.875153
2022-01-04    451.723785
2022-01-05    443.049713
2022-01-06    442.633514
2022-01-07    440.883545
                 ...    
2025-12-11    687.139526
2025-12-12    679.751404
2025-12-15    678.724426
2025-12-16    676.869934
2025-12-17    669.421936
Name: SPY_close, Length: 994, dtype: float64

In [44]:
mid_rate_series = spy_series.rolling(5).mean().rename("mid_rate")
mid_rate_series

date
2022-01-03           NaN
2022-01-04           NaN
2022-01-05           NaN
2022-01-06           NaN
2022-01-07    446.033142
                 ...    
2025-12-11    683.799426
2025-12-12    683.015747
2025-12-15    682.437451
2025-12-16    681.605920
2025-12-17    678.381445
Name: mid_rate, Length: 994, dtype: float64

In [45]:
aligned_df = pd.DataFrame({
    "mid_rate": mid_rate_series,
    "SPY_close": spy_series
})

aligned_df.head()

Unnamed: 0_level_0,mid_rate,SPY_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-03,,451.875153
2022-01-04,,451.723785
2022-01-05,,443.049713
2022-01-06,,442.633514
2022-01-07,446.033142,440.883545


In [50]:
#3.3.6
us_mkt_nan = us_mkt.copy()
us_mkt_nan

Unnamed: 0,date,ticker,close,volume
0,2022-01-03,EEM,44.624969,27572700
1,2022-01-04,EEM,44.470776,24579500
2,2022-01-05,EEM,43.745167,46425100
3,2022-01-06,EEM,43.944710,34288700
4,2022-01-07,EEM,44.343792,32640900
...,...,...,...,...
4965,2025-12-11,TLT,87.848114,26778700
4966,2025-12-12,TLT,87.001404,47030100
4967,2025-12-15,TLT,87.061180,28611800
4968,2025-12-16,TLT,87.539314,41018700


In [30]:
np.random.seed(42)

In [32]:
nan_idx = us_mkt_nan.sample(frac=0.01).index
us_mkt_nan.loc[nan_idx, "close"] = np.nan
nan_idx

Index([1558, 3057, 1249, 2891,   35, 1931,  388, 1568, 2515, 4572, 1682, 3299,
       4520, 1512, 3858, 2928, 4606, 2029, 2903, 2909, 3582,  671, 3781, 2328,
       3011, 1879,  483, 3171, 2522, 1100, 4909, 2124, 4307,  482, 1088,  107,
         39,  570, 3374, 1120, 3205,  150, 1446, 4906, 3772, 2285, 1155, 1084,
       2519, 1147],
      dtype='int64')

In [33]:
us_drop = us_mkt_nan.dropna(subset=["close"])
us_drop

Unnamed: 0,date,ticker,close,volume
0,2022-01-03,EEM,44.624969,27572700
1,2022-01-04,EEM,44.470776,24579500
2,2022-01-05,EEM,43.745167,46425100
3,2022-01-06,EEM,43.944710,34288700
4,2022-01-07,EEM,44.343792,32640900
...,...,...,...,...
4965,2025-12-11,TLT,87.848114,26778700
4966,2025-12-12,TLT,87.001404,47030100
4967,2025-12-15,TLT,87.061180,28611800
4968,2025-12-16,TLT,87.539314,41018700


In [34]:
us_fill = us_mkt_nan.copy()
median_by_ticker = us_fill.groupby("ticker")["close"].transform("median")
us_fill["close"] = us_fill["close"].fillna(median_by_ticker)

us_mkt.shape, us_drop.shape, us_fill.shape

((4970, 4), (4870, 4), (4970, 4))