In [16]:
import yfinance as yf
import pandas as pd

tickers = ["AAPL", "GOOGL", "NVDA"]

start_date = "2005-01-01"
end_date = "2025-12-31"

# Download all tickers at once
data = yf.download(
    tickers,
    start=start_date,
    end=end_date,
    group_by="ticker",
    auto_adjust=False,
    progress=True
)

# Reshape into clean long format
frames = []

for ticker in tickers:
    df = data[ticker].copy()
    df = df.reset_index()
    df["ticker"] = ticker
    frames.append(df[["ticker", "Date", "Open", "High", "Low", "Close"]])

final_df = pd.concat(frames)

# Rename columns
final_df.columns = ["ticker", "date", "open", "high", "low", "close"]

# Sort nicely
final_df = final_df.sort_values(["ticker", "date"])

print(final_df.head())


[*********************100%***********************]  3 of 3 completed

  ticker       date      open      high       low     close
0   AAPL 2005-01-03  1.156786  1.162679  1.117857  1.130179
1   AAPL 2005-01-04  1.139107  1.169107  1.124464  1.141786
2   AAPL 2005-01-05  1.151071  1.165179  1.143750  1.151786
3   AAPL 2005-01-06  1.154821  1.159107  1.130893  1.152679
4   AAPL 2005-01-07  1.160714  1.243393  1.156250  1.236607





In [17]:
print(final_df["date"].max())
print(final_df["date"].min())
print(final_df.tail())
print(final_df.describe)

2025-12-30 00:00:00
2005-01-03 00:00:00
     ticker       date        open        high         low       close
5277   NVDA 2025-12-23  182.970001  189.330002  182.899994  189.210007
5278   NVDA 2025-12-24  187.940002  188.910004  186.589996  188.610001
5279   NVDA 2025-12-26  189.919998  192.690002  188.000000  190.529999
5280   NVDA 2025-12-29  187.710007  188.759995  185.910004  188.220001
5281   NVDA 2025-12-30  188.240005  188.990005  186.929993  187.539993
<bound method NDFrame.describe of      ticker       date        open        high         low       close
0      AAPL 2005-01-03    1.156786    1.162679    1.117857    1.130179
1      AAPL 2005-01-04    1.139107    1.169107    1.124464    1.141786
2      AAPL 2005-01-05    1.151071    1.165179    1.143750    1.151786
3      AAPL 2005-01-06    1.154821    1.159107    1.130893    1.152679
4      AAPL 2005-01-07    1.160714    1.243393    1.156250    1.236607
...     ...        ...         ...         ...         ...         ...
527

In [18]:
import numpy as np

# Method 1: Use transform instead of apply
final_df["daily_return"] = (
    final_df.groupby("ticker")["close"]
    .transform(lambda x: np.log(x / x.shift(1)))
)

# Alternative Method 2: Use pct_change with log
# final_df["daily_return"] = np.log(final_df.groupby("ticker")["close"].pct_change() + 1)

In [19]:
print(final_df.tail())

     ticker       date        open        high         low       close  \
5277   NVDA 2025-12-23  182.970001  189.330002  182.899994  189.210007   
5278   NVDA 2025-12-24  187.940002  188.910004  186.589996  188.610001   
5279   NVDA 2025-12-26  189.919998  192.690002  188.000000  190.529999   
5280   NVDA 2025-12-29  187.710007  188.759995  185.910004  188.220001   
5281   NVDA 2025-12-30  188.240005  188.990005  186.929993  187.539993   

      daily_return  
5277      0.029608  
5278     -0.003176  
5279      0.010128  
5280     -0.012198  
5281     -0.003619  


In [30]:
eps_df = pd.read_csv("av_eps_quarterly.csv")
eps_df.rename(columns={"reportedDate": "earnings_date"}, inplace=True)
eps = eps_df[["symbol","earnings_date"]]
eps

Unnamed: 0,symbol,earnings_date
0,AAPL,30/10/2025
1,AAPL,31/07/2025
2,AAPL,01/05/2025
3,AAPL,30/01/2025
4,AAPL,31/10/2024
...,...,...
245,NVDA,16/02/2006
246,NVDA,09/11/2005
247,NVDA,11/08/2005
248,NVDA,12/05/2005


In [31]:
final_df["date"] = pd.to_datetime(final_df["date"])
eps.loc[:, "earnings_date"] = pd.to_datetime(eps["earnings_date"], dayfirst=True)
print(eps)

    symbol        earnings_date
0     AAPL  2025-10-30 00:00:00
1     AAPL  2025-07-31 00:00:00
2     AAPL  2025-05-01 00:00:00
3     AAPL  2025-01-30 00:00:00
4     AAPL  2024-10-31 00:00:00
..     ...                  ...
245   NVDA  2006-02-16 00:00:00
246   NVDA  2005-11-09 00:00:00
247   NVDA  2005-08-11 00:00:00
248   NVDA  2005-05-12 00:00:00
249   NVDA  2005-02-17 00:00:00

[250 rows x 2 columns]


In [32]:
# Convert both date columns to datetime format before merging
final_df['date'] = pd.to_datetime(final_df['date'])
eps['earnings_date'] = pd.to_datetime(eps['earnings_date'])

# Now perform the merge with both columns as datetime type
final_df = final_df.merge(
    eps,
    left_on=["ticker", "date"],
    right_on=["symbol", "earnings_date"],
    how="left"
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eps['earnings_date'] = pd.to_datetime(eps['earnings_date'])


In [33]:
final_df.to_csv("test.csv")

In [34]:
final_df["earnings_date"].notna().sum()

250