In [2]:
from pathlib import Path
import pandas as pd

PRICE_DIR = Path("price/raw")

In [None]:
price_dfs = []

for f in PRICE_DIR.iterdir():
    if f.suffix != ".csv":
        continue
    
    ticker = f.stem.upper()
    df = pd.read_csv(f)
    df["ticker"] = ticker
    price_dfs.append(df)

price_raw = pd.concat(price_dfs, ignore_index=True)
price_raw.head()

In [4]:
# standardize column names
price_raw.columns = (
    price_raw.columns.str.lower().str.replace(" ", "_"))

# parse dates and sort
price_raw["date"] = pd.to_datetime(price_raw["date"])
price_raw = price_raw.sort_values(["ticker", "date"])

# daily & multi-day returns (targets)
price_raw["daily_return"] = price_raw.groupby("ticker")["adj_close"].pct_change()

price_raw["ret_1d"] = price_raw.groupby("ticker")["adj_close"].pct_change().shift(-1)
price_raw["ret_3d"] = price_raw.groupby("ticker")["adj_close"].pct_change(3).shift(-3)
price_raw["ret_5d"] = price_raw.groupby("ticker")["adj_close"].pct_change(5).shift(-5)
price_raw["ret_7d"] = price_raw.groupby("ticker")["adj_close"].pct_change(7).shift(-7)

price_clean = price_raw.dropna(subset=["daily_return"])

price_clean.head(), price_clean.shape

  price_raw["daily_return"] = price_raw.groupby("ticker")["adj_close"].pct_change()
  price_raw["ret_1d"] = price_raw.groupby("ticker")["adj_close"].pct_change().shift(-1)
  price_raw["ret_3d"] = price_raw.groupby("ticker")["adj_close"].pct_change(3).shift(-3)
  price_raw["ret_5d"] = price_raw.groupby("ticker")["adj_close"].pct_change(5).shift(-5)
  price_raw["ret_7d"] = price_raw.groupby("ticker")["adj_close"].pct_change(7).shift(-7)


(            date       open       high        low      close  adj_close  \
 99086 2012-09-05  96.510002  96.621429  95.657143  95.747147  86.509338   
 99087 2012-09-06  96.167145  96.898575  95.828575  96.610001  87.288956   
 99088 2012-09-07  96.864288  97.497147  96.538574  97.205711  87.827171   
 99089 2012-09-10  97.207146  97.612854  94.585716  94.677139  85.542564   
 99090 2012-09-11  95.015717  95.728569  93.785713  94.370003  85.265068   
 
             volume ticker  daily_return    ret_1d    ret_3d    ret_5d  \
 99086   84093800.0   AAPL     -0.007022  0.009012 -0.011175 -0.000657   
 99087   97799100.0   AAPL      0.009012  0.006166 -0.023186  0.009922   
 99088   82416600.0   AAPL      0.006166 -0.026013 -0.015652  0.015931   
 99089  121999500.0   AAPL     -0.026013 -0.003244  0.030540  0.055889   
 99090  125995800.0   AAPL     -0.003244  0.013927  0.046458  0.062550   
 
          ret_7d  
 99086  0.031407  
 99087  0.034764  
 99088  0.031553  
 99089  0.059390  
 

In [5]:
price_clean.to_pickle("price_clean.pkl")
price_clean.to_csv("price_clean.csv", index=False)