# Collect Data for Test 2.3

In [None]:
import pandas as pd
from datetime import datetime
from data_collector import collect_data_inkl_news, determine_trend, sentiment_int
import os
from ta.momentum import RSIIndicator
from ta.trend import MACD
from ta.volatility import BollingerBands


In [None]:
# Notebook-wide settings used by the download cells.
symbol = "SPY"   # ticker passed to the data collector
end_year = 2023  # exclusive upper bound of the yearly training range

## Training Data

In [None]:
# One (first day, last day) pair per calendar year, from 2015 up to but
# not including end_year.
training_years = range(2015, end_year)
periods = [(datetime(y, 1, 1), datetime(y, 12, 31)) for y in training_years]

folder_path = "../../Data/spy_data"
os.makedirs(folder_path, exist_ok=True)

# Download each year on its own and store it as a separate CSV so a failed
# year can be re-fetched without repeating the others.
for start, end in periods:
    df = collect_data_inkl_news(symbol=symbol, start=start, end=end, alpaca_symbol="SPY")

    # NOTE(review): start/end are formatted with their default str(), so the
    # file name contains spaces and colons ("... 00:00:00"); that is not a
    # valid file name on Windows.
    csv_path = f"{folder_path}/spy_{start}-{end}.csv"
    df.to_csv(csv_path)

    # The API allows only 200 free calls per minute. If that limit is hit,
    # pause between years here, e.g. time.sleep(30) (needs `import time`).
    


In [None]:
# Build the training set: merge the yearly CSV files, derive the features
# and the next-day target, then pickle the result.
folder_path = "../../Data/"
spy_dir = f"{folder_path}spy_data"

# os.listdir gives no ordering guarantee, so sort the names for reproducible
# runs and skip anything that is not a yearly CSV export (hidden files,
# checkpoints, a previously concatenated file).
spy_files = sorted(
    file
    for file in os.listdir(spy_dir)
    if file.endswith(".csv") and file != "spy_concat.csv"
)
if not spy_files:
    raise FileNotFoundError(f"No yearly CSV files found in {spy_dir}")

df_list = []
for file in spy_files:
    yearly = pd.read_csv(f"{spy_dir}/{file}")
    # "Unnamed: 0" is the index column that to_csv wrote out.
    df_list.append(yearly.drop(columns=["Unnamed: 0"]))

df = pd.concat(df_list, ignore_index=True)

# .copy() so the column assignments below act on an independent frame
# instead of a slice (avoids SettingWithCopyWarning).
df = df[["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume", "month", "weekday", "news_probability", "news_sentiment"]].copy()

# Every rolling / shift feature below works on row position, so the rows
# must be in chronological order BEFORE those features are computed;
# sorting only at the end would mix values across mis-ordered years.
# Assumes the "Date" strings sort chronologically (e.g. ISO format) - TODO confirm.
df = df.sort_values("Date").reset_index(drop=True)

# The probability is stored as text beginning with "tensor("; keep only the
# leading number before the first comma.
df["news_probability"] = df["news_probability"].apply(lambda x: float(x.removeprefix("tensor(").split(",")[0]))

df["trend"] = df.apply(determine_trend, axis=1)
df["sentiment_int"] = df.apply(sentiment_int, axis=1)
df = df.drop(columns=["news_sentiment"])

# Technical indicators on the closing price.
close = df["Close"]
df["rsi"] = RSIIndicator(close, window=14).rsi()
macd = MACD(close=close)
df["macd"] = macd.macd()
df["macd_signal"] = macd.macd_signal()
bollinger = BollingerBands(close=close, window=20)
df["bollinger_hband"] = bollinger.bollinger_hband()
df["bollinger_lband"] = bollinger.bollinger_lband()

# Mean of the three previous closes; shift(1) keeps the current row's close
# out of its own feature.
df["moving_avg"] = close.shift(1).rolling(window=3).mean()

df["month"] = df["month"].astype(int)
df["weekday"] = df["weekday"].astype(int)

# Spread the probability over three columns: it lands in the column that
# matches the sentiment class, the other two are 0.
df["positive"] = (df["sentiment_int"] == 1) * df["news_probability"]
df["negative"] = (df["sentiment_int"] == -1) * df["news_probability"]
df["neutral"] = (df["sentiment_int"] == 0) * df["news_probability"]

# Target: the close of the next row (next trading day once sorted).
df["Y"] = df["Close"].shift(-1)

# Removes the indicator warm-up rows, the last row (no next close) and any
# other row with a missing value.
df.dropna(inplace=True)
df.drop(columns=["news_probability", "sentiment_int"], inplace=True)

df.to_pickle(f"{folder_path}train_spy_data.pkl")
display(df)

## Test Data

In [None]:
# The test data lives in its own directory, apart from the yearly training files.
folder_path = "../../Data/test_data"
os.makedirs(folder_path, exist_ok=True)

# A single download covering 2022-11-01 through 2023-12-31.
# NOTE(review): the two months before 2023 presumably serve as warm-up for
# the indicators computed in the next cell - confirm.
test_start = datetime(2022, 11, 1)
test_end = datetime(2023, 12, 31)
df = collect_data_inkl_news(symbol=symbol, start=test_start, end=test_end, alpaca_symbol="SPY")

df.to_csv(f"{folder_path}/spy.csv")

In [None]:
# Build the test set from the downloaded CSV: derive the same features and
# next-day target as for the training data, then pickle the result.
# Relies on folder_path still pointing at the test data directory.
df = pd.read_csv(f"{folder_path}/spy.csv")

# .copy() so the column assignments below act on an independent frame
# instead of a slice (avoids SettingWithCopyWarning). Selecting the columns
# also discards the index column that to_csv wrote out.
df = df[["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume", "month", "weekday", "news_probability", "news_sentiment"]].copy()

# Every rolling / shift feature below works on row position, so make sure
# the rows are in chronological order BEFORE those features are computed.
# Assumes the "Date" strings sort chronologically (e.g. ISO format) - TODO confirm.
df = df.sort_values("Date").reset_index(drop=True)

# The probability is stored as text beginning with "tensor("; keep only the
# leading number before the first comma.
df["news_probability"] = df["news_probability"].apply(lambda x: float(x.removeprefix("tensor(").split(",")[0]))

df["trend"] = df.apply(determine_trend, axis=1)
df["sentiment_int"] = df.apply(sentiment_int, axis=1)
df = df.drop(columns=["news_sentiment"])

# Technical indicators on the closing price.
close = df["Close"]
df["rsi"] = RSIIndicator(close, window=14).rsi()
macd = MACD(close=close)
df["macd"] = macd.macd()
df["macd_signal"] = macd.macd_signal()
bollinger = BollingerBands(close=close, window=20)
df["bollinger_hband"] = bollinger.bollinger_hband()
df["bollinger_lband"] = bollinger.bollinger_lband()

# Mean of the three previous closes; shift(1) keeps the current row's close
# out of its own feature.
df["moving_avg"] = close.shift(1).rolling(window=3).mean()

df["month"] = df["month"].astype(int)
df["weekday"] = df["weekday"].astype(int)

# Spread the probability over three columns: it lands in the column that
# matches the sentiment class, the other two are 0.
df["positive"] = (df["sentiment_int"] == 1) * df["news_probability"]
df["negative"] = (df["sentiment_int"] == -1) * df["news_probability"]
df["neutral"] = (df["sentiment_int"] == 0) * df["news_probability"]

# Target: the close of the next row (next trading day once sorted).
df["Y"] = df["Close"].shift(-1)

# Removes the indicator warm-up rows, the last row (no next close) and any
# other row with a missing value.
df.dropna(inplace=True)
df.drop(columns=["news_probability", "sentiment_int"], inplace=True)

# folder_path has no trailing separator here, so one must be added;
# without it the file was written as "<...>/test_datatest_spy_data.pkl"
# next to the directory instead of inside it.
# NOTE(review): point any downstream loader at this corrected path.
df.to_pickle(f"{folder_path}/test_spy_data.pkl")
print(df.columns)
display(df)