# Collect Data for Test 2.3

In [None]:
import pandas as pd
from datetime import datetime
from data_collector import collect_data_inkl_news, determine_trend, get_news, sentiment_int
import os
import time

In [None]:
# Ticker passed to the data collector in the cells below.
symbol = "SPY"
# Upper bound of the yearly training windows; used as an exclusive range end,
# so the last training year is end_year - 1.
end_year = 2023

## Training Data

In [None]:
# Directory that receives one CSV per calendar year.
folder_path = "../../Data/spy_data"
os.makedirs(folder_path, exist_ok=True)

# One (Jan 1, Dec 31) window per year, from 2015 up to but excluding end_year.
periods = [
    (datetime(year, 1, 1), datetime(year, 12, 31))
    for year in range(2015, end_year)
]

for start, end in periods:
    # The file name embeds the string form of both datetimes.
    csv_path = f"{folder_path}/spy_{start}-{end}.csv"
    df = collect_data_inkl_news(symbol=symbol, start=start, end=end, alpaca_symbol="SPY")
    df.to_csv(csv_path)
    # Throttling is currently disabled. The API allows only 200 free calls per
    # minute; a time.sleep(30) here would avoid exceeding that limit.
    


In [None]:
# Merge the per-year CSV files into one chronologically ordered frame,
# normalise the news_probability column to floats, and persist the result.
folder_path = "../../Data/spy_data"

# os.listdir gives no ordering guarantee; sort for a reproducible file order.
spy_files = sorted(os.listdir(folder_path))

df_list = []
for file in spy_files:
    # Skip the aggregate written at the end of this cell and any entry that is
    # not a CSV file (e.g. hidden files or checkpoint directories).
    if file == "spy_concat.csv" or not file.endswith(".csv"):
        continue
    df = pd.read_csv(f"{folder_path}/{file}")
    # Remove the unnamed index column stored in the CSV.
    df.drop(columns=["Unnamed: 0"], inplace=True)
    df_list.append(df)

# ignore_index=True yields a fresh 0..n-1 index without adding an "index" column.
df_spy = pd.concat(df_list, ignore_index=True)
df_spy = df_spy[["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume", "month", "weekday", "news_probability", "news_sentiment"]]

# The labelling step that follows pairs each row with the next one, so the rows
# must be in date order regardless of the order the files were read in.
df_spy = df_spy.sort_values("Date", kind="stable", ignore_index=True)

# Values are stored as text such as "tensor(0.98, ...)". Keep only the leading
# number; str() and rstrip(")") also cover plain numbers and "tensor(0.98)".
df_spy["news_probability"] = df_spy["news_probability"].apply(
    lambda x: float(str(x).removeprefix("tensor(").split(",")[0].rstrip(")"))
)
df_spy.to_csv(f"{folder_path}/spy_concat.csv")
display(df_spy)


In [None]:
# Build the training set: per-sentiment probability features plus an "Invest"
# label that tells whether the NEXT row's trend equals 1.

# Row-wise values computed by the project helpers; kept on df_spy as before.
df_spy["trend"] = df_spy.apply(determine_trend, axis=1)
df_spy["sentiment_int"] = df_spy.apply(sentiment_int, axis=1)
df_train = df_spy.drop(columns=["Open", "Close", "month", "weekday", "High", "Low", "Adj Close", "Volume", "news_sentiment"])

# Sort BEFORE deriving the label: the label is defined through row adjacency,
# so it is only meaningful when neighbouring rows are consecutive dates.
df_train = df_train.sort_values("Date", kind="stable").reset_index(drop=True)

# Spread news_probability over three columns; only the column matching the
# row's sentiment_int (1, -1 or 0) carries the value, the others are 0.
df_train["positive"] = (df_train["sentiment_int"] == 1) * df_train["news_probability"]
df_train["negative"] = (df_train["sentiment_int"] == -1) * df_train["news_probability"]
df_train["neutral"] = (df_train["sentiment_int"] == 0) * df_train["news_probability"]

# shift(-1) aligns every row with its successor's trend. The last row has no
# successor (NaN), which compares unequal to 1 and therefore yields False.
df_train["Invest"] = df_train["trend"].shift(-1).eq(1)

df_train = df_train[["Date", "positive", "negative", "neutral", "Invest"]]

df_train.to_pickle("../../Data/spy_train_data.pkl.gz", compression="gzip")
display(df_train)

## Test Data

In [None]:
# Directory that receives the raw test-period download.
folder_path = "../../Data/test_data"
os.makedirs(folder_path, exist_ok=True)

# NOTE(review): the window starts on 2022-11-01 while the yearly training
# windows run through 2022-12-31, so Nov-Dec 2022 appears in both sets.
# Confirm this overlap is intended.
test_start = datetime(2022, 11, 1)
test_end = datetime(2023, 12, 31)

df = collect_data_inkl_news(symbol=symbol, start=test_start, end=test_end, alpaca_symbol="SPY")
df.to_csv(f"{folder_path}/spy.csv")

In [None]:
# Build the test set from the downloaded CSV: per-sentiment probability
# features plus an "Invest" label that tells whether the NEXT row's trend is 1.
df = pd.read_csv(f"{folder_path}/spy.csv")

# The label is defined through row adjacency, so make the date order explicit
# and work on a clean 0..n-1 index.
df = df.sort_values("Date", kind="stable").reset_index(drop=True)

# Compute the trend from the test frame itself. Using the training frame here
# would assign training-period trends to test rows by index position.
df["trend"] = df.apply(determine_trend, axis=1)
df["sentiment_int"] = df.apply(sentiment_int, axis=1)
df = df.drop(columns=["Open", "Close", "month", "weekday", "High", "Low", "Adj Close", "Volume", "news_sentiment"])

# Values are stored as text such as "tensor(0.98, ...)". Keep only the leading
# number; str() and rstrip(")") also cover plain numbers and "tensor(0.98)".
df["news_probability"] = df["news_probability"].apply(
    lambda x: float(str(x).removeprefix("tensor(").split(",")[0].rstrip(")"))
)

# Only the column matching the row's sentiment_int (1, -1 or 0) carries the
# probability; the other two are 0.
df["positive"] = (df["sentiment_int"] == 1) * df["news_probability"]
df["negative"] = (df["sentiment_int"] == -1) * df["news_probability"]
df["neutral"] = (df["sentiment_int"] == 0) * df["news_probability"]

# shift(-1) aligns every row with its successor's trend. The last row has no
# successor (NaN), which compares unequal to 1 and therefore yields False.
df["Invest"] = df["trend"].shift(-1).eq(1)

df = df[["Date", "positive", "negative", "neutral", "Invest"]]

df.to_pickle("../../Data/spy_test_data.pkl.gz", compression="gzip")
display(df)