# Collect Data for Test 2

In [None]:
import pandas as pd
from datetime import datetime
from data_collector import collect_data_inkl_news, determine_trend, get_news
import os
import time

In [None]:
# Ticker that later cells pass as `symbol=` to the data collector.
symbol = "SPY"

# Year bounds. `end_year` is used as the exclusive upper bound of the
# training-year range; `start_year` is not referenced by any visible cell.
start_year, end_year = 2015, 2023

## Training Data

In [None]:
# Download one CSV per calendar year of training data.
folder_path = "../Data/spy_data"
os.makedirs(folder_path, exist_ok=True)

# One (Jan 1, Dec 31) window for every year from 2016 up to, but not
# including, `end_year`.
periods = [
    (datetime(year, 1, 1), datetime(year, 12, 31))
    for year in range(2016, end_year)
]

for start, end in periods:
    df = collect_data_inkl_news(symbol=symbol, start=start, end=end, alpaca_symbol="SPY")
    df.to_csv(f"{folder_path}/spy_{start}-{end}.csv")

    # Throttle between yearly downloads to stay under the API rate limit
    # (only 200 free calls per minute).
    time.sleep(30)
    


In [None]:
# Merge the per-year CSV files into one frame, convert `news_probability`
# to float and store the merged result next to the inputs.
folder_path = "../Data/spy_data"
concat_name = "spy_concat.csv"

# os.listdir returns entries in arbitrary order, so sort them: the file names
# embed the period start date right after the "spy_" prefix, which makes
# lexicographic order chronological. The merged file written at the end of
# this cell lives in the same folder and must be skipped, otherwise re-running
# the cell would read its own output back in as input.
spy_files = sorted(
    file
    for file in os.listdir(folder_path)
    if file.endswith(".csv") and file != concat_name
)

columns = [
    "Date", "Open", "High", "Low", "Close", "Adj Close", "Volume",
    "month", "weekday", "news_probability", "news_sentiment",
]

df_list = [pd.read_csv(f"{folder_path}/{file}") for file in spy_files]

# ignore_index=True yields a fresh 0..n-1 index across all files; .copy()
# makes the column selection an independent frame so the assignment below
# does not operate on a view of the concatenated data.
df_spy = pd.concat(df_list, ignore_index=True)[columns].copy()

# The CSV stores the probability as text starting with "tensor(" (presumably
# the string form of a tensor — confirm against the collector). Keep only the
# leading number; rstrip handles text without a trailing ", ..." part.
df_spy["news_probability"] = df_spy["news_probability"].apply(
    lambda x: float(x.removeprefix("tensor(").split(",")[0].rstrip(")"))
)

df_spy.to_csv(f"{folder_path}/{concat_name}")
display(df_spy)


In [None]:
# Derive the model features and the next-row "Invest" label for the training
# set, then persist it as a pickle.
df_spy["trend"] = df_spy.apply(determine_trend, axis=1)
# NOTE(review): `sentiment_int` is not imported or defined in any visible
# cell — confirm it is available (presumably exported by data_collector).
df_spy["sentiment_int"] = df_spy.apply(sentiment_int, axis=1)
df_train = df_spy.drop(columns=["Date", "High", "Low", "Adj Close", "Volume", "news_sentiment"])

# A row is labelled Invest=True when the *following* row's trend equals 1.
# shift(-1) lines every row up with its successor; the last row has no
# successor (NaN after the shift) and is therefore labelled False.
# This relies on the rows being in chronological order.
df_train["Invest"] = df_train["trend"].shift(-1).eq(1)

df_train["month"] = df_train["month"].astype(int)
df_train["weekday"] = df_train["weekday"].astype(int)
df_train["Invest"] = df_train["Invest"].astype(bool)

df_train.to_pickle("../Data/spy_train_data.pkl")
display(df_train)

## Test Data

In [None]:
# Download the hold-out data (calendar year 2023) into its own folder.
folder_path = "../Data/test_data"
os.makedirs(folder_path, exist_ok=True)

df = collect_data_inkl_news(symbol=symbol, start=datetime(2023, 1, 1), end=datetime(2023, 12, 31), alpaca_symbol="SPY")

# Raw collector output; the next cell reads this file back in.
df.to_csv(f"{folder_path}/spy.csv")

In [None]:
# Build the test set with the same feature/label steps as the training data
# and persist it as a pickle. `folder_path` is the test-data folder set by
# the download cell above.
df = pd.read_csv(f"{folder_path}/spy.csv")

# read_csv already provides a 0..n-1 index; .copy() makes the column selection
# an independent frame so the assignments below do not touch a view.
df = df[["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume", "month", "weekday", "news_probability", "news_sentiment"]].copy()

# The training pipeline converts the textual "tensor(<value>, ...)" entries
# to floats; do the same here so both pickles carry the same dtype. The guard
# skips the conversion when the column was already read as numeric.
if not pd.api.types.is_numeric_dtype(df["news_probability"]):
    df["news_probability"] = df["news_probability"].apply(
        lambda x: float(x.removeprefix("tensor(").split(",")[0].rstrip(")"))
    )

df["trend"] = df.apply(determine_trend, axis=1)
# NOTE(review): `sentiment_int` is not imported or defined in any visible
# cell — confirm it is available (presumably exported by data_collector).
df["sentiment_int"] = df.apply(sentiment_int, axis=1)
df = df.drop(columns=["Date", "High", "Low", "Adj Close", "Volume", "news_sentiment"])

# A row is labelled Invest=True when the *following* row's trend equals 1.
# shift(-1) lines every row up with its successor; the last row has no
# successor (NaN after the shift) and is therefore labelled False.
df["Invest"] = df["trend"].shift(-1).eq(1)

df["month"] = df["month"].astype(int)
df["weekday"] = df["weekday"].astype(int)
df["Invest"] = df["Invest"].astype(bool)

df.to_pickle("../Data/spy_test_data.pkl")
display(df)

# Preview: first 30 rows without the last column (the Invest label).
df.iloc[:, :-1][:30]