In [None]:
import pandas as pd
from datetime import datetime
from data_collector import get_finance_data, collect_data_inkl_news, determine_trend, sentiment_int
import os
from functools import reduce
from ta.momentum import RSIIndicator
from ta.trend import MACD
from ta.volatility import BollingerBands

In [None]:
# Make sure the output directory that the cells below write their pickles into
# exists; exist_ok=True turns this into a no-op when it is already there.
os.makedirs("../Data", exist_ok=True)

# Training Data

In [None]:
# Date window and ticker shared by all training-set cells in this section.
train_start_date = datetime(2015, 1, 1)
train_end_date = datetime(2022, 12, 31)
symbol = "SPY"

## Fear and Greed Index

In [None]:
# Build the Fear & Greed training set: price data for `symbol` joined with the
# Fear & Greed CSV, technical indicators on Close, and the next row's Close as
# the target column "Y". The result is pickled to ../Data/train_f_and_g.pkl.
df = get_finance_data(symbol, start=train_start_date, end=train_end_date)
f_and_g = pd.read_csv("https://raw.githubusercontent.com/whit3rabbit/fear-greed-data/main/fear-greed-2011-2023.csv")

# Bring both join keys to datetime64 so the merge compares like with like.
df["Date"] = pd.to_datetime(df["Date"].astype(str))
f_and_g["Date"] = pd.to_datetime(f_and_g["Date"])

# Left join keeps every price row; rows without a matching Fear & Greed entry
# end up with NaN and are removed by dropna() further down.
df = pd.merge(df, f_and_g, on="Date", how="left")

# Sort first, then renumber: everything below (indicators, target shift) is
# order-dependent, so the positional order must be chronological.
df = df.sort_values("Date").reset_index(drop=True)

# Technical indicators computed from the closing price.
df['rsi'] = RSIIndicator(df['Close'], window=14).rsi()
macd = MACD(close=df['Close'])
df['macd'] = macd.macd()
df['macd_signal'] = macd.macd_signal()
bollinger = BollingerBands(close=df['Close'], window=20)
df['bollinger_hband'] = bollinger.bollinger_hband()
df['bollinger_lband'] = bollinger.bollinger_lband()

# Target = Close of the following row. shift(-1) is vectorised, keeps the
# column numeric (the former None-initialised loop produced an object column)
# and matches how the News and Commodities cells build "Y". The last row has
# no successor, gets NaN and is removed by dropna() below.
df["Y"] = df["Close"].shift(-1)

df["month"] = df["month"].astype(int)
# Drops indicator warm-up rows, days without Fear & Greed data and the final
# row without a target. The frame is already in Date order at this point.
df = df.dropna()

df.to_pickle("../Data/train_f_and_g.pkl")
display(df)

## News

In [None]:
# First and last calendar year covered by the training window; the news
# download cell below iterates over these years.
start_year = train_start_date.year
end_year = train_end_date.year

# Directory that receives one CSV per downloaded year.
folder_path = "../Data/news_train"

In [None]:
# One (Jan 1, Dec 31) window per calendar year from start_year to end_year.
years = range(start_year, end_year + 1)
periods = [(datetime(y, 1, 1), datetime(y, 12, 31)) for y in years]

os.makedirs(folder_path, exist_ok=True)

# Download each yearly window on its own and store it as a separate CSV in
# folder_path; the file name embeds the window's start and end timestamps.
for period_start, period_end in periods:
    df = collect_data_inkl_news(symbol=symbol, start=period_start, end=period_end, alpaca_symbol="SPY")
    df.to_csv(f"{folder_path}/spy_{period_start}-{period_end}.csv")

In [None]:
# Assemble the news training set from the per-year CSVs in folder_path:
# concatenate, derive trend/sentiment features and technical indicators, add
# the next-row Close as target "Y", and pickle to ../Data/train_news.pkl.

# Only CSV files are inputs. Skipping everything else keeps stray directory
# entries (.DS_Store, .ipynb_checkpoints, ...) from breaking read_csv, and
# "spy_concat.csv" is excluded as before.
spy_files = [
    name for name in os.listdir(folder_path)
    if name.endswith(".csv") and name != "spy_concat.csv"
]

df_list = []
for file in spy_files:
    yearly = pd.read_csv(f"{folder_path}/{file}")
    # "Unnamed: 0" is the index column that to_csv wrote when the file was saved.
    df_list.append(yearly.drop(columns=["Unnamed: 0"]))

# os.listdir order is arbitrary, so establish Date order and a clean 0..n-1
# index before any order-dependent computation.
df = pd.concat(df_list).sort_values("Date").reset_index(drop=True)

# .copy() makes the column subset an independent frame, so the column
# assignments below cannot trigger SettingWithCopyWarning.
df = df[["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume", "month", "weekday", "news_probability", "news_sentiment"]].copy()

# news_probability is read back as text beginning with "tensor("; keep the
# part before the first comma and parse it as a float.
df["news_probability"] = df["news_probability"].apply(lambda x: float(x.removeprefix("tensor(").split(",")[0]))

# Row-wise helpers from data_collector; sentiment_int presumably encodes
# news_sentiment as an integer (verify in data_collector).
df["trend"] = df.apply(determine_trend, axis=1)
df["sentiment_int"] = df.apply(sentiment_int, axis=1)
df = df.drop(columns=["news_sentiment"])

# Technical indicators computed from the closing price.
df['rsi'] = RSIIndicator(df['Close'], window=14).rsi()
macd = MACD(close=df['Close'])
df['macd'] = macd.macd()
df['macd_signal'] = macd.macd_signal()
bollinger = BollingerBands(close=df['Close'], window=20)
df['bollinger_hband'] = bollinger.bollinger_hband()
df['bollinger_lband'] = bollinger.bollinger_lband()

# Target: Close of the following row (NaN on the last row).
df["Y"] = df["Close"].shift(-1)
# Mean of the three previous closes; shift(1) keeps the current row's Close
# out of its own average.
df["moving_avg"] = df["Close"].shift(1).rolling(window=3).mean()
# Removes indicator warm-up rows and the final row without a target.
df = df.dropna()

df["month"] = df["month"].astype(int)
df["weekday"] = df["weekday"].astype(int)

df.to_pickle("../Data/train_news.pkl")
display(df)

## Commodities

In [None]:
# Tickers whose Open/Close are merged with the main symbol in the next cell.
# The "=F" entries are futures contracts, the rest are ETFs
# (presumably Yahoo Finance symbols - confirm against get_finance_data).
commodities = [
    "CL=F",  # WTI Crude Oil
    "BZ=F",  # Brent Crude Oil
    "GC=F",  # Gold
    "SI=F",  # Silver
    "PL=F",  # Platinum
    "PA=F",  # Palladium
    "HG=F",  # Copper
    "ZC=F",  # Corn
    "ZS=F",  # Soybeans
    "ZW=F",  # Wheat
    "KC=F",  # Coffee
    "CC=F",  # Cocoa
    "SB=F",  # Sugar
    "NG=F",  # Natural Gas
    "HO=F",  # Heating Oil
    # ETFs
    "GLD",   # SPDR Gold Shares
    "SLV",   # iShares Silver Trust
    "USO",   # United States Oil Fund
    "UNG"    # United States Natural Gas Fund
]

In [None]:
# Build the commodities training set: Open/Close of every commodity ticker
# inner-joined on Date with the full data of `symbol`, the commodity whose
# 5-row rolling correlation with the symbol's Close is highest, and the
# next-row Close as target "Y". Pickled to ../Data/commodities_train.pkl.

# Do not append `symbol` to `commodities`: mutating the list from the previous
# cell makes this cell non-idempotent (a second run would add the symbol twice
# and the merge would then emit suffixed duplicate columns). The filter also
# tolerates a list that already contains `symbol` from an earlier run.
commodity_symbols = [ticker for ticker in commodities if ticker != symbol]
results = []

for ticker in commodity_symbols + [symbol]:
    df = get_finance_data(ticker, start=train_start_date, end=train_end_date)
    if ticker != symbol:
        # Commodities contribute only their Open and Close prices.
        df = df[["Date", "Open", "Close"]]
    # Prefix every column with the ticker but keep the join key named "Date".
    df = df.add_prefix(f"{ticker}_").rename(columns={f"{ticker}_Date": "Date"})
    results.append(df)

# Inner join: only dates present for every ticker survive.
df = reduce(lambda left, right: pd.merge(left, right, on="Date", how="inner"), results)
df = df.rename(columns={
    f"{symbol}_Open": "Open",
    f"{symbol}_Close": "Close",
})
# Rolling windows and the target shift below are positional, so fix the
# chronological order and a clean 0..n-1 index first.
df = df.sort_values("Date").reset_index(drop=True)

# Rolling correlation of the symbol's Close with each commodity's Close,
# keyed by the commodity's position in the list.
correlation_df = pd.DataFrame({
    position: df["Close"].rolling(window=5).corr(df[f"{ticker}_Close"])
    for position, ticker in enumerate(commodity_symbols)
})

# For each row, find the commodity with the highest correlation. Warm-up rows
# are entirely NaN; idxmax over such rows is deprecated in pandas and slated to
# raise, so it is evaluated only where a correlation exists. The remaining rows
# get NaN through index alignment and are removed by dropna() below.
has_correlation = correlation_df.notna().any(axis=1)
df["Top_Correlation"] = correlation_df.loc[has_correlation].idxmax(axis=1)
df["Top_Correlation_Value"] = correlation_df.max(axis=1)

# Target: Close of the following row (NaN on the last row).
df["Y"] = df["Close"].shift(-1)
df = df.dropna()

df.to_pickle("../Data/commodities_train.pkl")
display(df)

# Test Data

In [None]:
# Date window and ticker shared by all test-set cells in this section.
# The window opens before 2023 because the LSTM needs seq_size days of
# history ahead of the first 2023 sample.
test_start_date = datetime(2022, 10, 1) # LSTM needs seq_size days before 2023
test_end_date = datetime(2023, 12, 31)
symbol = "SPY"

## Fear and Greed Index

In [None]:
# Build the Fear & Greed test set: price data for `symbol` joined with the
# Fear & Greed CSV, technical indicators on Close, and the next row's Close as
# the target column "Y". The result is pickled to ../Data/test_f_and_g.pkl.
df = get_finance_data(symbol, start=test_start_date, end=test_end_date)
f_and_g = pd.read_csv("https://raw.githubusercontent.com/whit3rabbit/fear-greed-data/main/fear-greed-2011-2023.csv")

# Bring both join keys to datetime64 so the merge compares like with like.
df["Date"] = pd.to_datetime(df["Date"].astype(str))
f_and_g["Date"] = pd.to_datetime(f_and_g["Date"])

# Left join keeps every price row; rows without a matching Fear & Greed entry
# end up with NaN and are removed by dropna() further down.
df = pd.merge(df, f_and_g, on="Date", how="left")

# Sort first, then renumber: everything below (indicators, target shift) is
# order-dependent, so the positional order must be chronological.
df = df.sort_values("Date").reset_index(drop=True)

# Technical indicators computed from the closing price.
df['rsi'] = RSIIndicator(df['Close'], window=14).rsi()
macd = MACD(close=df['Close'])
df['macd'] = macd.macd()
df['macd_signal'] = macd.macd_signal()
bollinger = BollingerBands(close=df['Close'], window=20)
df['bollinger_hband'] = bollinger.bollinger_hband()
df['bollinger_lband'] = bollinger.bollinger_lband()

# Target = Close of the following row. shift(-1) is vectorised, keeps the
# column numeric (the former None-initialised loop produced an object column)
# and matches how the News and Commodities cells build "Y". The last row has
# no successor, gets NaN and is removed by dropna() below.
df["Y"] = df["Close"].shift(-1)

df["month"] = df["month"].astype(int)
# Drops indicator warm-up rows, days without Fear & Greed data and the final
# row without a target. The frame is already in Date order at this point.
df = df.dropna()

df.to_pickle("../Data/test_f_and_g.pkl")
display(df)

## News

In [None]:
# First and last calendar year touched by the test window; the news download
# cell below iterates over these years.
start_year = test_start_date.year
end_year = test_end_date.year

# Directory that receives one CSV per downloaded year.
folder_path = "../Data/news_test"

In [None]:
# One (Jan 1, Dec 31) window per calendar year from start_year to end_year.
# NOTE(review): the windows are whole calendar years, so the first one starts
# on Jan 1 of start_year even though test_start_date is 2022-10-01 - confirm
# that fetching the earlier months is intended.
years = range(start_year, end_year + 1)
periods = [(datetime(y, 1, 1), datetime(y, 12, 31)) for y in years]
print(periods)
os.makedirs(folder_path, exist_ok=True)

# Download each yearly window on its own and store it as a separate CSV in
# folder_path; the file name embeds the window's start and end timestamps.
for period_start, period_end in periods:
    df = collect_data_inkl_news(symbol=symbol, start=period_start, end=period_end, alpaca_symbol="SPY")
    df.to_csv(f"{folder_path}/spy_{period_start}-{period_end}.csv")

In [None]:
# Assemble the news test set from the per-year CSVs in folder_path:
# concatenate, derive trend/sentiment features and technical indicators, add
# the next-row Close as target "Y", and pickle to ../Data/test_news.pkl.

# Only CSV files are inputs. Skipping everything else keeps stray directory
# entries (.DS_Store, .ipynb_checkpoints, ...) from breaking read_csv, and
# "spy_concat.csv" is excluded as before.
spy_files = [
    name for name in os.listdir(folder_path)
    if name.endswith(".csv") and name != "spy_concat.csv"
]

df_list = []
for file in spy_files:
    yearly = pd.read_csv(f"{folder_path}/{file}")
    # "Unnamed: 0" is the index column that to_csv wrote when the file was saved.
    df_list.append(yearly.drop(columns=["Unnamed: 0"]))

# os.listdir order is arbitrary, so establish Date order and a clean 0..n-1
# index before any order-dependent computation.
df = pd.concat(df_list).sort_values("Date").reset_index(drop=True)

# .copy() makes the column subset an independent frame, so the column
# assignments below cannot trigger SettingWithCopyWarning.
df = df[["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume", "month", "weekday", "news_probability", "news_sentiment"]].copy()

# news_probability is read back as text beginning with "tensor("; keep the
# part before the first comma and parse it as a float.
df["news_probability"] = df["news_probability"].apply(lambda x: float(x.removeprefix("tensor(").split(",")[0]))

# Row-wise helpers from data_collector; sentiment_int presumably encodes
# news_sentiment as an integer (verify in data_collector).
df["trend"] = df.apply(determine_trend, axis=1)
df["sentiment_int"] = df.apply(sentiment_int, axis=1)
df = df.drop(columns=["news_sentiment"])

# Technical indicators computed from the closing price.
df['rsi'] = RSIIndicator(df['Close'], window=14).rsi()
macd = MACD(close=df['Close'])
df['macd'] = macd.macd()
df['macd_signal'] = macd.macd_signal()
bollinger = BollingerBands(close=df['Close'], window=20)
df['bollinger_hband'] = bollinger.bollinger_hband()
df['bollinger_lband'] = bollinger.bollinger_lband()

# Target: Close of the following row (NaN on the last row).
df["Y"] = df["Close"].shift(-1)
# Mean of the three previous closes; shift(1) keeps the current row's Close
# out of its own average.
df["moving_avg"] = df["Close"].shift(1).rolling(window=3).mean()
# Removes indicator warm-up rows and the final row without a target.
df = df.dropna()

df["month"] = df["month"].astype(int)
df["weekday"] = df["weekday"].astype(int)

df.to_pickle("../Data/test_news.pkl")
display(df)

## Commodities

In [None]:
# Tickers whose Open/Close are merged with the main symbol in the next cell.
# The "=F" entries are futures contracts, the rest are ETFs
# (presumably Yahoo Finance symbols - confirm against get_finance_data).
commodities = [
    "CL=F",  # WTI Crude Oil
    "BZ=F",  # Brent Crude Oil
    "GC=F",  # Gold
    "SI=F",  # Silver
    "PL=F",  # Platinum
    "PA=F",  # Palladium
    "HG=F",  # Copper
    "ZC=F",  # Corn
    "ZS=F",  # Soybeans
    "ZW=F",  # Wheat
    "KC=F",  # Coffee
    "CC=F",  # Cocoa
    "SB=F",  # Sugar
    "NG=F",  # Natural Gas
    "HO=F",  # Heating Oil
    # ETFs
    "GLD",   # SPDR Gold Shares
    "SLV",   # iShares Silver Trust
    "USO",   # United States Oil Fund
    "UNG"    # United States Natural Gas Fund
]

In [None]:
# Build the commodities test set: Open/Close of every commodity ticker
# inner-joined on Date with the full data of `symbol`, the commodity whose
# 5-row rolling correlation with the symbol's Close is highest, and the
# next-row Close as target "Y". Pickled to ../Data/commodities_test.pkl.

# Do not append `symbol` to `commodities`: mutating the list from the previous
# cell makes this cell non-idempotent (a second run would add the symbol twice
# and the merge would then emit suffixed duplicate columns). The filter also
# tolerates a list that already contains `symbol` from an earlier run.
commodity_symbols = [ticker for ticker in commodities if ticker != symbol]
results = []

for ticker in commodity_symbols + [symbol]:
    df = get_finance_data(ticker, start=test_start_date, end=test_end_date)
    if ticker != symbol:
        # Commodities contribute only their Open and Close prices.
        df = df[["Date", "Open", "Close"]]
    # Prefix every column with the ticker but keep the join key named "Date".
    df = df.add_prefix(f"{ticker}_").rename(columns={f"{ticker}_Date": "Date"})
    results.append(df)

# Inner join: only dates present for every ticker survive.
df = reduce(lambda left, right: pd.merge(left, right, on="Date", how="inner"), results)
df = df.rename(columns={
    f"{symbol}_Open": "Open",
    f"{symbol}_Close": "Close",
})
# Rolling windows and the target shift below are positional, so fix the
# chronological order and a clean 0..n-1 index first.
df = df.sort_values("Date").reset_index(drop=True)

# Rolling correlation of the symbol's Close with each commodity's Close,
# keyed by the commodity's position in the list.
correlation_df = pd.DataFrame({
    position: df["Close"].rolling(window=5).corr(df[f"{ticker}_Close"])
    for position, ticker in enumerate(commodity_symbols)
})

# For each row, find the commodity with the highest correlation. Warm-up rows
# are entirely NaN; idxmax over such rows is deprecated in pandas and slated to
# raise, so it is evaluated only where a correlation exists. The remaining rows
# get NaN through index alignment and are removed by dropna() below.
has_correlation = correlation_df.notna().any(axis=1)
df["Top_Correlation"] = correlation_df.loc[has_correlation].idxmax(axis=1)
df["Top_Correlation_Value"] = correlation_df.max(axis=1)

# Target: Close of the following row (NaN on the last row).
df["Y"] = df["Close"].shift(-1)
df = df.dropna()

df.to_pickle("../Data/commodities_test.pkl")
display(df)