In [7]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional
from keras.optimizers import RMSprop
from keras.losses import Huber
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler as Ms

def build_model(input_shape, num_stocks=42):
    """
    Builds and compiles an LSTM model.

    Parameters:
    input_shape (tuple): The shape of the input data (time_steps, num_features).
    num_stocks (int): The number of stocks (output dimensions).

    Returns:
    keras.models.Sequential: Compiled LSTM model.
    """
    
    model = Sequential([
        LSTM(50, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(50),
        Dropout(0.2),
        Dense(num_stocks, activation='tanh')  # Using tanh activation
    ])

    model.compile(optimizer=RMSprop(), loss=Huber())  # Using RMSprop and Huber Loss
    print(model.summary())
    return model

def model_atmpt_2(input_shape, num_stocks=5):
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape = input_shape))

    model.add(Dropout(0.1)) 
    model.add(LSTM(units=50))

    model.add(Dense(42))

    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_absolute_error'])

    print(model.summary())
    return model

def train_model(model, X, y, epochs=20, batch_size=64, validation_split=0.2):
    """
    Trains the LSTM model.

    Parameters:
    model (keras.models.Sequential): The compiled LSTM model.
    X (numpy.ndarray): Input features.
    y (numpy.ndarray): Target values.
    epochs (int): Number of training epochs.
    batch_size (int): Batch size for training.
    validation_split (float): Fraction of data to use for validation.

    Returns:
    keras.callbacks.History: History object containing training history.
    """
    
    history = model.fit(X, y, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
    return history


def implement_model(df, model, train_seq, train_label, test_seq, test_label, epochs=3, batch_size=64, verbose=1):
    """
    Trains the LSTM model.

    Parameters:
    model (keras.models.Sequential): The compiled LSTM model.
    train_seq (numpy.ndarray): Input features.
    train_label (numpy.ndarray): Target values.
    validataion_data : tuple of test seq and label
    epochs (int): Number of training epochs.
    batch_size (int): Batch size for training.

    Returns:
    keras.callbacks.History: History object containing training history.
    """
    model.fit(train_seq, train_label, epochs=epochs, batch_size=batch_size, validation_data=(test_seq, test_label))
    return model

def print_df(df, filename):
   with open(f"{filename}.txt", "w") as f:
        f.write(df.head(50).to_string())

In [3]:
# Data cleaning, normalizing, feature engineering, scaling
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def merge_data(stock_data, gov_data):
    """
    Merges stock data and government spending data on the date for each ticker.
    
    Parameters:
    stock_data (dict): A dictionary with ticker symbols as keys and their corresponding DataFrames as values.
    gov_data (pd.DataFrame): A DataFrame containing the government spending data.
    
    Returns:
    pd.DataFrame: A merged DataFrame with stock data and government spending data.
    """
    print("Stock Data Columns:", stock_data.keys())
    print("USASpending Data Columns:", gov_data.columns.tolist())
    
    # Ensure the Date column in government spending data is datetime
    gov_data['Date'] = pd.to_datetime(gov_data['Date'])


    merged_data = pd.DataFrame()
    for ticker, df in stock_data.items():
        df.reset_index(inplace=True)  # Ensure the date is a column
        df.rename(columns={'t': 'Date', 'o': 'Open', 'h': 'High', 'l': 'Low', 'c': 'Close', 'v': 'Volume'}, inplace=True)
        df['Ticker'] = ticker
        df['Date'] = pd.to_datetime(df['Date'])
        merged = pd.merge(df, gov_data, on='Date', how='inner')  # Merge on the Date column
        merged_data = pd.concat([merged_data, merged], ignore_index=True)
    return merged_data

def split_train_test(df):
    Ms = MinMaxScaler()
    df[df.columns] = Ms.fit_transform(df)

    training_size= round(len(df)*0.80)
    train_data= df[:training_size]
    test_data=df[training_size:]
    return train_data, test_data

 



def create_sequences(df, window_size=30, column_A="Date"):
    """
    Prepares data for LSTM by reshaping it into a 3D array.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing
    window_size (int): The number of lagged time steps.
    
    Returns:
    np.array, np.array: The reshaped features and targets.
    """
    sequences = []
    labels = []
    strt_idx = 0
    for stp_idx in range(window_size, len(df)):
        sequences.append(df.iloc[strt_idx:stp_idx].values)
        labels.append(df.iloc[stp_idx].values)
        strt_idx+=1
    return(np.array(sequences), np.array(labels))
    


    #previous thought which was close but not complete
    print(df)
    values = df.values#drop(columns=[column_A]).values
    X, y = [], []
    for i in range(window_size, len(values)):
        X.append(values[i-window_size:i, :-1])
        y.append(values[i, -1])
    return np.array(X), np.array(y)


In [4]:
# Functions for retrieving and processing Polygon.io data
import requests
import time
import pandas as pd
import numpy as np
import os

POLYGON_API_KEY='q6YjvzTWAp_OkhFvfxwfgrtIVOpddl_V'
POLYGON_API_URL='https://api.polygon.io'

def get_historical_stock_data(ticker, start_date, end_date, POLYGON_API_KEY='q6YjvzTWAp_OkhFvfxwfgrtIVOpddl_V', POLYGON_API_URL='https://api.polygon.io'):
    """
    Fetches historical stock data for a given ticker from Polygon.io.

    Parameters:
    ticker (str): The stock ticker symbol to fetch data for.
    start_date (str): The start date for fetching data in 'YYYY-MM-DD' format.
    end_date (str): The end date for fetching data in 'YYYY-MM-DD' format.

    Returns:
    pd.DataFrame: A DataFrame containing OHLCV data for the ticker.
    """
    
    url = f"{POLYGON_API_URL}/v2/aggs/ticker/{ticker}/range/1/day/{start_date}/{end_date}?adjusted=true&sort=asc&limit=5000&apiKey={POLYGON_API_KEY}"
    try:
        response = requests.get(url)
        response.raise_for_status()

        data = response.json()

        data = response.json()
        if 'results' in data and data['results']:
            df = pd.DataFrame(data['results'])
            # print(df.t[3])
            df['t_a'] = pd.to_datetime(df['t'], unit ="ms", yearfirst=True)
            df["t"] =  df['t_a'].dt.date
            
            
            df.set_index('t', inplace=True)
            # print(type(df.index))
            df.rename(columns={'o': f'o_{ticker}', 'h': f'h_{ticker}', 'l': f'l_{ticker}', 'c': f'c_{ticker}','v':f'v_{ticker}'}, inplace=True)
            # print(df.head())
            
          
            df[f'{ticker}_SMA_10'] = df[f'c_{ticker}'].rolling(window=10).mean()
            df[f'{ticker}_SMA_50'] = df[f'c_{ticker}'].rolling(window=50).mean()
            df[f'{ticker}_Returns'] = df[f'c_{ticker}'].pct_change()
            df.dropna(inplace=True)
            
            return df[[f'o_{ticker}', f'h_{ticker}', f'l_{ticker}', f'c_{ticker}',f'v_{ticker}', f'{ticker}_SMA_10',f'{ticker}_SMA_50',f'{ticker}_Returns']]#df[['o', 'h', 'l', 'c', 'v']]
        else:
            print(f"No data available for {ticker} in the specified date range.")
            return pd.DataFrame()

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {ticker}: {e}")
        return pd.DataFrame()
    except ValueError as e:
        print(f"JSON decode error for {ticker}: {e}")
        return pd.DataFrame()

def get_data_for_multiple_tickers(tickers=['NGL', 'TSLA', 'AAPL', 'V', 'NSRGY'], start_date= '2023-10-01', end_date = '2024-12-30'):
    """
    Fetches historical stock data for multiple tickers from Polygon.io.

    Parameters:
    tickers (list): A list of stock ticker symbols to fetch data for.
    start_date (str): The start date for fetching data in 'YYYY-MM-DD' format.
    end_date (str): The end date for fetching data in 'YYYY-MM-DD' format.

    Returns:
    dict: A dictionary with ticker symbols as keys and their corresponding DataFrames as values.
    """
    
    stock_data = {}
    for ticker in tickers:
        data = get_historical_stock_data(ticker, start_date, end_date)
        print(f"Fetched data for {ticker}")
        time.sleep(1)
        if not data.empty:
            stock_data[ticker] = data
    return stock_data

def merge_dataframes(starting_df, dict_stock_dfs):
    """ merges the dataframes"""
    merged_data=starting_df
        #identifies the starting dataframe for subsequent merges
    for data_frame in dict_stock_dfs:
        #will iterate through the keys of the dictionaries of stock dataframes, these keys will be the tickers of the stocks
        df_to_add = dict_stock_dfs[data_frame]
            #accesses the current dataframe in the dictionary of stock dataframes  
        merged_data = pd.merge(merged_data, df_to_add, right_index=True, left_index=True)
            #merges the usa spending for each date with the corresponding stock data for that date
            #since the dates are the indicies, the merge occurs on the indicies
            #each stock dataframe has to have column titles that are unique to it's stock so that the stocks can all be in the same dataframe without overwriting eachothers data
                # ie every stock dataframe has data for o h l c and v so we add the stock ticker to the column name as an extra identifier
    merged_data.rename_axis("Date", inplace=True)
        #retains the original index identifier so that the index can be accessed using the keyword "Date" in future code
    return merged_data

In [5]:
import pandas as pd

def get_usaspending_data(filepath='/data/users/mccallke0364/Algorithmic Trading/github/CS-491-Algorithmic-Trading-Project/src/data_collection/usaspending_data.csv'):
    """
    Loads government spending data from a CSV file.
    
    Parameters:
    filepath (str): The path to the CSV file.
    
    Returns:
    pd.DataFrame: A DataFrame containing the government spending data.
    """
    df = pd.read_csv(filepath, parse_dates=['Date'], header=0, index_col=0)
    df.index = pd.to_datetime(df.index, unit='ms')
    print_df(df.sort_index(), "usa_spend")
 
    # df.set_index(["Date"])
    # print(type(df.index))
    return df


In [6]:
import os
import numpy as np
import pandas as pd
# from data_collection.polygon_data import *
# #from data_collection.bezinga_data import get_government_trades_data
# from data_collection.usaspending_data import *
# from preprocessing.preprocess_data import *
# from model.lstm_model import *
# from model.utils import *


################---------Collect data from polygon.io and usa spending---------################
dict_stock_dfs = get_data_for_multiple_tickers() 
    # a dictionary of each stocks dataframe 
    # set up with the default stocks of ['NGL', 'TSLA', 'AAPL', 'V', 'NSRGY'] and dates start_date = '2023-10-01' end_date = '2024-12-30' 
usa_spending_data = get_usaspending_data() 
    # dataframe of the usa spending data for each date 
    # set up with the default path to be 'data_collection/usaspending_data.csv'
usa_spending_data = usa_spending_data.filter(["total_obligations",  "total_outlayed_amount"])
full_dataframe = merge_dataframes(usa_spending_data, dict_stock_dfs)
    #merges all the stock dataframes and usa spending based on the date the data was collected

print_df(full_dataframe, "full_df")

################---------Process data for model---------################
train_data, test_data = split_train_test(full_dataframe)
    #split data 80:20 for training and testing
train_seq, train_label = create_sequences(train_data)
    #creating the sequence for training
test_seq, test_label = create_sequences(test_data)
    #creating the sequence for testing


# print(train_seq)
################---------Build model---------################
# model = model_atmpt_2((train_seq.shape[1], train_seq.shape[2]))
    #building model, automatically takes the default number of stocks, (5), since it is not specified in the call
model = build_model((train_seq.shape[1], train_seq.shape[2]))


# print_df(implement_model(full_dataframe, model, train_seq, train_label, test_seq, test_label), "full_model_1")
    #used train_model_pre_split since the data has already been split into training and testing data
    #anything not specified has a default in the function call


Fetched data for NGL
Fetched data for TSLA
Fetched data for AAPL
Fetched data for V
Fetched data for NSRGY
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 30, 50)            18600     
                                                                 
 dropout (Dropout)           (None, 30, 50)            0         
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense (Dense)               (None, 42)                2142      
                                                                 
Total params: 40942 (159.93 KB)
Trainable params: 40942 (159.93 KB)
Non-trainable

In [None]:
model_2 = model_atmpt_2((train_seq.shape[1], train_seq.shape[2]))

In [26]:
model_frame = implement_model(full_dataframe, model, train_seq, train_label, test_seq, test_label)
print_df(model_frame, "implementdf")

Epoch 1/3
Epoch 2/3
Epoch 3/3


NameError: name 'Ms' is not defined

In [None]:
test_predicted = model.predict(test_seq)
test_p_df= pd.DataFrame(test_predicted)
print_df(test_p_df, "test_predict_a")
test_inverse_predicted = Ms.inverse_transform(test_predicted)
test_i_p_df= pd.DataFrame(test_inverse_predicted)
print_df(test_i_p_df, "inverse_predict_a")
new_df = pd.concat(df, test_p_df)