In [1]:
import pandas as pd
import requests
import time
import numpy as np
import os

In [39]:

def get_usaspending_data(filepath='../src/data_collection/usaspending_data.csv'):
    """
    Reads the government spending CSV and gives it a datetime index.

    The first CSV column becomes the index, the 'Date' column is parsed as dates,
    and the index is then passed through ``pd.to_datetime(..., unit='ms')``.

    Side effect: a copy of the frame sorted by index is written through
    ``print_df`` under the name "usa_spend" (a .txt and a .csv file in
    print_df's default location).

    Parameters:
    filepath (str): The path to the CSV file.

    Returns:
    pd.DataFrame: The spending data in file order. Only the copy handed to
    ``print_df`` is sorted; the returned frame is not.
    """
    spending = pd.read_csv(filepath, header=0, index_col=0, parse_dates=['Date'])
    spending.index = pd.to_datetime(spending.index, unit='ms')

    # sort_index() returns a new frame, so `spending` itself keeps its row order.
    sorted_snapshot = spending.sort_index()
    print_df(sorted_snapshot, "usa_spend")

    return spending

def print_df(df, filename, location="dataframes/"):
    """
    Persists a DataFrame as both a plain-text table and a CSV file.

    Writes ``<location>/<filename>.txt`` (the output of ``df.to_string()``) and
    ``<location>/<filename>.csv`` (written by ``df.to_csv``). The target directory
    is created first when it does not exist yet.

    Parameters:
    df (pd.DataFrame): The frame to write.
    filename (str): Base name of the output files, without extension.
    location (str): Directory to write into; a trailing slash is optional.

    Returns:
    None
    """
    # os.path.join copes with `location` given with or without a trailing slash,
    # where plain string concatenation would silently glue the two names together.
    base_path = os.path.join(location, filename)

    # Create the directory up front so a fresh checkout does not fail with
    # FileNotFoundError on the first write.
    target_dir = os.path.dirname(base_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Explicit UTF-8 keeps the text dump consistent with the CSV, which pandas
    # writes as UTF-8 by default, regardless of the platform's locale encoding.
    with open(f"{base_path}.txt", "w", encoding="utf-8") as f:
        f.write(df.to_string())

    # Let pandas write the CSV itself instead of building one big string first.
    df.to_csv(f"{base_path}.csv")

# SECURITY NOTE(review): the API key below is hard-coded in the notebook. It should be
# rotated and supplied through the POLYGON_API_KEY environment variable instead; the
# literal is kept only as a fallback so existing runs keep working unchanged.
POLYGON_API_KEY = os.environ.get('POLYGON_API_KEY', 'q6YjvzTWAp_OkhFvfxwfgrtIVOpddl_V')
POLYGON_API_URL = 'https://api.polygon.io'

def get_historical_stock_data(ticker, start_date, end_date, POLYGON_API_KEY='q6YjvzTWAp_OkhFvfxwfgrtIVOpddl_V', POLYGON_API_URL='https://api.polygon.io', timeout=30):
    """
    Fetches daily aggregate bars for one ticker from Polygon.io and derives indicators.

    The ``results`` list of the response is loaded into a DataFrame whose index is
    the calendar date taken from column ``t`` (epoch milliseconds). Columns
    ``o``/``h``/``l``/``c``/``v`` are renamed to ``<column>_<ticker>`` and three
    columns are derived from the close: a 10-row and a 50-row rolling mean and the
    row-over-row percentage change. Rows containing any NaN are then dropped, which
    removes at least the first 49 rows (the 50-row mean is undefined there).

    Side effect: the full processed frame, including columns that are not returned,
    is written to disk through ``print_df(df, ticker)``.

    Parameters:
    ticker (str): The stock ticker symbol to fetch data for.
    start_date (str): The start date for fetching data in 'YYYY-MM-DD' format.
    end_date (str): The end date for fetching data in 'YYYY-MM-DD' format.
    POLYGON_API_KEY (str): API key sent as the ``apiKey`` query parameter.
    POLYGON_API_URL (str): Base URL of the API.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    pd.DataFrame: The renamed OHLCV columns plus ``<ticker>_SMA_10``,
    ``<ticker>_SMA_50`` and ``<ticker>_Returns``, indexed by date. An empty
    DataFrame when the request fails, the body is not valid JSON, or the
    response contains no results.
    """
    # SECURITY NOTE(review): the default API key is hard-coded in the signature; it is
    # left in place so existing callers keep working, but it should be rotated and
    # passed in explicitly (e.g. from an environment variable).
    url = f"{POLYGON_API_URL}/v2/aggs/ticker/{ticker}/range/1/day/{start_date}/{end_date}?adjusted=true&sort=asc&limit=5000&apiKey={POLYGON_API_KEY}"
    try:
        # requests applies no timeout by default, so a stalled connection would
        # otherwise block the notebook indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        data = response.json()
        if 'results' in data and data['results']:
            bars = pd.DataFrame(data['results'])
            # Keep the full timestamp in 't_a' and reduce 't' to a plain date for the index.
            bars['t_a'] = pd.to_datetime(bars['t'], unit="ms")
            bars['t'] = bars['t_a'].dt.date

            # Suffix the price/volume columns with the ticker so frames for different
            # stocks can later sit side by side in one table.
            bars = bars.set_index('t').rename(
                columns={'o': f'o_{ticker}', 'h': f'h_{ticker}', 'l': f'l_{ticker}', 'c': f'c_{ticker}', 'v': f'v_{ticker}'}
            )

            close = bars[f'c_{ticker}']
            bars[f'{ticker}_SMA_10'] = close.rolling(window=10).mean()
            bars[f'{ticker}_SMA_50'] = close.rolling(window=50).mean()
            bars[f'{ticker}_Returns'] = close.pct_change()

            # Drops every row with a NaN in any column, i.e. at least the warm-up
            # rows of the 50-row rolling mean.
            bars = bars.dropna()
            print_df(bars, ticker)
            return bars[[f'o_{ticker}', f'h_{ticker}', f'l_{ticker}', f'c_{ticker}', f'v_{ticker}', f'{ticker}_SMA_10', f'{ticker}_SMA_50', f'{ticker}_Returns']]
        else:
            print(f"No data available for {ticker} in the specified date range.")
            return pd.DataFrame()

    except requests.exceptions.RequestException as e:
        # Request errors embed the full URL, which contains the API key; mask it
        # before the message ends up in saved notebook output.
        message = str(e)
        if POLYGON_API_KEY:
            message = message.replace(str(POLYGON_API_KEY), "***")
        print(f"Error fetching data for {ticker}: {message}")
        return pd.DataFrame()
    except ValueError as e:
        print(f"JSON decode error for {ticker}: {e}")
        return pd.DataFrame()

def get_data_for_multiple_tickers(tickers=['NGL', 'TSLA', 'AAPL', 'V', 'NSRGY'], start_date= '2023-10-01', end_date = '2024-12-30'):
    """
    Downloads historical data for several tickers, one request after another.

    Each ticker is passed to ``get_historical_stock_data``; a progress line is
    printed and the loop pauses for one second after every request, whether or
    not the request produced data. Tickers whose result is empty are left out
    of the returned mapping.

    Parameters:
    tickers (list): A list of stock ticker symbols to fetch data for.
    start_date (str): The start date for fetching data in 'YYYY-MM-DD' format.
    end_date (str): The end date for fetching data in 'YYYY-MM-DD' format.

    Returns:
    dict: Ticker symbol -> DataFrame, for every ticker that returned rows.
    """
    frames_by_ticker = {}

    for symbol in tickers:
        frame = get_historical_stock_data(symbol, start_date, end_date)
        print(f"Fetched data for {symbol}")
        time.sleep(1)

        # Skip tickers that came back without any rows.
        if frame.empty:
            continue
        frames_by_ticker[symbol] = frame

    return frames_by_ticker

def get_data_for_all_stocks(directory="dataframes", spending_filename="usa_spend.csv"):
    """
    Loads the cached per-ticker CSVs and the spending CSV from one directory.

    Every ``*.csv`` file in ``directory`` other than ``spending_filename`` is read
    with column 't' parsed as dates and stored under its file name without the
    ``.csv`` extension. ``spending_filename`` is read with column 'Date' parsed as
    dates. In both cases the first CSV column becomes the index. Files are visited
    in sorted order so the resulting dict order is the same on every run.

    Parameters:
    directory (str): Folder containing the CSV files.
    spending_filename (str): Name of the spending CSV inside ``directory``.

    Returns:
    tuple: ``(stock_frames, spending_frame)`` where ``stock_frames`` is a dict of
    name -> DataFrame and ``spending_frame`` is the spending DataFrame.

    Raises:
    FileNotFoundError: If ``directory`` does not exist or contains no
    ``spending_filename``.
    """
    stock_frames = {}
    spending_frame = None

    # os.listdir returns entries in arbitrary order; sorting keeps runs reproducible.
    for entry in sorted(os.listdir(directory)):
        if not entry.endswith(".csv"):
            continue
        path = os.path.join(directory, entry)
        if entry == spending_filename:
            spending_frame = pd.read_csv(path, parse_dates=["Date"], header=0, index_col=0)
        else:
            name = entry[:-len(".csv")]
            stock_frames[name] = pd.read_csv(path, parse_dates=["t"], header=0, index_col=0)

    # Fail with a clear message instead of an UnboundLocalError at the return statement.
    if spending_frame is None:
        raise FileNotFoundError(
            f"{spending_filename} not found in {directory!r}; the spending data must be generated first."
        )

    return stock_frames, spending_frame

def merge_dataframes(starting_df, dict_stock_dfs):
    """
    Joins the starting frame with every stock frame on their indexes.

    The frames are combined one after another with an inner join on the index, so
    only index values present in every frame survive. Column names that occur in
    more than one input frame are made unique by appending ``_<key>`` (the dict key
    of the stock frame) to that column in each stock frame; columns of
    ``starting_df`` and columns that are already unique keep their names. Without
    this, repeated merges fall back to pandas' ``_x``/``_y`` suffixes, which
    collide once enough frames share a column name.

    Parameters:
    starting_df (pd.DataFrame): The frame every stock frame is merged onto.
    dict_stock_dfs (dict): Mapping of ticker -> DataFrame to merge in, in dict order.

    Returns:
    pd.DataFrame: The merged frame with its index named "Date". The input frames
    are not modified.
    """
    # Find the column names that appear in more than one frame; only those need
    # a ticker suffix to stay unique in the merged result.
    seen_columns = set(starting_df.columns)
    shared_columns = set()
    for stock_frame in dict_stock_dfs.values():
        frame_columns = set(stock_frame.columns)
        shared_columns |= seen_columns & frame_columns
        seen_columns |= frame_columns

    merged_data = starting_df
    for ticker, stock_frame in dict_stock_dfs.items():
        renames = {
            column: f"{column}_{ticker}"
            for column in stock_frame.columns
            if column in shared_columns
        }
        # The indexes hold the dates, so the join key is the index on both sides.
        merged_data = pd.merge(
            merged_data,
            stock_frame.rename(columns=renames),
            left_index=True,
            right_index=True,
        )

    # Name the index "Date" so later code can refer to it by that name. rename_axis
    # returns a new frame, which keeps the caller's starting_df untouched even when
    # there was nothing to merge.
    return merged_data.rename_axis("Date")

In [23]:
# Data-collection calls, currently commented out. Each get_data_for_multiple_tickers
# call fetches one batch of tickers and (through get_historical_stock_data) writes
# one .txt/.csv pair per ticker via print_df. The 60-second pauses between batches
# are presumably there to stay under an API rate limit — TODO confirm.
# NOTE(review): "FB" may no longer resolve as a ticker ("META" is already in an
# earlier batch) — verify before re-enabling.
# time.sleep(60)
# get_data_for_multiple_tickers()
# time.sleep(60)
# get_data_for_multiple_tickers(["MSFT", "GOOGL", "AMZN", "META", "NFLX"])
# time.sleep(60)
# get_data_for_multiple_tickers(["NVDA", "BRK.B", "JNJ", "FB", "WMT"])
# time.sleep(60)
# get_data_for_multiple_tickers(["PG", "JPM", "MA", "UNH", "HD"])
# time.sleep(60)
# get_data_for_multiple_tickers(["BAC", "XOM", "PFE", "KO", "DIS"])
# time.sleep(60)
# get_data_for_multiple_tickers(["CSCO", "PEP", "ADBE", "CMCSA", "INTC"])
# time.sleep(60)
# get_data_for_multiple_tickers(["T", "VZ", "ABT", "CRM", "LLY"])
# time.sleep(60)
# get_data_for_multiple_tickers(["MRK","TMO", "COST", "ACN", "WFC"])
# time.sleep(60)
# get_data_for_multiple_tickers(["ORCL", "MCD", "AMD", "BA"])
# usa_spending_data = get_usaspending_data()

In [37]:
# Load the cached CSV files from dataframes/: usa_spend.csv becomes usa_spending_data,
# and every other .csv becomes one entry of dict_stock_dfs, keyed by its file name
# without the ".csv" extension (i.e. all cached tickers, not only the default five).
# The cached files were presumably produced with the default date range of
# get_data_for_multiple_tickers ('2023-10-01' to '2024-12-30') — confirm.
dict_stock_dfs, usa_spending_data = get_data_for_all_stocks()

In [40]:
# Merge the spending frame with every stock frame on their (date) indexes, then
# write the combined frame as full_df.txt and full_df.csv under created_dataframes/.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the last
# recorded run did not finish — re-run from a fresh kernel to confirm it completes.
full_dataframe = merge_dataframes(usa_spending_data, dict_stock_dfs)

print_df(full_dataframe, "full_df", location="created_dataframes/" )

KeyboardInterrupt: 