In [29]:
#Imports and API pull from Polygon to Parquet files

import pandas as pd
import time
from polygon import RESTClient
import parquet
from datetime import *
import pytz
from google.cloud import storage
import os
import pyarrow.parquet as pq


# Get the current working directory to access file
cwd = os.getcwd()

file_path = os.path.join(cwd, 'key', 'api_key.txt')

# Open the file and pull out value
with open(file_path, 'r') as f:
    api_key = f.read()
f.close()




In [30]:


def get_aggregate_data(
    ticker,
    from_date,
    to_date,
    adjusted=True,
    limit=5000,
):

    """
    Gets the aggregate data for the specified ticker of a top ten AI company, including from date, to date, and an adjusted flag.

    Args:
        ticker (str): The ticker symbol of the security.
        from_date (str): The start date of the data range.
        to_date (str): The end date of the data range.
        adjusted (bool): Whether to use adjusted prices.
        limit (int): The maximum number of data points to return.
    Returns:
        pd.DataFrame: A DataFrame with the aggregate data.

    Data Source:
        Polygon.io
    """

    # Create a RESTClient object
    rest_client = RESTClient(api_key)

    # Get the aggregate data
    get_aggregates = rest_client.get_aggs(
        ticker=ticker,
        multiplier=1,
        timespan="day",
        from_=from_date,
        to=to_date,
        adjusted=adjusted,
        sort="asc",
        limit=limit,
    )

    agg_data = []
    for agg in get_aggregates:
        agg_data.append(agg)

    # Create a dictionary to store the aggregate data
    data = {}
    for column in agg_data[0].__dict__.keys():
        data[column] = [getattr(agg, column) for agg in agg_data]

    # Create a DataFrame from the dictionary
    df = pd.DataFrame(data)

    # Add the ticker column
    df["ticker"] = ticker

    # Add the date column
    df["date"] = df["timestamp"].apply(lambda x: datetime.fromtimestamp(x / 1000))

    return df




def get_top_10_ai_tickers():

    top_10_ai_tickers = [
        #"NVDA",
       #"AAPL",
        #"GOOGL",
        #"AMZN",
        #"MSFT",
        "FB",
        "INTC",
        "TSM",
        "ASML",
        "CDNS",
    ]

    return top_10_ai_tickers



#Create Dataframe; column names are listed below
all_stocks_df = pd.DataFrame()
    #all_stocks_df.columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Timestamp', 'Transactions', 'Otc', 'Ticker']

top_10_ai_tickers = get_top_10_ai_tickers()   

for ticker in top_10_ai_tickers:
    df1 = get_aggregate_data(ticker, "2022-01-01", "2023-07-19")
    all_stocks_df = pd.concat([all_stocks_df, df1])

#Write to Parquet file
all_stocks_df.to_parquet(f"stocks1.parquet")

In [36]:
all_stocks_df

Unnamed: 0,open,high,low,close,volume,vwap,timestamp,transactions,otc,ticker,date
0,338.295,341.0816,337.1900,338.54,14562849.0,339.0084,1641186000000,206204,,FB,2022-01-02 23:00:00
1,339.950,343.0854,331.8711,336.53,15997974.0,336.9083,1641272400000,272884,,FB,2022-01-03 23:00:00
2,333.020,335.7600,323.8400,324.17,20564521.0,328.6425,1641358800000,324209,,FB,2022-01-04 23:00:00
3,322.820,339.1650,322.7200,332.46,27962809.0,333.3542,1641445200000,394956,,FB,2022-01-05 23:00:00
4,332.740,337.0000,328.8801,331.79,14722020.0,332.8194,1641531600000,219738,,FB,2022-01-06 23:00:00
...,...,...,...,...,...,...,...,...,...,...,...
382,236.930,240.0800,235.6100,239.51,1942173.0,238.4738,1689220800000,39958,,CDNS,2023-07-12 23:00:00
383,239.420,244.4500,239.4200,240.74,1304065.0,241.4957,1689307200000,30059,,CDNS,2023-07-13 23:00:00
384,240.270,245.2400,239.7100,244.44,1384196.0,243.3507,1689566400000,34096,,CDNS,2023-07-16 23:00:00
385,244.440,244.9600,239.0600,244.09,1732136.0,243.0181,1689652800000,36858,,CDNS,2023-07-17 23:00:00


In [38]:
#Merge two files together ()
files = ["stocks.parquet", "stocks1.parquet"]

with pq.ParquetWriter("output.parquet", schema=pq.ParquetFile(files[0]).schema_arrow) as writer:
    for file in files:
        writer.write_table(pq.read_table(file))

In [None]:
#Load to GCP bucket
path_to_private_key = '/Users/nathanmckinney/Desktop/Github/secret/keyfile.json'
client = storage.Client.from_service_account_json(json_credentials_path=path_to_private_key)

# The bucket on GCS in which to write the CSV file
bucket = client.bucket('test_bucket_nlm')
# The name assigned to the CSV file on GCS
blob = bucket.blob('output.parquet')

with open('output.parquet', 'rb') as parquet_file:
    blob.upload_from_file(parquet_file)