# Let's gather some Raw Data from the Source 

# Import the necessary libraries

In [1]:
import pandas as pd

import requests

from datetime import datetime, timedelta, timezone
import time
import ntplib

import sys
sys.path.append("../scripts/")
import path

# Getting Curent Date

In [2]:
product_id = "BTC-USD"

#.replace(hour=0, minute=0, second=0) <- Used to make the Time from yesterday at 23, to gather all Data until Yesterday INCLUDED.

#On Server with working clock
#yesterday = (datetime.now() - timedelta(days=1)).replace(hour=0, minute=0, second=0).strftime("%Y-%m-%d %H:%M:%S")

#Getting Actual Data from the Internet for Server with messy clock
try:
    client = ntplib.NTPClient()
    response = client.request('pool.ntp.org')
    yesterday = (datetime.fromtimestamp(response.tx_time, tz=timezone.utc) - timedelta(days=1)).replace(hour=23, minute=0, second=0).strftime("%Y-%m-%d %H:%M:%S")
    print(yesterday)

except:
    print("Could not sync with time server.")

2024-01-10 23:00:00


# Fetching Hourly historical Data from 2 years ago to yesterday (fetch as much as you want)

# INFO about the WebAPI EndPoint

- Historical rate data may be incomplete.
- No data is published for intervals where there are no ticks. 
- Historical rates should not be polled frequently. 
- If you need real-time information, use the trade and book endpoints along with the WebSocket feed.

In [3]:
#We can't fetch from the websocket more than a total combined 300 Candles.
#Granularity is in seconds, so for example, no more than 300 hours.
#Data is returned as a List of Lists, where each list is a Candle, with the Format [Time, Open, High, Low, Close, Volume]
#Values are Returned in DESCENDING Order, Last Record is First in the List.
#Time is in UTC format.

In [4]:
RawTempList = []

In [5]:
#Setting the Start Hour at Midnight and double checking on EndDate
startdate = (datetime.strptime(yesterday, "%Y-%m-%d %H:%M:%S") - timedelta(weeks=108)).replace(hour=0, minute=0, second=0)
enddate = datetime.strptime(yesterday, "%Y-%m-%d %H:%M:%S").replace(hour=23, minute=0, second=0)

while startdate < enddate:    
    tempfetchenddate = startdate + timedelta(hours=299) #Let's Keep a Reserve of 2 Candles since it goes from start to end included
    
    if tempfetchenddate >= enddate:
        tempfetchenddate = enddate
    
    #Fetch
    
    URL = f'https://api.exchange.coinbase.com/products/{product_id}/candles?start={startdate}&end={tempfetchenddate}&granularity=3600'
    r = requests.get(URL)
    data = r.json()
    
    RawTempList.extend(data)
    
    startdate = tempfetchenddate  

In [6]:
RawData = pd.DataFrame(RawTempList, columns = ["Date", "Open", "High", "Low", "Close", "Volume"])

#RawTempList

In [7]:
#Ensuring UTC Awareness by setting UTC Timezone

RawData["Date"] = RawData["Date"].apply(lambda x: datetime.fromtimestamp(x, tz= timezone.utc))

In [8]:
RawData = RawData.sort_values(by=["Date"])

RawData

Unnamed: 0,Date,Open,High,Low,Close,Volume
299,2021-12-15 00:00:00+00:00,48080.23,48795.00,48359.23,48150.08,843.040672
298,2021-12-15 01:00:00+00:00,47973.15,48189.99,48150.08,48087.19,385.411999
297,2021-12-15 02:00:00+00:00,47847.34,48110.89,48087.18,48010.17,343.643550
296,2021-12-15 03:00:00+00:00,47898.93,48312.39,48009.53,48287.73,408.427606
295,2021-12-15 04:00:00+00:00,48096.95,48449.35,48286.34,48259.78,320.162218
...,...,...,...,...,...,...
18000,2024-01-10 19:00:00+00:00,46096.54,46755.73,46554.77,46518.14,1766.961272
17999,2024-01-10 20:00:00+00:00,44813.00,46538.97,46514.45,46144.39,5815.951710
17998,2024-01-10 21:00:00+00:00,45129.54,46343.62,46144.91,45946.36,4362.896827
17997,2024-01-10 22:00:00+00:00,45765.68,46976.99,45954.81,46928.33,2569.365343


# Dump Raw Data to Disk

In [9]:
start = (datetime.strptime(yesterday, "%Y-%m-%d %H:%M:%S") - timedelta(weeks=108)).replace(hour=0, minute=0, second=0)
finish = (datetime.strptime(yesterday, "%Y-%m-%d %H:%M:%S")).replace(hour=23, minute=0, second=0)

RawData.to_parquet(path.RAW_DATA_DIR / f'{product_id}_HourlyRawData_From{start}_To{yesterday}.parquet')