# Let's gather some Raw Data from the Source 

# Import the necessary libraries

In [1]:
import pandas as pd

import requests

from datetime import datetime, timedelta, timezone
import time
import ntplib

import sys
sys.path.append("../scripts/")
import path

# Getting Curent Date

In [2]:
product_id = "BTC-USD"

#.replace(hour=0, minute=0, second=0) <- Used to make the Time from yesterday at 23, to gather all Data until Yesterday INCLUDED.

#On Server with working clock
#yesterday = (datetime.now() - timedelta(days=1)).replace(hour=0, minute=0, second=0).strftime("%Y-%m-%d %H:%M:%S")

#Getting Actual Data from the Internet for Server with messy clock
try:
    client = ntplib.NTPClient()
    response = client.request('pool.ntp.org')
    yesterday = (datetime.fromtimestamp(response.tx_time, tz=timezone.utc) - timedelta(days=1)).replace(hour=23, minute=0, second=0).strftime("%Y-%m-%d %H:%M:%S")
    print(yesterday)

except:
    print("Could not sync with time server.")

2024-01-09 23:00:00


# Fetching Hourly historical Data from 2 years ago to yesterday (fetch as much as you want)

# INFO about the WebAPI EndPoint

- Historical rate data may be incomplete.
- No data is published for intervals where there are no ticks. 
- Historical rates should not be polled frequently. 
- If you need real-time information, use the trade and book endpoints along with the WebSocket feed.

In [3]:
#We can't fetch from the websocket more than a total combined 300 Candles.
#Granularity is in seconds, so for example, no more than 300 hours.
#Data is returned as a List of Lists, where each list is a Candle, with the Format [Time, Open, High, Low, Close, Volume]
#Values are Returned in DESCENDING Order, Last Record is First in the List.
#Time is in UTC format.

In [4]:
RawTempList = []

In [5]:
#Setting the Start Hour at Midnight and double checking on EndDate
startdate = (datetime.strptime(yesterday, "%Y-%m-%d %H:%M:%S") - timedelta(weeks=108)).replace(hour=0, minute=0, second=0)
enddate = datetime.strptime(yesterday, "%Y-%m-%d %H:%M:%S").replace(hour=23, minute=0, second=0)

while startdate < enddate:    
    tempfetchenddate = startdate + timedelta(hours=299) #Let's Keep a Reserve of 2 Candles since it goes from start to end included
    
    if tempfetchenddate >= enddate:
        tempfetchenddate = enddate
    
    #Fetch
    
    URL = f'https://api.exchange.coinbase.com/products/{product_id}/candles?start={startdate}&end={tempfetchenddate}&granularity=3600'
    r = requests.get(URL)
    data = r.json()
    
    RawTempList.extend(data)
    
    startdate = tempfetchenddate  

In [6]:
RawData = pd.DataFrame(RawTempList, columns = ["Date", "Open", "High", "Low", "Close", "Volume"])

#RawTempList

In [7]:
#Ensuring UTC Awareness by setting UTC Timezone

RawData["Date"] = RawData["Date"].apply(lambda x: datetime.fromtimestamp(x, tz= timezone.utc))

In [8]:
RawData = RawData.sort_values(by=["Date"])

RawData

Unnamed: 0,Date,Open,High,Low,Close,Volume
299,2021-12-14 00:00:00+00:00,46300.00,47243.18,46727.89,47022.75,870.201420
298,2021-12-14 01:00:00+00:00,46607.90,47128.52,47018.78,46889.47,559.425978
297,2021-12-14 02:00:00+00:00,46700.26,47081.39,46899.17,47052.39,466.226036
296,2021-12-14 03:00:00+00:00,46782.86,47130.37,47052.39,46977.81,328.500181
295,2021-12-14 04:00:00+00:00,46890.12,47207.92,46977.80,47017.01,519.013109
...,...,...,...,...,...,...
18000,2024-01-09 19:00:00+00:00,46589.25,46903.46,46658.71,46890.10,653.207706
17999,2024-01-09 20:00:00+00:00,46593.78,46940.00,46888.27,46651.31,1086.527743
17998,2024-01-09 21:00:00+00:00,44701.54,47901.00,46649.39,45419.45,8919.956620
17997,2024-01-09 22:00:00+00:00,45275.03,46286.39,45420.10,46278.06,1915.366592


# Dump Raw Data to Disk

In [None]:
start = (datetime.strptime(yesterday, "%Y-%m-%d %H:%M:%S") - timedelta(weeks=108)).replace(hour=0, minute=0, second=0)
finish = (datetime.strptime(yesterday, "%Y-%m-%d %H:%M:%S")).replace(hour=23, minute=0, second=0)

RawData.to_parquet(path.RAW_DATA_DIR / f'{product_id}_HourlyRawData_From{start}_To{yesterday}.parquet')