<img src="./files/refinitiv.png" width="20%" style="vertical-align: top;">\

---- 
This source code is provided under the Apache 2.0 license\
and is provided AS IS with no warranty or guarantee of fit for purpose.\
See the project's LICENSE.md for details.\
Copyright (C) 2022 Refinitiv. All rights reserved.
---- 
# Downloading a Large Tick History File - Divide and Conquer
### Refinitiv Data Platform (RDP)
### Client File Store (CFS)
### Tick History (TH)
### Python Notebook
----

### Import Required Modules

In [6]:
import requests, json, time, getopt, sys
import pandas as pd
from ipynb.fs.full.AuthTokenHandling import getToken

### Authenticate - use getToken from AuthTokenHandling

In [7]:
# Obtain an access token through the getToken helper imported from AuthTokenHandling.
accessToken = getToken()
print("Have token now")

Reading the token from: token.txt
Have token now


### Set Application Constants

In [8]:
# Application constants shared by the cells below.

# Host and API version segments used to assemble request URLs.
RDP_BASE_URL = "https://api.refinitiv.com"
RDP_CFS_VERSION = "/v1"
RDP_AUTH_VERSION = "/v1"

# Token-related settings. They are not referenced by the cells visible here;
# presumably they belong to the authentication flow -- verify before removing.
CATEGORY_URL = "/auth/oauth2"
ENDPOINT_URL = "/token"
SCOPE = "trapi"
TOKEN_FILE = "token.txt"
CLIENT_SECRET = ""  # empty string in this notebook

### Request File Sets - Define a Helper Function

In [4]:
#FILESET_ENDPOINT = RDP_BASE_URL+'/file-store'+RDP_CFS_VERSION + '/file-sets?bucket='+ RDP_CFS_BUCKET
FILESET_ID = ''

def requestFileSets(token, withNext, skipToken, bucket, attributes):
    """Query the file-sets endpoint for one bucket and return the parsed JSON.

    Parameters:
        token      -- bearer token placed in the Authorization header.
        withNext   -- when truthy, '&skipToken=' + skipToken is appended to the URL.
        skipToken  -- value appended after '&skipToken='; only read when withNext is truthy.
        bucket     -- bucket name used as the value of the 'bucket' query parameter.
        attributes -- extra query-string text appended verbatim to the URL
                      (it must carry its own leading '&'); skipped when falsy.

    Returns:
        The decoded JSON body when the final response status is 200,
        otherwise an empty string.

    Side effects:
        Stores the request URL in the module-level FILESET_ENDPOINT and prints
        a progress message and the response object.
    """
    global FILESET_ENDPOINT
    print("Obtaining FileSets in "+bucket+" Bucket...")

    # Assemble the URL piece by piece; optional parts are appended as raw text.
    FILESET_ENDPOINT = ''.join(
        [RDP_BASE_URL, '/file-store', RDP_CFS_VERSION, '/file-sets?bucket=', bucket]
    )
    if attributes:
        FILESET_ENDPOINT += attributes
    if withNext:
        FILESET_ENDPOINT += '&skipToken=' + skipToken

    request_headers = {
        'Content-Type': "application/json",
        'Authorization': "Bearer " + token,
        'cache-control': "no-cache",
    }

    def _send():
        # Same GET request for the first attempt and for the retry.
        return requests.request("GET", FILESET_ENDPOINT, data="",
                                headers=request_headers, params={})

    response = _send()
    if response.status_code == 401:
        # 401 is treated as an expired token: fetch a new one and retry once.
        refreshed_token = getToken()
        request_headers['Authorization'] = "Bearer " + refreshed_token
        response = _send()

    print('Raw response=')
    print(response)

    if response.status_code != 200:
        return ''
    return json.loads(response.text)

#jsonFullResp = requestFileSets(accessToken, False, '',RDP_CFS_BUCKET,'');

### Request FileSets Containing Large Tick History File
The file we are going to use as an example is FRA-2020-02-25-NORMALIZEDMP-Data-1-of-1

We request Tick History File Sets for the venue FRA, limited to the dates 2020.02.25 to 2020.02.29; the first FileSet should contain the file ID of the file that we require.

In [9]:
RDP_TH_BUCKET = "TICKHISTORY_VBD_UNLIMITED"
CFS_VBD_VENUE = "FRA"
CFS_VBD_VIEW = "normalised"

# Extra query text: filter by venue/view attributes and by content date range.
jsonFullResp = requestFileSets(
    accessToken, False, '', RDP_TH_BUCKET,
    '&attributes=venue:' + CFS_VBD_VENUE + ',view:' + CFS_VBD_VIEW +
    '&contentFrom=2020-02-25T00:00:00Z&contentTo=2020-02-29T00:00:00Z')

# requestFileSets returns '' when the request did not end with HTTP 200.
# Stop here with a clear message instead of failing on the subscript below.
if not isinstance(jsonFullResp, dict) or 'value' not in jsonFullResp:
    raise RuntimeError("File set request for bucket " + RDP_TH_BUCKET +
                       " returned no 'value' list; see the response printed above")

#print('Parsed json response=');
#print(json.dumps(jsonFullResp, indent=2));
print('Same response, tabular view')
df = pd.json_normalize(jsonFullResp['value'])
df

Obtaining FileSets in TICKHISTORY_VBD_UNLIMITED Bucket...
Reading the token from: token.txt
Token expired, refreshing a new one...
Saving the new token
Token is: eyJ0eXAiOiJhdCtqd3QiLCJhbGciOiJSUzI1NiIsImtpZCI6ImRMdFd2Q0tCSC1NclVyWm9YMXFod2pZQ2t1eDV0V2ZSS2o4ME9vcjdUY28ifQ.eyJkYXRhIjoie1wiY2lwaGVydGV4dFwiOlwiNnNsMkNsV0dJR214RVVvUFpLRnYxUlVRSUcyamtCUlFGcVo4RUFCb2JfMFc1SjdmZDFCNk9Wd19pUTlhb1ZjYVJ5SENJcnpHcTAxWlZWaEhzbzRIWDZpbjlrRG9RQ1lxdm12VmluYUo5UHVLQVJoS0hNbU0tSnltM1RWTGkwSDh5eGhMTmRwN2hzN2xDa3NxUmV2bUppaVNqdndjLTZyTVd0Uy1zQUxDSXNMNHJaaVFvOE9JajBJQVRYYmI0LVVCX3FYNGdXcWpLeXpUSm15b0pfeDFkWF95QUlSblJ3bmFPendDSzlXVTluSUpqc0hhbnJSSkdIZFhHRmxkalloNDZDemlmMlhUVXFVVk0zQWVyWnNvTEJGeXRjZnJlOGw4SlpIcU9iMk8wWW9UTmI0cmNEUGNMMEJaM29oblZNZGwyQUViNjlfMXJKbFItcjA4NHNvMy04S0g1cl9qVXh4Qy0xeDNWa3BSZFRxVHBVV2VSNTRMbVdQdnh2Yk5jMWs0b2paa2N4TWxYdk9uQWhDb0VieTloSy1FSEZxLTdsYWFSaUtqR3BCclhvZTlGUE5kbnYyeFQtbFQySGNKVF9RVFZrVkREZV9vWkdYel9lRFhvZ3QyRDdpU1gtRWtTYzJiZTNORTFmQlF3cHh0bnRaVG1DczE1OTNHS2RFV1JJdWlPLXU2NHl

Unnamed: 0,id,name,bucketName,packageId,attributes,files,numFiles,contentFrom,contentTo,availableFrom,availableTo,status,created,modified
0,4962-f3ab-173a79f7-badb-82c78ec95267,FRA_normalised_2020-02-25,TICKHISTORY_VBD_UNLIMITED,417a-cb85-f8d1a8c3-93f4-6dee980c2a20,"[{'name': 'message_id', 'value': '410e8a2a-41a...","[4b8a-b6f9-d7259333-8b3f-635a3c3d030a, 4dd1-36...",2,2020-02-25T00:00:00Z,2020-02-26T00:00:00Z,2022-07-08T15:03:40Z,2038-01-19T03:14:07Z,READY,2022-07-08T15:03:42Z,2022-07-08T15:03:42Z
1,4cd8-faa5-b2efdb34-b95c-fea2284b9ee0,FRA_normalised_2020-02-26,TICKHISTORY_VBD_UNLIMITED,417a-cb85-f8d1a8c3-93f4-6dee980c2a20,"[{'name': 'size', 'value': '42720000731'}, {'n...","[4388-5520-fe0f2669-87cd-9dc91b4b5dff, 4ee0-c0...",2,2020-02-26T00:00:00Z,2020-02-27T00:00:00Z,2022-07-08T15:28:46Z,2038-01-19T03:14:07Z,READY,2022-07-08T15:28:48Z,2022-07-08T15:28:48Z
2,4dbc-0c8a-83b6c992-9f32-824aa0a777f0,FRA_normalised_2020-02-27,TICKHISTORY_VBD_UNLIMITED,417a-cb85-f8d1a8c3-93f4-6dee980c2a20,"[{'name': 'view', 'value': 'normalised'}, {'na...","[4760-42d6-3138bd2c-adc3-3cb0f37eb308, 48a9-6a...",2,2020-02-27T00:00:00Z,2020-02-28T00:00:00Z,2022-07-08T15:27:35Z,2038-01-19T03:14:07Z,READY,2022-07-08T15:27:36Z,2022-07-08T15:27:37Z


### Select File Id

We are going to select the first File ID in the File Set, to use 2020.02.25 FRA normalized as an example of a large file.  Another large file can be handled analogously.

In [13]:
# Take the first entry of the 'files' list in the first row of df.
FILE_ID = df['files'].iloc[0][0]
#FILE_ID = '4b8a-b6f9-d7259333-8b3f-635a3c3d030a'
print('FILE_ID selected is: ' + FILE_ID)
#FILE_NAME = 'FRA-2020-02-25-NORMALIZEDMP-Data-1-of-1.csv.gz'
#print('FILE_NAME selected is: ' + FILE_NAME)

FILE_ID selected is: 4b8a-b6f9-d7259333-8b3f-635a3c3d030a


### Get TH File Details by File ID - Define a Helper Function

In [14]:
FILES_ENDPOINT_START = RDP_BASE_URL+'/file-store'+RDP_CFS_VERSION + '/files/'

def requestFileDetails(token, fileId):
    """Fetch the details of one file by its id and return the parsed JSON.

    Parameters:
        token  -- bearer token placed in the Authorization header.
        fileId -- file id appended to FILES_ENDPOINT_START to form the URL.

    Returns:
        The decoded JSON body when the final response status is 200,
        otherwise an empty string.

    Side effects:
        Prints progress messages and the response object.
    """
    print("Obtaining File details for File= "+ fileId + " ...")
    print("(If result is Response=400, make sure that fileId is set with a valid value...)")

    details_url = FILES_ENDPOINT_START + fileId
    request_headers = {
        'Content-Type': "application/json",
        'Authorization': "Bearer " + token,
        'cache-control': "no-cache",
    }

    def _send():
        # Same GET request for the first attempt and for the retry.
        return requests.request("GET", details_url, data="",
                                headers=request_headers, params={})

    response = _send()
    if response.status_code == 401:
        # 401 is treated as an expired token: fetch a new one and retry once.
        refreshed_token = getToken()
        request_headers['Authorization'] = "Bearer " + refreshed_token
        response = _send()

    print('Raw response=')
    print(response)

    if response.status_code != 200:
        return ''
    return json.loads(response.text)


### Request File Details of a Large File
* We verify the size of the file to be retrieved
* Store its file name and file size

In [15]:
jsonFullResp = requestFileDetails(accessToken, FILE_ID)

# requestFileDetails returns '' when the request did not end with HTTP 200.
# Stop here with a clear message instead of failing on the key lookups below.
if not isinstance(jsonFullResp, dict):
    raise RuntimeError('File details request failed for FILE_ID ' + str(FILE_ID) +
                       '; see the response printed above')

print('Parsed json response=')
print(json.dumps(jsonFullResp, indent=2))

# Keep the name and size; both are needed later for the chunked download.
FILE_NAME = jsonFullResp['filename']
print('FILE_NAME selected is: ' + FILE_NAME)
# Coerced to int because the size is later used as a range() bound.
FILE_SIZE_BYTES = int(jsonFullResp['fileSizeInBytes'])
print('FILE_SIZE_BYTES selected is: ' + str(FILE_SIZE_BYTES))

Obtaining File details for File= 4b8a-b6f9-d7259333-8b3f-635a3c3d030a ...
(If result is Response=400, make sure that fileId is set with a valid value...)
Reading the token from: token.txt
Raw response=
<Response [200]>
Parsed json response=
{
  "id": "4b8a-b6f9-d7259333-8b3f-635a3c3d030a",
  "filename": "FRA-2020-02-25-NORMALIZEDMP-Data-1-of-1.csv.gz",
  "filesetId": "4962-f3ab-173a79f7-badb-82c78ec95267",
  "fileType": "File",
  "description": "Merged data file",
  "storageLocation": {
    "url": "https://s3-us-east-1.amazonaws.com/a205143-use1-prod-results-vbd/normalised/FRA/2020-02-25/data/merged/FRA-2020-02-25-NORMALIZEDMP-Data-1-of-1.csv.gz",
    "rolearn": "arn:aws:iam::259431915815:role/a205143-prod-push-mechanism-EdsCfsS3Access",
    "@type": "s3"
  },
  "created": "2022-07-08T15:03:42Z",
  "modified": "2022-07-08T15:03:42Z",
  "href": "https://api.refinitiv.com/file-store/v1/files/4b8a-b6f9-d7259333-8b3f-635a3c3d030a/stream",
  "fileSizeInBytes": 41152756058,
  "md5": "6394dc5

### Get File Location (Step 1 of 2)

In [16]:
import shutil

FILES_STREAM_ENDPOINT_START = RDP_BASE_URL+'/file-store'+RDP_CFS_VERSION + '/files/'
DIRECT_URL = ''

def requestFileLocation(token, fileId):
    """Ask the file-store stream endpoint for the direct URL of a file.

    The request is sent with 'doNotRedirect=true' and redirects disabled; on
    HTTP 200 the JSON body's 'url' field is returned.

    Parameters:
        token  -- bearer token placed in the Authorization header.
        fileId -- file id inserted into the stream endpoint URL.

    Returns:
        The 'url' value from the JSON response when the final status is 200,
        otherwise an empty string.

    Side effects:
        Prints the endpoint, the response code and, on success, the direct URL.
    """
    FILES_STREAM_ENDPOINT = FILES_STREAM_ENDPOINT_START + fileId + '/stream?doNotRedirect=true'

    print("Obtaining File ... " + FILES_STREAM_ENDPOINT)

    headers = {
        'Authorization': 'Bearer ' + token,
        'cache-control': "no-cache",
        'Accept': '*/*'
    }

    response = requests.request("GET", FILES_STREAM_ENDPOINT, headers=headers,
                                stream=False, allow_redirects=False)

    if response.status_code == 401:
        # 401 is treated as an expired token: fetch a new one and retry once.
        headers['Authorization'] = "Bearer " + getToken()
        response = requests.request("GET", FILES_STREAM_ENDPOINT, headers=headers,
                                    stream=False, allow_redirects=False)

    print('Response code=' + str(response.status_code))

    # Use a local result that starts empty. Assigning to the name DIRECT_URL
    # inside this function would create a local variable that is unset on any
    # non-200 response, making the return statement raise UnboundLocalError.
    directUrl = ''
    if response.status_code == 200:
        jsonFullResp = json.loads(response.text)
        directUrl = jsonFullResp['url']
        print('File Direct URL is: ' + str(directUrl))

    return directUrl


DIRECT_URL = requestFileLocation(accessToken, FILE_ID)

Obtaining File ... https://api.refinitiv.com/file-store/v1/files/4b8a-b6f9-d7259333-8b3f-635a3c3d030a/stream?doNotRedirect=true
Reading the token from: token.txt
Response code=200
File Direct URL is: https://s3.amazonaws.com/a205143-use1-prod-results-vbd/normalised/FRA/2020-02-25/data/merged/FRA-2020-02-25-NORMALIZEDMP-Data-1-of-1.csv.gz?x-request-Id=37c1fcd7-136f-4420-9d1c-35225e80b984&x-package-id=417a-cb85-f8d1a8c3-93f4-6dee980c2a20&x-client-app-id=GE-A-01103867-3-603&x-file-name=FRA-2020-02-25-NORMALIZEDMP-Data-1-of-1.csv.gz&x-fileset-id=4962-f3ab-173a79f7-badb-82c78ec95267&x-bucket-name=TICKHISTORY_VBD_UNLIMITED&x-uuid=GENTC-25929&x-file-Id=4b8a-b6f9-d7259333-8b3f-635a3c3d030a&x-fileset-name=FRA_normalised_2020-02-25&x-event-external-name=cfs-file-download&X-Amz-Security-Token=FwoGZXIvYXdzEL%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaDJcRjtS5U0couMPMSiKwAfbg3kWXHB9c42VPcFyYP59FshvHH3jSbLYBmHaOjgkQRR4CMX1TQ2cO2Tj3NtLB%2FXHlaEDaKBRoaO21shB15DmHT45fhnTy8KlYO3akvRLxMQEsyZMnp%2BplKS4dgjH5qCZpq

### Download File From File Location in Chunks (Step 2 of 2)

In [11]:
from urllib.parse import urlparse, parse_qs

def requestDirectFileDownload(token, fileURL, fileName, fileSizeBytes, chunkSize):
    """Download a file from a direct URL in byte ranges and save it locally.

    The file is requested piece by piece with HTTP Range headers and the pieces
    are written, in order, to 'downloaded_' + fileName in the current working
    directory. The download stops at the first range that is not answered with
    HTTP 206, so the output file never contains a gap.

    Parameters:
        token         -- not used by this function; kept so existing calls keep working.
        fileURL       -- direct URL of the file, including its query parameters.
        fileName      -- name of the file; the output file is 'downloaded_' + fileName.
        fileSizeBytes -- total size of the file in bytes.
        chunkSize     -- number of bytes requested per Range request; must be positive.

    Returns:
        None.

    Raises:
        ValueError -- if chunkSize is not positive.
    """
    print("Obtaining File from URL... " + fileURL + '... to file name=' + fileName+ ', it\'s size is '+ str(fileSizeBytes))

    if chunkSize <= 0:
        raise ValueError('chunkSize must be a positive number of bytes')

    filename = 'downloaded_' + fileName

    if fileSizeBytes <= 0:
        # No byte range to request; avoid reporting a file that was never written.
        print('Nothing to download: file size is ' + str(fileSizeBytes))
        return

    headers_ = {
        'Accept': '*/*',
    }

    # Split the URL into its base and its query parameters for requests.get.
    url_obj = urlparse(fileURL)
    parsed_params = parse_qs(url_obj.query)
    parsed_url = url_obj._replace(query='').geturl()

    completed = True

    for i in range(0, fileSizeBytes, chunkSize):
        # Range bounds are inclusive; the last range is clipped to the file size.
        rangeEnd = min(i + chunkSize, fileSizeBytes) - 1
        rangeExpression = 'bytes='+str(i)+'-'+str(rangeEnd)
        print('Processing rangeExpression='+rangeExpression)
        headers_['Range'] = rangeExpression

        # ignore verifying the SSL certificate as example only 'verify=False'
        # otherwise add security certificate
        response = requests.get(parsed_url, headers=headers_, params=parsed_params, stream=True) #, verify=False)
        print('Response code=' + str(response.status_code)+ ' text='+ str(response.reason))

        if response.status_code == 401:   # error when token expired
            # NOTE(review): the bearer header is added to the direct-URL request and
            # stays set for later ranges; confirm the storage host accepts it.
            response.close()
            accessToken = getToken()      # token refresh on token expired
            headers_['Authorization'] = "Bearer " + accessToken
            response = requests.get(parsed_url, params=parsed_params, headers=headers_, stream=True) #, verify=False)
            print('Response code=' + str(response.status_code)+ ' text='+ str(response.reason))

        try:
            if response.status_code != 206:
                # Skipping a range would leave a hole in the output file, so stop.
                print('Download stopped at range ' + rangeExpression +
                      ': expected HTTP 206 but got ' + str(response.status_code))
                completed = False
                break

            print('Processing into file '+filename+' ...')
            # The first range starts a fresh file so a re-run does not append to
            # an earlier download; later ranges are appended in order.
            with open(filename, 'wb' if i == 0 else 'ab') as fd:
                shutil.copyfileobj(response.raw, fd)
        finally:
            # Release the connection of every response, not only the last one.
            response.close()

    if completed:
        print('Look for gzipped file named: ' + filename + ' in current directory')
    else:
        print('Download incomplete; ' + filename + ' (if present) holds only the ranges received so far')

    return

# Ranges of 3000000000 bytes are requested per call to the direct URL.
requestDirectFileDownload(accessToken, DIRECT_URL, FILE_NAME, FILE_SIZE_BYTES, 3000000000)

Obtaining File from URL... https://s3.amazonaws.com/a205143-use1-prod-results-vbd/normalised/FRA/2020-02-25/data/merged/FRA-2020-02-25-NORMALIZEDMP-Data-1-of-1.csv.gz?x-request-Id=87ad7187-5b1a-4ef7-ac23-652c79624eeb&x-package-id=417a-cb85-f8d1a8c3-93f4-6dee980c2a20&x-client-app-id=GE-A-01103867-3-603&x-file-name=FRA-2020-02-25-NORMALIZEDMP-Data-1-of-1.csv.gz&x-fileset-id=4962-f3ab-173a79f7-badb-82c78ec95267&x-bucket-name=TICKHISTORY_VBD_UNLIMITED&x-uuid=GENTC-25929&x-file-Id=4b8a-b6f9-d7259333-8b3f-635a3c3d030a&x-fileset-name=FRA_normalised_2020-02-25&x-event-external-name=cfs-file-download&X-Amz-Security-Token=FwoGZXIvYXdzEKb%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaDHKmWgn5Hffw9xVzCCKwAaiu1Egc4a7YknQviOxMn2y6taMNwtNWrXp5CAr25KJLSWvR2Jm4J%2B2h4p3QUaE%2Bh7y%2F1FFsc7mVMETiGdWnBYEuhFFWHM2yTk8XTC9rn5Wv3gqrULiQ8gZe07jLsuFlcOVW%2BPgN3PghkaX8I7Cfs9ovbsniwlWS9eRez0BsAcWBGbazuqyzwPpzksUhmkc%2BRdLaLauMgSOk47IxGAmIHTQiRxjarNpiZWa2%2BHzJ8UW8KJ3MhpcGMi33iCdKH4c%2BTF15tw5lJiDvcfDuxpuLla3VklRIjOZxhscEcUoh4b