### Read Credentials 

In [11]:
import requests, json, time, getopt, sys

# User Variables
# The credentials file holds one value per line, in this order:
#   --- RDP MACHINE ID ---
#   --- LONG PASSWORD ---
#   --- GENERATED CLIENT ID ---
# Forward slashes are used so the literal contains no invalid "\c" escape
# sequence and the relative path resolves on Windows and POSIX alike.
CRED_FILE_PATH = "../creds/credFile.txt"

# The context manager closes the file even if one of the reads fails.
with open(CRED_FILE_PATH, "r") as credFile:
    USERNAME = credFile.readline().rstrip('\n')
    PASSWORD = credFile.readline().rstrip('\n')
    CLIENT_ID = credFile.readline().rstrip('\n')

# Make sure that creds are read in
#print("USERNAME="+str(USERNAME))
#print("PASSWORD="+str(PASSWORD))
#print("CLIENT_ID="+str(CLIENT_ID))

### Define Token Endpoint 

In [12]:
# Application Constants

# Pieces of the token endpoint URL, listed in the order they are joined below.
base_URL = "https://api.refinitiv.com"
category_URL = "/auth/oauth2"
RDP_version = "/v1"
endpoint_URL = "/token"

# Client settings used by the token request and the on-disk token file.
CLIENT_SECRET = ""
SCOPE = "trapi"
TOKEN_FILE = "token.txt"

# Full URL: base + category + version + endpoint.
TOKEN_ENDPOINT = "".join([base_URL, category_URL, RDP_version, endpoint_URL])

In [13]:
def _requestNewToken(refreshToken):
    """Request a token object from TOKEN_ENDPOINT.

    Args:
        refreshToken: a refresh token string, or None. When None, the
            request uses the "password" grant with USERNAME, PASSWORD and
            SCOPE; otherwise it uses the "refresh_token" grant.

    Returns:
        The response body parsed from JSON.

    Raises:
        Exception: if the endpoint answers with a status other than 200.
    """
    if refreshToken is not None:
        payload = {
            "refresh_token": refreshToken,
            "grant_type": "refresh_token",
        }
    else:
        payload = {
            "username": USERNAME,
            "password": PASSWORD,
            "grant_type": "password",
            "scope": SCOPE,
            "takeExclusiveSignOnControl": "true",
        }

    # The client id / client secret pair is sent as HTTP basic auth.
    reply = requests.post(
        TOKEN_ENDPOINT,
        headers={"Accept": "application/json"},
        data=payload,
        auth=(CLIENT_ID, CLIENT_SECRET),
    )

    if reply.status_code == 200:
        return json.loads(reply.text)

    raise Exception(
        "Failed to get access token {0} - {1}".format(reply.status_code, reply.text)
    )

In [14]:
def saveToken(tknObject):
    """Write the token object to TOKEN_FILE as indented JSON.

    Before writing, an "expiry_tm" entry is added to tknObject (the caller's
    dict is modified): the current time plus "expires_in", minus 10 seconds
    so the token is treated as expired slightly early.

    Args:
        tknObject: dict containing at least "expires_in" (anything int()
            accepts).
    """
    print("Saving the new token")
    # Append the expiry time to token
    tknObject["expiry_tm"] = time.time() + int(tknObject["expires_in"]) - 10
    # Store it in the file; the context manager flushes and closes the
    # handle deterministically instead of leaving it to garbage collection.
    with open(TOKEN_FILE, "w+") as tf:
        json.dump(tknObject, tf, indent=4)


In [15]:
def getToken():
    """Return an access token, reusing the one stored in TOKEN_FILE if possible.

    Steps:
      1. Read the token object from TOKEN_FILE; if its "expiry_tm" is still
         in the future, return its "access_token" unchanged.
      2. Otherwise request a new token using the stored "refresh_token".
      3. If anything in steps 1-2 raises (missing or unreadable file, failed
         refresh, ...), fall back to a password-grant request.
    A newly obtained token is persisted with saveToken() before returning.

    Returns:
        The access token string.

    Raises:
        Exception: propagated from _requestNewToken if the password-grant
            fallback itself fails.
    """
    try:
        print("Reading the token from: " + TOKEN_FILE)
        # Read-only access is enough; the context manager closes the handle
        # on every path, including the early return and JSON parse errors.
        with open(TOKEN_FILE, "r") as tf:
            tknObject = json.load(tf)

        # Is access token valid
        if tknObject["expiry_tm"] > time.time():
            return tknObject["access_token"]

        print("Token expired, refreshing a new one...")
        # Get a new token from refresh token
        tknObject = _requestNewToken(tknObject["refresh_token"])

    except Exception as exp:
        # Deliberate best effort: any failure above falls back to a full login.
        print("Caught exception: " + str(exp))
        print("Getting a new token using Password Grant...")
        tknObject = _requestNewToken(None)

    # Persist this token for future queries
    saveToken(tknObject)
    # The token value is intentionally not printed: cell output is saved with
    # the notebook and would leak the bearer credential.
    print("Token obtained (value not displayed)")
    return tknObject["access_token"]

### Obtain Valid Token 

In [16]:
# Obtain the access token that the requests in the following cells send
# in their Authorization header.
accessToken = getToken()
print("Have token now")

Reading the token from: token.txt
Token expired, refreshing a new one...
Saving the new token
Token is: <redacted>

### Request News Metadata 

In [17]:
# Pieces of the news metadata endpoint URL.
news_category_URL = "/data/news"
newsmeta_endpoint_URL = "/metadata"
news_param1 = "?limit=100"
NEWS_ENDPOINT = "".join((base_URL, news_category_URL, RDP_version, newsmeta_endpoint_URL))
NEWS_META_FILE = "newsMetadata.txt"

# Accumulators filled by processWithChildren(): nodes recorded with a
# 'parentId' and nodes recorded without one.
nodesWithParents = list()
nodesWithoutParents = list()

#print("NEWS_ENDPOINT=" + NEWS_ENDPOINT)

# Request the first page of news metadata.
dResp = requests.get(
    NEWS_ENDPOINT + news_param1,
    headers={"Authorization": "Bearer " + accessToken},
)

if dResp.status_code == 200:
    print("Resource access successful")
    # Parse the body; later cells read jResp.
    jResp = json.loads(dResp.text)
#    print(json.dumps(jResp, indent=2));
else:
    print("Unable to get data. Code %s, Message: %s" % (dResp.status_code, dResp.text))


Resource access successful


### Request Children and Re-Categorize With Parent Information

In [19]:
def processWithChildren(dResp, jResp, parentId):
    """Record every node of a metadata page and recurse into its children.

    Each entry of jResp['data'] is appended to nodesWithParents (after being
    tagged with 'parentId') when parentId is non-empty, or to
    nodesWithoutParents when parentId is ''. A node that was already recorded
    is skipped and its children are not requested again. For each newly
    recorded node whose 'childrenCount' is not 0, the node's children are
    requested and processed recursively with the node's id as parentId.

    A 401 on the children request triggers one token refresh (updating the
    global accessToken) and one retry; the retried response is then processed
    like a first-attempt response. Any other failure, or a failed retry,
    stops processing of the remaining nodes on this page.

    Args:
        dResp: response object for this page; nothing is done unless its
            status_code is 200.
        jResp: parsed JSON body of dResp, with a 'data' list of node dicts.
        parentId: id of the node these entries belong to, or '' for a
            top-level page.
    """
    news_param2 = "/children?limit=3000"
    global accessToken

    if dResp.status_code != 200:
        return

    for node in jResp['data']:
        nodeIsFirstSeen = True
        if parentId != '':
            node['parentId'] = parentId
            if node not in nodesWithParents:
                nodesWithParents.append(node)
            else:
                nodeIsFirstSeen = False
        else:
            # A top-level entry is new only if its id is not already known as
            # a child and the identical entry has not been recorded before.
            knownAsChild = any(nd.get('id') == node.get('id') for nd in nodesWithParents)
            if not knownAsChild and node not in nodesWithoutParents:
                nodesWithoutParents.append(node)
            else:
                nodeIsFirstSeen = False

        if not nodeIsFirstSeen:
            continue

        # keep track of the processing progress
        insertedCount = len(nodesWithParents) + len(nodesWithoutParents)
        if insertedCount % 200 == 0:
            print("***************Inserted " + str(insertedCount))

        if node.get('childrenCount') == 0:
            continue

        childrenUrl = NEWS_ENDPOINT + "/" + str(node.get('id')) + news_param2
        dChildrenResp = requests.get(childrenUrl, headers={"Authorization": "Bearer " + accessToken})

        if dChildrenResp.status_code == 401:
            # Token expired: refresh it and retry this node once.
            print("Unable to get children data. Code %s, Message: %s" % (dChildrenResp.status_code, dChildrenResp.text))
            accessToken = getToken()
            dChildrenResp = requests.get(childrenUrl, headers={"Authorization": "Bearer " + accessToken})

        if dChildrenResp.status_code != 200:
            # Error other than token expired, or the retry failed as well.
            print("Unable to get children data. Code %s, Message: %s" % (dChildrenResp.status_code, dChildrenResp.text))
            break

        # Reached for a first-attempt success and for a successful retry, so
        # the subtree is not lost when the token expires mid-crawl.
        jCResp = json.loads(dChildrenResp.text)
        processWithChildren(dChildrenResp, jCResp, node.get('id'))


In [20]:
# Walk the first metadata page; the empty parentId marks its entries as top-level.
processWithChildren(dResp, jResp, parentId='')

***************Inserted 200
***************Inserted 400
***************Inserted 600
***************Inserted 800


### Request Next on News Metadata and (optionally) Save to File

In [21]:
#DBG nf = open(NEWS_META_FILE, "w+");
#DBG nf.write(json.dumps(jResp, indent=2))

#print("Next= " + jResp["meta"]["next"])

# Follow the "next" cursor of each metadata page until there is none,
# processing every page (and, recursively, its children).
news_param2 = "?cursor="
# A missing "meta" or "next" entry ends the loop instead of raising KeyError.
while (jResp.get("meta") or {}).get("next"):   #not empty
    nextCursor = jResp["meta"]["next"]
    print("Next= " + nextCursor)
    pageUrl = NEWS_ENDPOINT + news_param2 + nextCursor
    dResp = requests.get(pageUrl, headers={"Authorization": "Bearer " + accessToken})

    if dResp.status_code == 401:
        # Token expired: refresh it and retry this page once.
        print("Unable to get data. Code %s, Message: %s" % (dResp.status_code, dResp.text))
        accessToken = getToken()
        dResp = requests.get(pageUrl, headers={"Authorization": "Bearer " + accessToken})

    if dResp.status_code != 200:
        # Error other than token expired, or the retry failed as well: stop
        # rather than parse an error body as if it were a metadata page.
        print("Unable to get data. Code %s, Message: %s" % (dResp.status_code, dResp.text))
        break

    print("Resource access successful")
    jResp = json.loads(dResp.text)
#    print(json.dumps(jResp, indent=2));
    processWithChildren(dResp, jResp, '')

#DBG    nf.write(json.dumps(jResp, indent=2))
#DBG nf.close()

print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<DONE child processing >>>>>>>>>>>>>>>>>>>>>>>>>>")

Next= eyJsaW1pdCI6MTAwLCJmb3J3YXJkIjp0cnVlLCJwYWdpbmF0aW9uSWQiOiIwMDAwMDAwMDk5In0=
Resource access successful
***************Inserted 1000
***************Inserted 1200
***************Inserted 1400
***************Inserted 1600
***************Inserted 1800
***************Inserted 2000
***************Inserted 2200
Next= eyJsaW1pdCI6MTAwLCJmb3J3YXJkIjp0cnVlLCJwYWdpbmF0aW9uSWQiOiIwMDAwMDAwMTk5In0=
Resource access successful
Next= eyJsaW1pdCI6MTAwLCJmb3J3YXJkIjp0cnVlLCJwYWdpbmF0aW9uSWQiOiIwMDAwMDAwMjk5In0=
Resource access successful
Next= eyJsaW1pdCI6MTAwLCJmb3J3YXJkIjp0cnVlLCJwYWdpbmF0aW9uSWQiOiIwMDAwMDAwMzk5In0=
Resource access successful
***************Inserted 2400
***************Inserted 2600
***************Inserted 2800
***************Inserted 3000
***************Inserted 3200
***************Inserted 3400
***************Inserted 3600
***************Inserted 3800
***************Inserted 4000
***************Inserted 4200
***************Inserted 4400
Next= eyJsaW1pdCI6MTAwLCJmb3J3YXJkIjp0

### Process into Tree Form

In [22]:
from anytree import Node, RenderTree

# keeping track of the progress prior to removing a few duplicates
print("nodesWithoutParents length=" + str(len(nodesWithoutParents)) + ", nodesWithParents length=" + str(len(nodesWithParents)))

# Give every collected node its own, initially detached, tree node.
for node in nodesWithoutParents:
    node['treenode'] = Node(node.get('id'))

for node in nodesWithParents:
    node['treenode'] = Node(node.get('id'))

# id -> tree node lookup. setdefault keeps the first entry seen for an id and
# nodesWithParents is scanned first, so an id resolves to the first match in
# nodesWithParents and only then to one in nodesWithoutParents. This replaces
# a nested scan of both lists for every node.
treenodeById = {}
for candidate in nodesWithParents + nodesWithoutParents:
    treenodeById.setdefault(candidate.get('id'), candidate['treenode'])

# Attach each child to its parent's tree node.
for node in nodesWithParents:
    parentTreenode = treenodeById.get(node.get('parentId'))
    if parentTreenode is not None:
        node['treenode'].parent = parentTreenode
    else:
        # The node keeps its existing detached tree node, so anything already
        # attached beneath it stays attached.
        print("ORPHAN ? " + node.get('id'))

# check for top-levels that are not really top level, just happened to be first
# The list is rebuilt in one pass: removing items from a list while iterating
# over it skips the element after each removal. Slice assignment keeps the
# same list object.
idsWithParents = {nd.get('id') for nd in nodesWithParents}
nodesWithoutParents[:] = [
    node for node in nodesWithoutParents
    if node.get('id') not in idsWithParents
]

for node in nodesWithoutParents:
    print(RenderTree(node.get('treenode')))

nodesWithoutParents length=22nodesWithParents length=11575
Mislabeled empty top-level removed{'id': 'NS:1ba169b7-7326-4951-93dd-3c4f7883987b-Source', 'label': 'Publications by Business Sectors', 'group': 'Source', 'readable': 'Source:1ba169b7-7326-4951-93dd-3c4f7883987b-Source', 'searchable': False, 'childrenCount': 627, 'treenode': Node('/NS:1ba169b7-7326-4951-93dd-3c4f7883987b-Source')}
Mislabeled empty top-level removed{'id': 'NS:6f68d84f-8c18-4b24-8a64-ff500028a1ed-Source', 'label': 'Publications by Geographies', 'group': 'Source', 'readable': 'Source:6f68d84f-8c18-4b24-8a64-ff500028a1ed-Source', 'searchable': False, 'childrenCount': 904, 'treenode': Node('/NS:6f68d84f-8c18-4b24-8a64-ff500028a1ed-Source')}
Mislabeled empty top-level removed{'id': 'NS:05ec4016-19f7-4370-b2b9-48bc802be1bc-Source', 'label': 'Publications by Types', 'group': 'Source', 'readable': 'Source:05ec4016-19f7-4370-b2b9-48bc802be1bc-Source', 'searchable': False, 'childrenCount': 952, 'treenode': Node('/NS:05ec4