In [7]:
import json
import pandas as pd
from urllib.parse import urlparse

In [9]:
# Seed list of root entries. Each entry is a {"data": ..., "children": ...}
# mapping; this first one is hard-coded, and the cell appends a second root
# built from urls.jsonl further down.
bulk_data = [
    dict(
        data=dict(
            collection=85,
            url="exo.mast.stsci.edu/docs/",
            excluded=False,
            title="Root URL is this one",
        ),
        children=[],
    ),
]

def add_to_root_node(path, title, root_node, collection=85):
    """Insert the page at ``path`` into the nested node tree under ``root_node``.

    The path is split on '/' and empty segments (leading, trailing or doubled
    slashes) are ignored. Every segment except the last is matched against the
    ``data['url']`` of the current node's children: the first match is
    descended into, otherwise a placeholder node titled after the segment is
    created. The last segment is always appended as a new leaf carrying
    ``title``.

    Args:
        path: URL path such as '/docs/swagger/index.html'.
        title: Title stored on the leaf node.
        root_node: Dict to attach to; mutated in place. A 'children' list is
            created on it (and on any node descended into) if missing.
        collection: Value stored under ``data['collection']`` of every node
            created here. Defaults to 85, the value that used to be hard-coded.

    Returns:
        None. A path without any non-empty segment leaves ``root_node``
        untouched.
    """
    segments = [segment for segment in path.split('/') if segment]
    if not segments:
        return

    def new_node(url, node_title):
        # Every node gets its own 'children' list so that a later, deeper URL
        # can be attached beneath it without a KeyError.
        return {
            "data": {
                "collection": collection,
                "url": url,
                "excluded": False,
                "title": node_title,
            },
            "children": [],
        }

    node = root_node
    for segment in segments[:-1]:
        children = node.setdefault('children', [])
        for child in children:
            if child['data']['url'] == segment:
                node = child
                break
        else:
            # No existing child for this segment: create a placeholder.
            node = new_node(segment, segment)
            children.append(node)

    # Store the bare segment (not the raw path) so later lookups can match it.
    node.setdefault('children', []).append(new_node(segments[-1], title))

# Upper bound on how many non-root URLs are inserted into the tree. The cell
# used to stop after the first one via a bare `break`; the limit is kept at 1
# to preserve that behaviour. Set to None to process every URL in the file.
MAX_URLS = 1

with open('urls.jsonl', 'r') as jsonfile:
    # Drop blank lines (e.g. a trailing newline) so json.loads never sees ''.
    urls = [line for line in jsonfile.readlines() if line.strip()]

if not urls:
    raise ValueError("urls.jsonl contains no records")

# The first record becomes the root entry; decode it only once.
first_record = json.loads(urls[0])
root_node = {
    "data": {
        "collection": 85,
        "url": first_record['url'],
        "excluded": False,
        "title": first_record['title'],
    },
    "children": []
}
bulk_data.append(root_node)

for url in (urls[1:] if MAX_URLS is None else urls[1:1 + MAX_URLS]):
    url = json.loads(url)
    parsed = urlparse(url['url'])
    # Only the path component is used; scheme, host and query are ignored.
    add_to_root_node(parsed.path, url['title'], root_node)

/docs/spectra_ws.html


In [12]:
# Scratch check of how a raw path string splits on '/': the leading slash
# yields an empty first element, and the '?tce=2' text stays in the last
# element because the string is split as-is.
path = '/dvdata/kepler/8394721/phaseplot/?tce=2'
path.split(sep='/')

['', 'dvdata', 'kepler', '8394721', 'phaseplot', '?tce=2']

In [5]:
# Display whatever is currently bound to `url` (the URL loops in other cells
# rebind it to the decoded record they are processing).
# NOTE(review): the saved output appears to come from a different run than the
# loop above, which stops after one URL - confirm before relying on it.
url

{'url': 'https://exo.mast.stsci.edu/api/v0.1/dvdata/kepler/8394721/phaseplot/?tce=2',
 'title': 'Phased Light Curve'}

In [26]:
from anytree import Node, RenderTree

# Tree root that URL paths are attached to. Note that this cell rebinds `root`
# to a fresh Node before the URLs are loaded.
root = Node('root')

def add_to_tree(path, parent=None):
    """Attach the segments of ``path`` as a chain of anytree nodes.

    Segments are the non-empty pieces of ``path`` split on '/'. For each
    segment an existing child of the current node with that name is reused;
    otherwise a new ``Node`` is created. Shared prefixes therefore share nodes.

    Args:
        path: Path string such as '/docs/swagger/index.html'.
        parent: Node to attach beneath. Ignored (the module-level ``root`` is
            used instead) when it is None, or when ``path`` is empty or starts
            with '/'.

    Returns:
        None. The tree is modified in place.
    """
    # Look `root` up at call time rather than binding it as a default value,
    # so that rebinding the module-level `root` later is honoured.
    if parent is None or path == '' or path.startswith('/'):
        parent = root

    for name in path.split('/'):
        if not name:
            # Produced by a leading, trailing or doubled slash.
            continue
        existing = next(
            (child for child in parent.children if child.name == name),
            None,
        )
        parent = existing if existing is not None else Node(name, parent=parent)

# Read the raw JSON lines, one URL record per line.
with open('urls.jsonl', 'r') as jsonfile:
    urls = jsonfile.readlines()

# Start from a fresh root, then feed the path of every record into the tree.
root = Node('root')
for url in map(json.loads, urls):
    parsed = urlparse(url['url'])
    add_to_tree(parsed.path)

print(RenderTree(root))

Node('/root')


In [27]:
# Load every line of urls.jsonl (newline characters included) into `urls`.
with open('urls.jsonl', mode='r') as jsonfile:
    urls = list(jsonfile)

In [36]:
def _path_segments(line):
    """Decode one JSON line and return its URL path split on '/', first piece dropped.

    For a path that starts with '/', the dropped piece is the empty string
    before that slash.
    """
    return urlparse(json.loads(line)['url']).path.split('/')[1:]


# One list of path segments per line in `urls`.
lst = list(map(_path_segments, urls))

In [38]:
from collections import defaultdict

def tree():
    """Return a defaultdict whose missing keys are themselves new trees."""
    return defaultdict(tree)


def _touch_path(node, keys):
    """Walk ``keys`` down from ``node``, creating each level on first access."""
    for key in keys:
        node = node[key]


def make_tree(lst):
    """Build a nested defaultdict from an iterable of key sequences.

    Each sequence in ``lst`` describes one branch: its first item is a key of
    the returned mapping, its second a key of that entry, and so on.
    Sequences that share a prefix share the corresponding levels.
    """
    trie = tree()
    for keys in lst:
        _touch_path(trie, keys)
    return trie

# Nested-dict tree built from every segment list in `lst`; `d` is the input to
# the conversion call in the cell below.
d = make_tree(lst)

def make_strs(d, indent=0):
    """Flatten a nested mapping into indented text lines.

    Each key becomes one line, prefixed with four spaces per nesting level
    (starting at ``indent``), immediately followed by the lines of its value.
    Returns the lines as a list, in mapping iteration order.
    """
    prefix = '    ' * indent
    lines = []
    for key, subtree in d.items():
        lines.append(prefix + str(key))
        lines += make_strs(subtree, indent + 1)
    return lines

def print_tree(d):
    """Print the indented outline produced by make_strs, one line per key."""
    print(*make_strs(d), sep='\n')

# print_tree(d)

In [55]:
def make_bulk_data(d, level=0):
    """Convert a nested mapping into a list of {"data", "children"} nodes.

    Every truthy key becomes a node whose ``data['url']`` is the key and whose
    ``children`` is the converted value; falsy keys (such as the empty string)
    are skipped together with everything below them. ``level`` is incremented
    on each recursive call but does not affect the result.
    """
    return [
        {"data": {"url": name}, "children": make_bulk_data(subtree, level + 1)}
        for name, subtree in d.items()
        if name
    ]

# Show the converted node list for the whole tree as the cell output.
# NOTE(review): this displays the full structure; consider slicing it if the
# URL list grows.
make_bulk_data(d)

[{'data': {'url': 'docs'},
  'children': [{'data': {'url': 'spectra_ws.html'}, 'children': []},
   {'data': {'url': '_sources'},
    'children': [{'data': {'url': 'index.rst.txt'}, 'children': []},
     {'data': {'url': 'spectra_ws.rst.txt'}, 'children': []},
     {'data': {'url': 'exoplanets_ws.rst.txt'}, 'children': []},
     {'data': {'url': 'swagger'},
      'children': [{'data': {'url': 'index.rst.txt'}, 'children': []}]},
     {'data': {'url': 'dvdata_ws.rst.txt'}, 'children': []},
     {'data': {'url': 'getting_started.rst.txt'}, 'children': []}]},
   {'data': {'url': 'exoplanets_ws.html'}, 'children': []},
   {'data': {'url': 'dvdata_ws.html'}, 'children': []},
   {'data': {'url': 'swagger'},
    'children': [{'data': {'url': 'index.html'}, 'children': []}]},
   {'data': {'url': 'getting_started.html'}, 'children': []},
   {'data': {'url': 'index.html'}, 'children': []}]},
 {'data': {'url': 'api'},
  'children': [{'data': {'url': 'v0.1'},
    'children': [{'data': {'url': 'dvda

In [2]:
!pip3 install requests

Collecting requests
  Using cached requests-2.28.2-py3-none-any.whl (62 kB)
Collecting charset-normalizer<4,>=2
  Downloading charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.7/123.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting idna<4,>=2.5
  Using cached idna-3.4-py3-none-any.whl (61 kB)
Collecting urllib3<1.27,>=1.21.1
  Downloading urllib3-1.26.15-py2.py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.9/140.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting certifi>=2017.4.17
  Using cached certifi-2022.12.7-py3-none-any.whl (155 kB)
Installing collected packages: urllib3, idna, charset-normalizer, certifi, requests
Successfully installed certifi-2022.12.7 charset-normalizer-3.1.0 idna-3.4 requests-2.28.2 urllib3-1.26.15

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is av

In [3]:
'''
Example to read API and filter data using facets
'''
import requests
import json

# Define the API endpoint URL
url = "https://jsonplaceholder.typicode.com/posts/1"

# Define the facets (top-level keys) to keep from the API response
facets = ["userId", "title"]

# Send a GET request to the API endpoint. A timeout is set explicitly because
# requests otherwise waits indefinitely on a server that never answers.
response = requests.get(url, timeout=10)

# If the request was successful, parse the response content as a JSON object
if response.status_code == 200:
    # Let requests decode the body instead of calling json.loads on the text.
    data = response.json()
    print("data", data)

    # Create a new dictionary with only the specified facets, skipping any
    # facet the response does not contain rather than raising KeyError.
    filtered_data = {facet: data[facet] for facet in facets if facet in data}

    # Print the filtered data
    print(filtered_data)
else:
    print("Error: Failed to retrieve data from API")

data {'userId': 1, 'id': 1, 'title': 'sunt aut facere repellat provident occaecati excepturi optio reprehenderit', 'body': 'quia et suscipit\nsuscipit recusandae consequuntur expedita et cum\nreprehenderit molestiae ut ut quas totam\nnostrum rerum est autem sunt rem eveniet architecto'}
{'userId': 1, 'title': 'sunt aut facere repellat provident occaecati excepturi optio reprehenderit'}
