# NYC OpenData: Data Set Lister
This script lists all the data sets found at an NYC OpenData browse URL. (The script scraped the website successfully as of September 14, 2021.)

In [None]:
!pip install esprima
import re
import requests
import esprima
import json
from os.path import isfile
from urllib.parse import urljoin
from time import sleep
from bs4 import BeautifulSoup, Tag
from itertools import chain, islice
from random import randint
from pprint import pprint

#@title Settings
# The browse URL whose search results are scraped when the cache is not used
url = "https://data.cityofnewyork.us/browse?tags=permit" #@param {type:"string"}
# The path of the JSON file that the data sets are loaded from and saved to
cache = "cache.json" #@param {type:"string"}
# "Yes" loads the search results from the cache file when that file exists
use_cache = "Yes" #@param ["Yes", "No"]
#@markdown The `use_cache` setting above only affects getting search results.
#@markdown Activating the setting means the provided `url` will *not* be used if
#@markdown the cache file exists.

# A regular expression matching runs of whitespace other than carriage returns
# and line feeds, so substituting on it never touches those two characters.
# (No flags are passed: the pattern contains no `.` for re.DOTALL to affect.)
whitespace_re = re.compile(r'[^\S\r\n]+')

# Loads a JSON file and returns the parsed content.
#
# path: the path of the JSON file to read
#
# The file is decoded as UTF-8 explicitly so that the result does not depend
# on the platform's default encoding.
def load_from_cache(path):
  with open(path, 'r', encoding='utf-8') as cache_file:
    return json.load(cache_file)

# Returns the item at index 0 of a subscriptable object, or None when the
# object is empty (or otherwise falsy)
def first(x):
  return x[0] if x else None

# Trims the ends of a string, then collapses every run matched by
# whitespace_re inside it into a single space
def strip(value):
  trimmed = value.strip()
  return whitespace_re.sub(' ', trimmed)

# Scrapes the supplied NYC OpenData *browse* url for items, following the
# "next page" link of each results page until no such link is found.
#
# url:         the URL of the first results page; it is also the base against
#              which the "next page" links are resolved
# max_retries: how many consecutive failed attempts to tolerate before giving
#              up and returning what has been collected so far; None (the
#              default) keeps retrying indefinitely
#
# Returns a dictionary of the items found, keyed as parse_results_page keys
# them.
def get_data_sets(url, max_retries=None):
  delay_factor = 1 # This delay factor will increase exponentially on errors.
  consecutive_failures = 0
  data_sets = {}
  next_url = url
  while next_url:
    print(f'Getting search results from {next_url}...')
    try:
      page_data_sets, next_link = parse_results_page(next_url)
    except Exception as e:
      print(f'An error occured while getting the search results: {e}')
      consecutive_failures += 1
      if max_retries is not None and consecutive_failures > max_retries:
        print(f'Giving up after {consecutive_failures} consecutive failures.')
        break
      delay_factor = min(1800, delay_factor * 2)
      sleep(randint(3, 7) * delay_factor)
      continue # next_url is unchanged, so the same page is tried again.
    consecutive_failures = 0
    data_sets.update(page_data_sets)
    next_url = urljoin(url, next_link) if next_link else None
    if next_url:
      delay_factor = max(1, delay_factor // 2)
      sleep(randint(2, 4) * delay_factor) # Take it nice and easy;
                                          # the server will be angry otherwise.
  print(f'Finished extracting {len(data_sets)} search results.')
  return data_sets

# Loads and parses the results page, returning the items and the next page’s url.
#
# url:     the URL of the results page to download
# timeout: seconds to wait for the server before the request raises, so that
#          an unresponsive server cannot stall the scrape indefinitely
#
# Returns a tuple of (items keyed by their `data-view-id` attribute, the href
# of the "next page" link or None when there is no such link).
# Raises whatever requests raises for a failed, timed-out or non-successful
# request.
def parse_results_page(url, timeout=30):
  response = requests.get(url, timeout=timeout)
  response.raise_for_status() # Raises an error if the request is not successful
  # NOTE(review): no parser is named, so Beautiful Soup picks the best one
  # installed; results may differ between environments.
  soup = BeautifulSoup(response.text) # Parses the raw HTML into a structure
  data_sets = {}
  for result_element in soup.select('.browse2-result'):
    data_sets[result_element.get('data-view-id')] = element_to_dict(result_element)
  return (data_sets, extract_next_url(soup))

# Extracts information about each result into a dictionary.
#
# element: the parsed search-result element to read from
#
# Returns a dictionary with the keys name, link, category, type, description,
# tags, updated and apiDocLink. A value is None when the corresponding element
# is not found inside the result (tags is an empty list in that case).
def element_to_dict(element):
  def get_element(selector):
    return first(element.select(selector))
  def get_link(selector):
    match = get_element(selector)
    return match.get('href') if match is not None else None
  def get_text(selector):
    match = get_element(selector)
    return strip(match.text) if match is not None else None
  timestamp_element = get_element('.browse2-result-timestamp-value > '
                                  + '[data-rawdatetime]')
  # A result without a timestamp must not abort the whole page: the caller
  # would otherwise fail on this page every time it retries it.
  updated = None
  if timestamp_element is not None:
    updated = int(timestamp_element.get('data-rawdatetime')) # UNIX time
  return {'name': get_text('.browse2-result-name-link'),
          'link': get_link('.browse2-result-name-link'),
          'category': get_text('.browse2-result-category'),
          'type': get_text('.browse2-result-type-name'),
          'description': get_text('.browse2-result-description'),
          'tags': [tag.get_text() for tag in element.select('.browse2-result-topic')],
          'updated': updated,
          'apiDocLink': get_link('.browse2-result-api-link')}

# Adds details to each item by modifying its dictionary in-place.
#
# data_sets: a dictionary of item dictionaries; each item needs the keys
#            `name` and `link`
#
# Items that already have any of the keys dataDownloads, attachments or
# columns are skipped. For every other item the details page at its `link` is
# fetched and whichever of those keys could be extracted are added. The keys
# are only stored once the whole item has been processed, so an error halfway
# through leaves the item untouched and it is picked up again on the next run.
# Errors are printed and slow down the following requests; they do not stop
# the loop.
def get_details(data_sets):
  delay_factor = 1 # This delay factor will increase exponentially on errors.
  detail_keys = ('dataDownloads', 'attachments', 'columns')
  for data_set in data_sets.values():
    if any(key in data_set for key in detail_keys):
      continue # Skip items with any of those keys already.
    try:
      data_set_name = data_set['name']
      details_url = data_set['link']
      print(f'Getting details for {data_set_name} from {details_url}...')
      sleep(randint(2, 4) * delay_factor) # Take it nice and easy;
                                          # the server will be angry otherwise.
      data_set_information, initial_state = parse_details_page(details_url)
      details = {}

      # Either the embedded information or its distribution list may be absent.
      distribution = (data_set_information or {}).get('distribution') or []
      data_downloads = [{'contentUrl': data_download['contentUrl'],
                         'encodingFormat': data_download['encodingFormat']}
                        for data_download
                        in distribution]
      if data_downloads:
        print(f'\t{data_set_name} has {len(data_downloads)} data downloads.')
        details['dataDownloads'] = data_downloads
      else:
        print(f'\tNo data downloads were found for {data_set_name}.')

      # Only call the extractors when the page state actually carries the
      # data; the state itself may be missing altogether.
      view = (initial_state or {}).get('view') or {}

      # Attachment links are resolved against the page they were found on.
      attachments = (extract_attachments(details_url, initial_state)
                     if view.get('attachments') else None)
      if attachments:
        print(f'\t{data_set_name} has {len(attachments)} attachments.')
        details['attachments'] = attachments
      else:
        print(f'\tNo attachments were found for {data_set_name}.')

      columns = (extract_column_schema(initial_state)
                 if view.get('columns') else None)
      if columns:
        print(f'\t{data_set_name} has {len(columns)} columns.')
        details['columns'] = columns
      else:
        print(f'\tNo columns were found for {data_set_name}.')

      data_set.update(details)
      delay_factor = max(1, delay_factor // 2)
    except Exception as e:
      print(f'An error occured while getting the details: {e}')
      delay_factor = min(300, delay_factor * 2)

# Extracts the attachments information associated with the item.
#
# url_base:      the URL against which each attachment's `href` is resolved
# initial_state: the page's embedded state (a dictionary), or None
#
# Returns a dictionary mapping each attachment's name to its absolute URL, or
# None when the state is missing or holds no attachments.
def extract_attachments(url_base, initial_state):
  view = (initial_state or {}).get('view') or {}
  attachments = view.get('attachments')
  if not attachments:
    return None
  return {attachment['name']: urljoin(url_base, attachment['href'])
          for attachment
          in attachments}

# Extracts the column schema information associated with the item.
#
# initial_state: the page's embedded state (a dictionary), or None
#
# Returns a list of dictionaries with the keys name, type and humanName, one
# per column and ordered by the column's `position`, or None when the state is
# missing or holds no columns.
def extract_column_schema(initial_state):
  view = (initial_state or {}).get('view') or {}
  columns = view.get('columns')
  if not columns:
    return None
  return [{'name': column['fieldName'],
           'type': column['dataTypeName'],
           'humanName': column['name']}
          for column
          in sorted(columns, key=lambda column: int(column['position']))]

# Digs into the item’s page to extract additional details.
#
# url:     the URL of the item's page
# timeout: seconds to wait for the server before the request raises, so that
#          an unresponsive server cannot stall the scrape indefinitely
#
# Returns a tuple of two values, each of which is None when it is not found:
#   * the first inline JSON object whose `@type` is 'Dataset'
#   * the object assigned to `initialState` by an inline script that consists
#     of that single variable declaration
# Raises whatever requests raises for a failed, timed-out or non-successful
# request.
def parse_details_page(url, timeout=30):
  response = requests.get(url, timeout=timeout)
  response.raise_for_status()
  soup = BeautifulSoup(response.text)

  data_set_information = None # The information about the data set embedded in the page
  for json_object in extract_inline_json(soup):
    # Inline JSON may also be a list or lack '@type'; neither is a data set.
    if isinstance(json_object, dict) and json_object.get('@type') == 'Dataset':
      data_set_information = json_object # Grab that JSON.
      break

  initial_state = None # The embedded information that the page uses to initialize its tables
  for script in extract_inline_javascript(soup):
    try:
      ast = esprima.parseScript(script, {'range': True}) # Guards against not-JavaScript
      if ast.type == 'Program' and \
         ast.sourceType == 'script' and \
         len(ast.body) == 1 and \
         ast.body[0].type == 'VariableDeclaration' and \
         len(ast.body[0].declarations) == 1: # Found the script with a single variable declaration
        declaration = ast.body[0].declarations[0]
        if declaration.type == 'VariableDeclarator' and \
           declaration.id.type == 'Identifier' and \
           declaration.id.name == 'initialState' and \
           declaration.init.type == 'ObjectExpression': # Found the initial state!
          json_start, json_end = declaration.init.range # Get the JSON range.
          initial_state = json.loads(script[json_start:json_end]) # Load the JSON.
          break
    except Exception:
      # Deliberately best-effort: a “script” that cannot be parsed, or whose
      # object literal is not valid JSON, is skipped in favour of the next.
      continue
  return (data_set_information, initial_state)

# Loads all of the inline JSON found in the page’s script tags.
#
# soup: the parsed page
#
# Returns a list with the parsed content of every non-empty
# `application/ld+json` script tag, in document order. A tag whose content is
# not valid JSON is skipped so that it cannot hide the valid ones.
def extract_inline_json(soup):
  json_objects = []
  for element in soup.select('script[type="application/ld+json"]'):
    if not element.text:
      continue
    try:
      json_objects.append(json.loads(element.text))
    except ValueError: # json.JSONDecodeError is a subclass of ValueError.
      continue
  return json_objects

# Collects the text of every script tag in the page that has any text,
# in document order
def extract_inline_javascript(soup):
  scripts = []
  for script_element in soup.select('script'):
    if script_element.text:
      scripts.append(script_element.text)
  return scripts

# Returns the href of the first `a.nextLink` element in the page, or None
# when the page has no such element
def extract_next_url(soup):
  next_links = soup.select('a.nextLink')
  return next_links[0].get('href') if next_links else None

## Getting the Search Results
The code below extracts the data from the first search results page and all subsequent pages until it cannot find a link to the next page. For each page, it looks for the search result elements and maps each one to a dictionary. The dictionary schema is as follows:
* **name** (string): the name of the item
* **link** (string): the link to the page with more information about the item
* **category** (string): the category of the item (e.g., *Education*)
* **type** (string): the type of the item (e.g., *Dataset*)
* **description** (string): a description of the item
* **tags** (list of strings): a list of tags associated with the item
* **updated** (integer): the UNIX timestamp at which this item was last updated
* **apiDocLink** (string): a link to the API documentation (which might possibly be used to extract more metadata about the item)

In [None]:
# Load the search results from the cache file when that is enabled and the
# file exists; otherwise scrape them from the configured URL.
if use_cache == 'Yes' and isfile(cache):
  data_sets = load_from_cache(cache)
else:
  data_sets = get_data_sets(url)

## Getting the Data Set Details
The code below extracts additional information about the items using the links to the items’ pages. It adds the following keys if the information is available:
* **attachments** (dict of strings): key-value pairs of file names and their corresponding links to download them
* **columns** (list of dicts): an ordered list of dicts representing column metadata
* **dataDownloads** (list of dicts of strings): a list of dicts, each holding the `contentUrl` and the `encodingFormat` of one available download

If the download of data set details is interrupted, rerunning the code below resumes progress: it checks each dictionary entry for the additional keys and, if none of them exist, tries to retrieve the details again and amends the dictionary.

In [None]:
# Fetch the details of every item that does not have any yet; the item
# dictionaries in data_sets are modified in-place.
get_details(data_sets)

## What’s Inside the Data Set?

In [None]:
# Pretty-print everything collected so far
pprint(data_sets)

## Caching Data
The code below will save a copy of the data to storage for loading and processing later. The file name is defined in the settings above.

In [None]:
# Write the collected data sets to the cache file as JSON; opening with 'w'
# replaces the content of an existing file.
with open(cache, 'w') as cache_file:
  json.dump(data_sets, cache_file)