# NYC OpenData: Data Set Lister
This script lists all the data sets given by a NYC OpenData URL. (The script scrapes the website successfully as of September 13, 2021.)

In [None]:
#@title Settings
# URL of the first results page to scrape; it is passed to get_data() below.
url = "https://data.cityofnewyork.us/browse?Dataset-Information_Agency=Department+of+Education+%28DOE%29" #@param {type:"string"}


The code below extracts the data from the first results page and all subsequent pages until it cannot find a link to the next page. For each page, it looks for the result elements and maps each one to a dictionary. The dictionary schema is as follows:
* **name** (string): the name of the item
* **link** (string): the link to the page with more information about the item
* **category** (string): the category of the item (e.g., *Education*)
* **type** (string): the type of the item (e.g., *Dataset*)
* **description** (string): a description of the item
* **tags** (set of strings): a set of tags associated with the item
* **updated** (integer): the UNIX timestamp at which this item was last updated
* **apiDocLink** (string): a link to the API documentation (which could be used to extract more metadata about the item)

In [None]:
import requests
from urllib.parse import urljoin
from time import sleep
from bs4 import BeautifulSoup
from itertools import chain

def first(x):
  """Return the first item of the sequence `x`, or None when `x` is empty."""
  return x[0] if len(x) > 0 else None

def get_data(url):
  """Collect the items from a results page and all the pages that follow it.

  Starting at `url`, each page is fetched with `parse_page`, its items are
  appended to the result, and the "next page" link is followed until no such
  link is found. There is a short pause between requests.

  Args:
    url: URL of the first results page. Relative next-page links are resolved
      against it.

  Returns:
    A list with the items of every page that was processed. If an error
    occurs, it is printed and the items of the pages completed so far are
    returned.
  """
  data = []
  visited = set()
  next_url = url
  try:
    while next_url:
      visited.add(next_url)
      print(f'Getting data from {next_url}...')
      page_data, next_url = parse_page(next_url)
      # Materialize the page here, inside the try block: parse_page may hand
      # back a lazy iterator, and consuming it later would let parsing errors
      # escape the handler below.
      data.extend(list(page_data))
      print('Extracted data.')
      if next_url:
        next_url = urljoin(url, next_url)
        if next_url in visited:
          # A link back to a page that was already fetched would loop forever.
          print(f'Already retrieved {next_url}; stopping.')
          break
        print(f'Next page to retrieve: {next_url}')
        print()
        sleep(3) # Avoids hammering the server
  except Exception as e:
    print(f'An error occured while getting the data: {e}')
  return data

def parse_page(url, timeout=30):
  """Fetch one results page and extract its items and its next-page link.

  Args:
    url: URL of the results page to download.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, a stalled connection would block the scraper indefinitely.

  Returns:
    A tuple `(page_data, next_url)`: the value returned by `extract_results`
    for this page and the value returned by `extract_next_url`.

  Raises:
    requests.exceptions.RequestException: if the request fails, times out, or
      the server answers with an error status.
  """
  response = requests.get(url, timeout=timeout)
  response.raise_for_status() # Raises an error if the request is not successful
  # Name the parser explicitly so the result does not depend on which optional
  # parsers happen to be installed (and to avoid bs4's "no parser specified"
  # warning).
  soup = BeautifulSoup(response.text, 'html.parser')
  page_data = extract_results(soup)
  next_url = extract_next_url(soup)
  return (page_data, next_url)

def extract_results(soup):
  """Convert every result element on a parsed page into a dictionary.

  Args:
    soup: The parsed page; must provide `find_all`.

  Returns:
    A list with one dictionary (built by `element_to_dictionary`) per
    `div` element carrying the `browse2-result` class. The list is built
    eagerly so that any parsing error surfaces here, while the caller is
    still handling this page, rather than when the results are consumed.
  """
  results_elements = soup.find_all('div', attrs={'class': 'browse2-result'})
  return [element_to_dictionary(result) for result in results_elements]

def element_to_dictionary(element):
  """Map one result element to a dictionary describing the item.

  Each field is looked up with a CSS selector relative to `element`. A field
  whose element cannot be found is set to None (`tags` becomes an empty set),
  so one incomplete result does not abort the whole scrape.

  Args:
    element: The result element; must provide `select` and `select_one`.

  Returns:
    A dictionary with the keys `name`, `link`, `category`, `type`,
    `description`, `tags` (set of strings), `updated` (integer taken from the
    `data-rawdatetime` attribute, or None when it is absent) and `apiDocLink`.

  Raises:
    ValueError: if the `data-rawdatetime` attribute is present but is not an
      integer.
  """
  def get_link(selector):
    match = element.select_one(selector)
    return match.get('href') if match is not None else None

  def get_text(selector):
    match = element.select_one(selector)
    return match.text.strip() if match is not None else None

  def get_timestamp(selector):
    # Both the element and its attribute may be missing; report None instead
    # of failing on `None.get(...)` or `int(None)`.
    match = element.select_one(selector)
    raw = match.get('data-rawdatetime') if match is not None else None
    return int(raw) if raw is not None else None

  return {'name': get_text('.browse2-result-name-link'),
          'link': get_link('.browse2-result-name-link'),
          'category': get_text('.browse2-result-category'),
          'type': get_text('.browse2-result-type-name'),
          'description': get_text('.browse2-result-description'),
          'tags': {topic.text
                   for topic in element.select('.browse2-result-topic')},
          'updated': get_timestamp('.browse2-result-timestamp-value > '
                                   '[data-rawdatetime]'), # UNIX time
          'apiDocLink': get_link('.browse2-result-api-link')}

def extract_next_url(soup):
  """Return the `href` of the page's first `a.nextLink` element.

  Returns None when the page contains no such element.
  """
  next_links = soup.select('a.nextLink')
  if not next_links:
    return None
  return next_links[0].get('href')

# Scrape every results page reachable from `url` and report how many items
# were collected. `data` is the list returned by get_data().
data = get_data(url)
print(f'Got {len(data)} results.')

Getting data from https://data.cityofnewyork.us/browse?Dataset-Information_Agency=Department+of+Education+%28DOE%29...
Extracted data.
Next page to retrieve: https://data.cityofnewyork.us/browse?Dataset-Information_Agency=Department+of+Education+%28DOE%29&page=2

Getting data from https://data.cityofnewyork.us/browse?Dataset-Information_Agency=Department+of+Education+%28DOE%29&page=2...
Extracted data.
Next page to retrieve: https://data.cityofnewyork.us/browse?Dataset-Information_Agency=Department+of+Education+%28DOE%29&page=3

Getting data from https://data.cityofnewyork.us/browse?Dataset-Information_Agency=Department+of+Education+%28DOE%29&page=3...
Extracted data.
Next page to retrieve: https://data.cityofnewyork.us/browse?Dataset-Information_Agency=Department+of+Education+%28DOE%29&page=4

Getting data from https://data.cityofnewyork.us/browse?Dataset-Information_Agency=Department+of+Education+%28DOE%29&page=4...
Extracted data.
Next page to retrieve: https://data.cityofnewyork.us

In [None]:
from pprint import pprint

pprint(data) # Pretty-print every collected item (the output can be very long)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  'link': 'https://data.cityofnewyork.us/Education/2015-16-Health-Education-HS-Level-School-District/brxp-zcjz',
  'name': '2015-16 Health Education HS Level - School District',
  'tags': set(),
  'type': 'Dataset',
  'updated': 1556205500},
 {'apiDocLink': 'https://dev.socrata.com/foundry/data.cityofnewyork.us/np9k-hd4i',
  'category': 'Education',
  'description': None,
  'link': 'https://data.cityofnewyork.us/Education/2013-16-School-ELA-Data-Files-By-Grade-SWD/np9k-hd4i',
  'name': '2013-16 School ELA Data Files By Grade - SWD',
  'tags': {'2016', '2015', 'school ela data files by grade', '2014', '2013'},
  'type': 'Dataset',
  'updated': 1549923366},
 {'apiDocLink': 'https://dev.socrata.com/foundry/data.cityofnewyork.us/g54d-49wm',
  'category': 'Education',
  'description': 'In June 2012, 7 New York City public schools closed for poor '
                 'performance.  This report provides data regarding students '
 