In [1]:
import io
import json
import pandas
from pprint import pprint
import requests
import zipfile

In [2]:
# Pretty-print results
def print_results(results):
    """Print a short summary of every search result in ``results``.

    For each result the dataset name, ID and score are printed. When the
    result carries an 'augmentation' entry whose type is not 'none', the
    augmentation type and the left/right column names are printed as well.
    A separator line closes each result.

    Nothing is printed when ``results`` is empty or None.
    """
    for entry in results or ():
        print(entry['metadata']['name'])
        print('ID: ', entry['id'])
        print('Score: ', entry['score'])

        # Augmentation details are only shown when an actual augmentation is proposed
        has_augmentation = (
            'augmentation' in entry
            and entry['augmentation']['type'] != 'none'
        )
        if has_augmentation:
            augmentation = entry['augmentation']
            print('Augmentation: %s' % augmentation['type'])
            print("Left Columns: %s" % str(augmentation['left_columns_names']))
            print("Right Columns: %s" % str(augmentation['right_columns_names']))

        print("-------------------")

In [3]:
# Base URL of the Auctus REST API (v1); endpoint paths such as '/search' are appended to it
URL = 'https://auctus.vida-nyu.org/api/v1'

In [4]:
# REST API documentation: https://docs.auctus.vida-nyu.org/restapi.html
# Query and result schemas: https://docs.auctus.vida-nyu.org/schemas.html

In [5]:
# Keyword search: look for datasets about COVID
query = {'keywords': ['covid']}
# The query is sent JSON-encoded in the 'query' form field
search_form = {'query': json.dumps(query)}
response = requests.post(URL + '/search', data=search_form)
response.raise_for_status()
search_payload = response.json()
results = search_payload['results']
print_results(results)

covid per country
ID:  datamart.url.2e70cba96c4944198f174780b4eb382c
Score:  15.932485
-------------------
COVID Tracking Data (CSV) - USA States Daily
ID:  datamart.url.8eac7911c2cb5da0b763f3d3f86a13b8
Score:  14.701933
-------------------
NYC Coronavirus (COVID-19) data
ID:  datamart.url.9a47f900c06943ecb731d6213a5b883c
Score:  13.658955
-------------------
COVID-19 Hospitalizations
ID:  datamart.socrata.data-sfgov-org.nxjg-bhem
Score:  13.390009
-------------------
COVID-19 Cases Summarized by Race and Ethnicity
ID:  datamart.socrata.data-sfgov-org.vqqm-nsqg
Score:  13.261738
-------------------
NYC Coronavirus (COVID-19) from NYC Department of Health
ID:  datamart.url.7717c2dd7eac420eb8aa85224388a9f6
Score:  13.029905
-------------------
COVID-19 Tests
ID:  datamart.socrata.data-sfgov-org.nfpa-mg4g
Score:  12.92869
-------------------
COVID-19 Cases Summarized by Age Group and Gender
ID:  datamart.socrata.data-sfgov-org.sunc-2t3k
Score:  12.534885
-------------------
COVID-19 Cases

In [6]:
# Show the full metadata record for one dataset.
# The dataset is picked by ID out of the results fetched from the remote search service, so
# the ID is not guaranteed to be present: fail with an explicit message (rather than a bare
# tuple-unpacking error) unless it is matched exactly once.
wanted_id = 'datamart.upload.86a1bc1c58ad4814847454b3644dd6cb'
matches = [r for r in results if r['id'] == wanted_id]
if len(matches) != 1:
    raise ValueError(
        'expected exactly one search result with id %r, found %d'
        % (wanted_id, len(matches))
    )
result = matches[0]
print(json.dumps(result, indent=2))

{
  "id": "datamart.upload.86a1bc1c58ad4814847454b3644dd6cb",
  "score": 8.187912,
  "metadata": {
    "columns": [
      {
        "name": "city",
        "num_distinct_values": 429,
        "semantic_types": [],
        "structural_type": "http://schema.org/Text"
      },
      {
        "name": "state",
        "num_distinct_values": 19,
        "plot": {
          "data": [
            {
              "bin": "AK",
              "count": 3
            },
            {
              "bin": "CT",
              "count": 4
            },
            {
              "bin": "DE",
              "count": 4
            },
            {
              "bin": "KY",
              "count": 5
            },
            {
              "bin": "ME",
              "count": 3
            }
          ],
          "type": "histogram_categorical"
        },
        "semantic_types": [
          "http://schema.org/Enumeration"
        ],
        "structural_type": "http://schema.org/Text"
      },
      {

```json
{
  "id": "datamart.upload.86a1bc1c58ad4814847454b3644dd6cb",
  "score": 8.27132,                             # Score
  "metadata": {
    "id": "datamart.upload.86a1bc1c58ad4814847454b3644dd6cb",

                                                # General information about the dataset
    "name": "Nonpharmaceutical Interventions (NPIs) by Jataware - by city",
    "description": "\"Nonpharmaceutical Interventions,\r\n or NPIs, are policy actions taken by communities to mitigate ...",
    "filename": "Jataware-City-NPIs.csv",
    "size": 1874173,
    "nb_rows": 558,
    "source": "upload",

    "materialize": {                            # Materialization information, e.g. plugin providing this dataset
      "date": "2020-04-09T18:36:38.507936Z",
      "identifier": "datamart.upload"
    },
    "date": "2020-04-09T18:36:50.958540Z",      # Date and version of profiling
    "version": "v0.5.7-16-g0e70d36",
    "nb_profiled_rows": 558,
    "columns": [                                # Information about each column, profiled automatically
      {
        "name": "city",
        "num_distinct_values": 429,
        "semantic_types": [],
        "structural_type": "http://schema.org/Text"
      },
      {
        "name": "state",
        "structural_type": "http://schema.org/Text",  # String
        "semantic_types": [
          "http://schema.org/Enumeration"             # Recognized as categorical data, 19 categories
        ],
        "num_distinct_values": 19
      },
      {
        "name": "publish_date",
        "missing_values_ratio": 0.3100358422939068,   # 31% of the values are missing
        "structural_type": "http://schema.org/Text",  # String
        "semantic_types": [
          "http://schema.org/DateTime"                # Recognized as dates
        ],
        "num_distinct_values": 200,
        "mean": 1584586043.5116882,                   # Numerical distribution (of the interpreted timestamp)
        "stddev": 513767.26253528317,
        "coverage": [
          {"range": {"gte": 1584835200.0, "lte": 1585470592.0}},
          {"range": {"gte": 1583205504.0, "lte": 1583972736.0}},
          {"range": {"gte": 1584115840.0, "lte": 1584702592.0}}
        ]
      },
      {
        "name": "url",
        "structural_type": "http://schema.org/Text",  # String with no special meaning for profiler
        "semantic_types": [],
        "num_distinct_values": 493
      },
      {
        "name": "title",
        "structural_type": "http://schema.org/Text",
        "semantic_types": [
          "http://schema.org/Text"
        ]
      },
      {
        "name": "text",
        "structural_type": "http://schema.org/Text",
        "semantic_types": [
          "http://schema.org/Text"
        ]
      },
      {
        "name": "category",
        "structural_type": "http://schema.org/Text",
        "semantic_types": [
          "http://schema.org/Enumeration"              # Recognized as categorical data, 9 categories
        ],
        "num_distinct_values": 9
      },
      {
        "name": "population",
        "structural_type": "http://schema.org/Integer",  # Integers
        "semantic_types": [],
        "mean": 54504.0376344086,                       # Numerical distribution
        "stddev": 154148.1766282727,
        "coverage": [
          {"range": {"gte": 178.0, "lte": 146758.0}},
          {"range": {"gte": 1854698.0, "lte": 2084749.0}},
          {"range": {"gte": 280082.0, "lte": 1011696.0}}
        ]
      },
      {
        "name": "latitude",
        "structural_type": "http://schema.org/Float",  # Floating point detected as latitude coordinate
        "semantic_types": [
          "http://schema.org/latitude"
        ],
        "mean": 36.94463136200716,
        "stddev": 4.592741270022868
      },
      {
        "name": "longitude",
        "structural_type": "http://schema.org/Float",  # Floating point detected as longitude coordinate
        "semantic_types": [
          "http://schema.org/longitude"
        ],
        "mean": -98.0607971326166,
        "stddev": 15.837849701298603
      }
    ],
    "spatial_coverage": [
      {
        "lat": "latitude",          # This is a simple case but the system can pair more complicated
        "lon": "longitude",         # column names and deal with abbreviations
        "ranges": [                 # Computed bounding boxes
          {"range": {"coordinates": [[-122.8651, 41.0803], [-104.7389, 33.3881]], "type": "envelope"}},
          {"range": {"coordinates": [[-95.5977, 42.3686], [-75.3104, 37.7171]], "type": "envelope"}},
          {"range": {"coordinates": [[-92.5715, 34.2466], [-80.1842, 26.1505]], "type": "envelope"}}
        ]
      }
    ]
  },
  "augmentation": {                   # Augmentation information (if providing an input dataset, see below)
    "type": "none",
    "left_columns": [],
    "left_columns_names": [],
    "right_columns": [],
    "right_columns_names": []
  },
  "supplied_id": null,
  "supplied_resource_id": null,
  "d3m_dataset_description": {...}     # Metadata in D3M format
}
```

In [7]:
# Fetch the dataset selected above from the download endpoint
download_url = URL + '/download/' + result['id']
response = requests.get(download_url)
response.raise_for_status()
# It's a CSV: wrap the downloaded bytes in a file-like object and preview the first rows
csv_buffer = io.BytesIO(response.content)
pandas.read_csv(csv_buffer).head()

Unnamed: 0,city,state,publish_date,url,title,text,category,population,latitude,longitude
0,Albertville,AL,3/17/20 0:00,https://www.gadsdentimes.com/news/20200317/alb...,Albertville declares coronavirus emergency,The Albertville City Council on Monday declare...,state_of_emergency,37220,34.2634,-86.2107
1,Birmingham,AL,3/24/20 18:08,https://www.al.com/news/2020/03/woodfin-propos...,Shelter-in-place order enacted in Birmingham t...,The Birmingham City Council approved an ordina...,shelter_in_place,744189,33.5277,-86.7987
2,Boaz,AL,3/24/20 0:00,https://www.gadsdentimes.com/news/20200324/boa...,Boaz declares state of emergency over virus,The Boaz City Council on Monday declared a sta...,state_of_emergency,9652,34.1985,-86.1529
3,Carolina,AL,3/27/20 0:00,https://www.cnbc.com/2020/03/27/north-carolina...,North Carolina Gov. Roy Cooper orders resident...,"Governor Roy Cooper, D-NC address the crowd du...",shelter_in_place,295,31.2337,-86.5227
4,Clanton,AL,3/17/20 0:00,https://www.clantonadvertiser.com/2020/03/17/j...,Jemison Council approves COVID-19 response mea...,By JOYANNA LOVE/ Senior Staff Writer\n\nThe Ci...,,5876,32.8439,-86.623


In [8]:
# Other download formats are available, for example D3M's custom format
d3m_url = URL + '/download/' + result['id'] + '?format=d3m&format_need_d3mindex=1'
response = requests.get(d3m_url)
response.raise_for_status()
# A D3M dataset is a directory, so we get it as a ZIP archive
archive_bytes = io.BytesIO(response.content)
zip_ = zipfile.ZipFile(archive_bytes, 'r')
print(zip_.namelist())
# Note that a 'd3mIndex' column was generated: D3M tools require it for machine-learning
# workflows (it is optional and requested via format options, here 'need_d3mindex')
# Open the CSV stored in the archive and preview the first rows
learning_data_file = zip_.open('tables/learningData.csv')
pandas.read_csv(learning_data_file).head()

['datasetDoc.json', 'tables/learningData.csv']


Unnamed: 0,d3mIndex,city,state,publish_date,url,title,text,category,population,latitude,longitude
0,0,Albertville,AL,3/17/20 0:00,https://www.gadsdentimes.com/news/20200317/alb...,Albertville declares coronavirus emergency,The Albertville City Council on Monday declare...,state_of_emergency,37220,34.2634,-86.2107
1,1,Birmingham,AL,3/24/20 18:08,https://www.al.com/news/2020/03/woodfin-propos...,Shelter-in-place order enacted in Birmingham t...,The Birmingham City Council approved an ordina...,shelter_in_place,744189,33.5277,-86.7987
2,2,Boaz,AL,3/24/20 0:00,https://www.gadsdentimes.com/news/20200324/boa...,Boaz declares state of emergency over virus,The Boaz City Council on Monday declared a sta...,state_of_emergency,9652,34.1985,-86.1529
3,3,Carolina,AL,3/27/20 0:00,https://www.cnbc.com/2020/03/27/north-carolina...,North Carolina Gov. Roy Cooper orders resident...,"Governor Roy Cooper, D-NC address the crowd du...",shelter_in_place,295,31.2337,-86.5227
4,4,Clanton,AL,3/17/20 0:00,https://www.clantonadvertiser.com/2020/03/17/j...,Jemison Council approves COVID-19 response mea...,By JOYANNA LOVE/ Senior Staff Writer\n\nThe Ci...,,5876,32.8439,-86.623


In [9]:
# Ask the server for datasets related to this result by sending its ID as 'data_id'
related_search_form = {'data_id': result['id']}
response = requests.post(URL + '/search', data=related_search_form)
response.raise_for_status()
related_payload = response.json()
results_related = related_payload['results']
# Notice the proposed augmentations (unions or joins) in the printed results
print_results(results_related)

Parking Signs / Street Space Permit Photos
ID:  datamart.socrata.data-sfgov-org.pigs-fac7
Score:  1.0
Augmentation: join
Left Columns: [['publish_date']]
Right Columns: [['DateAdded']]
-------------------
Nonpharmaceutical Interventions (NPIs) by Jataware - by city
ID:  datamart.upload.488278046c6c4d9eb2ac2e417b324d47
Score:  0.8
Augmentation: union
Left Columns: [['city'], ['url'], ['title'], ['text'], ['state'], ['category'], ['publish_date'], ['population']]
Right Columns: [['city'], ['url'], ['title'], ['text'], ['state'], ['category'], ['publish_date'], ['population']]
-------------------
Office of The Comptroller: Police Retirement System Holdings Data
ID:  datamart.socrata.data-cityofnewyork-us.dy3p-ay2d
Score:  1.0
Augmentation: join
Left Columns: [['publish_date']]
Right Columns: [['Maturity Date']]
-------------------
Special Vaccine Locations
ID:  datamart.socrata.data-cityofchicago-org.9a77-69d3
Score:  0.5
Augmentation: union
Left Columns: [['city'], ['url'], ['state'], ['

In [10]:
# Same related-dataset search, this time combined with a keyword filter
query = {'keywords': ['coronanet']}
filtered_search_form = {
    'query': json.dumps(query),
    'data_id': result['id'],
}
response = requests.post(URL + '/search', data=filtered_search_form)
response.raise_for_status()
related_payload = response.json()
results_related = related_payload['results']
print_results(results_related)

CoronaNet Database BETA Version 1.0 (core)
ID:  datamart.url.de434a1d1d7342deb13439c02ac39926
Score:  9.1313925
Augmentation: join
Left Columns: [['publish_date']]
Right Columns: [['date_announced']]
-------------------
Parking Signs / Street Space Permit Photos
ID:  datamart.socrata.data-sfgov-org.pigs-fac7
Score:  1.0
Augmentation: join
Left Columns: [['publish_date']]
Right Columns: [['DateAdded']]
-------------------
Office of The Comptroller: Police Retirement System Holdings Data
ID:  datamart.socrata.data-cityofnewyork-us.dy3p-ay2d
Score:  1.0
Augmentation: join
Left Columns: [['publish_date']]
Right Columns: [['Maturity Date']]
-------------------
[Deprecated] Form 700 Schedule A2
ID:  datamart.socrata.data-sfgov-org.j82c-uj4d
Score:  1.0
Augmentation: join
Left Columns: [['publish_date']]
Right Columns: [['Date Disposed']]
-------------------
Active Projects - Public Buildings
ID:  datamart.socrata.data-cityofnewyork-us.g9ub-hrve
Score:  1.0
Augmentation: join
Left Columns: [[

In [11]:
# Pick one of the related datasets by ID and show its full record.
# The results come from the remote search service, so the ID is not guaranteed to be present:
# fail with an explicit message (rather than a bare tuple-unpacking error) unless it is
# matched exactly once.
wanted_related_id = 'datamart.socrata.data-cityofnewyork-us.g9ub-hrve'
related_matches = [r for r in results_related if r['id'] == wanted_related_id]
if len(related_matches) != 1:
    raise ValueError(
        'expected exactly one related result with id %r, found %d'
        % (wanted_related_id, len(related_matches))
    )
result_related = related_matches[0]
print(json.dumps(result_related, indent=2))

{
  "id": "datamart.socrata.data-cityofnewyork-us.g9ub-hrve",
  "score": 1.0,
  "metadata": {
    "columns": [
      {
        "name": "Project ID",
        "num_distinct_values": 320,
        "semantic_types": [],
        "structural_type": "http://schema.org/Text"
      },
      {
        "name": "Description",
        "num_distinct_values": 307,
        "semantic_types": [],
        "structural_type": "http://schema.org/Text"
      },
      {
        "name": "Client Agency",
        "num_distinct_values": 16,
        "plot": {
          "data": [
            {
              "bin": "CORRECTIONS",
              "count": 1
            },
            {
              "bin": "ENVIRONMENTAL PROTECTION",
              "count": 5
            },
            {
              "bin": "FIRE",
              "count": 2
            },
            {
              "bin": "HPD",
              "count": 2
            },
            {
              "bin": "UNKNOWN",
              "count": 1
            }
 

In [12]:
# Perform the augmentation: the JSON search result is simply sent back as the 'task',
# alongside the ID of the dataset to augment
augment_parts = {
    'data_id': io.BytesIO(result['id'].encode('ascii')),
    'task': ('task.json', json.dumps(result_related), 'application/json'),
}
response = requests.post(URL + '/augment', files=augment_parts)
response.raise_for_status()

In [13]:
# The augmented dataset is returned in D3M format by default, i.e. as a ZIP archive
archive_bytes = io.BytesIO(response.content)
zip_ = zipfile.ZipFile(archive_bytes, 'r')
print(zip_.namelist())
# Open the CSV stored in the archive and preview the first rows
learning_data_file = zip_.open('tables/learningData.csv')
pandas.read_csv(learning_data_file).head()

['datasetDoc.json', 'tables/learningData.csv']


Unnamed: 0,city,state,publish_date,url,title,text,category,population,latitude,longitude,Project ID,Description,Client Agency,Division,Phase,Scope,Dollar Amount,Status
0,Albertville,AL,3/17/20 0:00,https://www.gadsdentimes.com/news/20200317/alb...,Albertville declares coronavirus emergency,The Albertville City Council on Monday declare...,state_of_emergency,37220,34.2634,-86.2107,,,,,,,,
1,Birmingham,AL,3/24/20 18:08,https://www.al.com/news/2020/03/woodfin-propos...,Shelter-in-place order enacted in Birmingham t...,The Birmingham City Council approved an ordina...,shelter_in_place,744189,33.5277,-86.7987,,,,,,,,
2,Boaz,AL,3/24/20 0:00,https://www.gadsdentimes.com/news/20200324/boa...,Boaz declares state of emergency over virus,The Boaz City Council on Monday declared a sta...,state_of_emergency,9652,34.1985,-86.1529,,,,,,,,
3,Carolina,AL,3/27/20 0:00,https://www.cnbc.com/2020/03/27/north-carolina...,North Carolina Gov. Roy Cooper orders resident...,"Governor Roy Cooper, D-NC address the crowd du...",shelter_in_place,295,31.2337,-86.5227,,,,,,,,
4,Clanton,AL,3/17/20 0:00,https://www.clantonadvertiser.com/2020/03/17/j...,Jemison Council approves COVID-19 response mea...,By JOYANNA LOVE/ Senior Staff Writer\n\nThe Ci...,,5876,32.8439,-86.623,,,,,,,,


In [14]:
# Pick another related dataset by ID and show its full record.
# The results come from the remote search service, so the ID is not guaranteed to be present:
# fail with an explicit message (rather than a bare tuple-unpacking error) unless it is
# matched exactly once.
wanted_related_id = 'datamart.url.de434a1d1d7342deb13439c02ac39926'
related_matches = [r for r in results_related if r['id'] == wanted_related_id]
if len(related_matches) != 1:
    raise ValueError(
        'expected exactly one related result with id %r, found %d'
        % (wanted_related_id, len(related_matches))
    )
result_related = related_matches[0]
print(json.dumps(result_related, indent=2))

{
  "id": "datamart.url.de434a1d1d7342deb13439c02ac39926",
  "score": 9.1313925,
  "metadata": {
    "columns": [
      {
        "coverage": [
          {
            "range": {
              "gte": 163130.0,
              "lte": 3302534.0
            }
          },
          {
            "range": {
              "gte": 4149972.0,
              "lte": 7036378.0
            }
          },
          {
            "range": {
              "gte": 7478282.0,
              "lte": 9859255.0
            }
          }
        ],
        "mean": 5155566.8963898,
        "name": "record_id",
        "plot": {
          "data": [
            {
              "bin_end": 1004379.4,
              "bin_start": 5036.0,
              "count": 2065
            },
            {
              "bin_end": 2003722.8,
              "bin_start": 1004379.4,
              "count": 1506
            },
            {
              "bin_end": 3003066.2,
              "bin_start": 2003722.8,
              "count": 15

In [None]:
# Perform the augmentation: the JSON search result is simply sent back as the 'task',
# alongside the ID of the dataset to augment
augment_parts = {
    'data_id': io.BytesIO(result['id'].encode('ascii')),
    'task': ('task.json', json.dumps(result_related), 'application/json'),
}
response = requests.post(URL + '/augment', files=augment_parts)
response.raise_for_status()