Scikit-learn comes with several
[datasets](https://scikit-learn.org/stable/datasets/index.html).
This notebook specifies JSON [schemas](https://json-schema.org/)
for a few representative ones.
Eventually these schemas will probably move to Lale's
[datasets](https://github.com/IBM/lale/tree/master/lale/datasets).

In [1]:
import sklearn.datasets
import lale.helpers
import sklearn.utils
import jsonschema

### digits (image classification)

In [2]:
# Load the digits Bunch and pass a plain-dict copy of it through
# lale.helpers.data_to_json (subsampling disabled); the result is what gets
# validated against the digits schema.
digits_bunch = sklearn.datasets.load_digits()
assert isinstance(digits_bunch, sklearn.utils.Bunch)
digits_data = lale.helpers.data_to_json(
    {**digits_bunch}, subsample_array=False)
digits_keys = digits_data.keys()
print(digits_keys)
digits_n_samples = len(digits_data['data'])
print('n_samples: {}'.format(digits_n_samples))

dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])
n_samples: 1797


In [3]:
# JSON schema for the converted digits dataset: 1797 samples, each present
# both as a flat row of 64 values ('data') and as an 8x8 grid ('images').
digits_schema = {
  '$schema': 'http://json-schema.org/draft-04/schema#',
  'type': 'object',
  'properties': {
    'data': {
      'type': 'array',
      'minItems': 1797, 'maxItems': 1797,
      'items': {
        'type': 'array',
        'minItems': 64, 'maxItems': 64,
        'items': {
          'type': 'number',
          'minimum': 0.0}}},
    'target': {
      'type': 'array',
      'minItems': 1797, 'maxItems': 1797,
      'items': {
        'type': 'integer',
        'minimum': 0,
        'maximum': 9}},
    'target_names': {
      'type': 'array',
      'minItems': 10, 'maxItems': 10,
      # Positional validation: the i-th entry must be exactly the digit i.
      'items': [{'enum': [digit]} for digit in range(10)]},
    'images': {
      'type': 'array',
      'minItems': 1797, 'maxItems': 1797,
      'items': {
        'type': 'array',
        'minItems': 8, 'maxItems': 8,
        'items': {
          'type': 'array',
          'minItems': 8, 'maxItems': 8,
          # 'minimum' belongs to the per-pixel number schema; placed next to
          # 'items' on the enclosing array schema it would constrain nothing.
          'items': {
            'type': 'number',
            'minimum': 0.0}}}},
    'DESCR': {
      'type': 'string'}}}
lale.helpers.validate_is_schema(digits_schema)

In [4]:
# Check the converted digits data against its schema; a violation raises.
jsonschema.validate(instance=digits_data, schema=digits_schema)

### news (text classification)

In [5]:
# Fetch the 20-newsgroups Bunch and pass a plain-dict copy of it through
# lale.helpers.data_to_json (subsampling disabled); the result is what gets
# validated against the news schema.
news_bunch = sklearn.datasets.fetch_20newsgroups()
assert isinstance(news_bunch, sklearn.utils.Bunch)
news_data = lale.helpers.data_to_json(
    {**news_bunch}, subsample_array=False)
news_keys = news_data.keys()
print(news_keys)
news_n_docs = len(news_data['data'])
print('n_samples: {}'.format(news_n_docs))

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
n_samples: 11314


In [6]:
# Dataset dimensions, named once so the per-field bounds below stay in sync.
news_n_samples = 11314
news_categories = [
  'alt.atheism',
  'comp.graphics',
  'comp.os.ms-windows.misc',
  'comp.sys.ibm.pc.hardware',
  'comp.sys.mac.hardware',
  'comp.windows.x',
  'misc.forsale',
  'rec.autos',
  'rec.motorcycles',
  'rec.sport.baseball',
  'rec.sport.hockey',
  'sci.crypt',
  'sci.electronics',
  'sci.med',
  'sci.space',
  'soc.religion.christian',
  'talk.politics.guns',
  'talk.politics.mideast',
  'talk.politics.misc',
  'talk.religion.misc']

# Schema fragments shared by several fields of the news dataset.
news_string_column = {
  'type': 'array',
  'minItems': news_n_samples, 'maxItems': news_n_samples,
  'items': {
    'type': 'string'}}

# JSON schema for the converted 20-newsgroups dataset.
news_schema = {
  '$schema': 'http://json-schema.org/draft-04/schema#',
  'type': 'object',
  'properties': {
    'data': dict(news_string_column),
    'filenames': dict(news_string_column),
    'target_names': {
      'type': 'array',
      'minItems': len(news_categories), 'maxItems': len(news_categories),
      # Positional validation: the i-th name must equal the i-th category.
      'items': [{'enum': [category]} for category in news_categories]},
    'target': {
      'type': 'array',
      'minItems': news_n_samples, 'maxItems': news_n_samples,
      'items': {
        'type': 'integer',
        'minimum': 0,
        'maximum': len(news_categories) - 1}},
    'DESCR': {
      'type': 'string'}}}
lale.helpers.validate_is_schema(news_schema)

In [7]:
# Check the converted news data against its schema; a violation raises.
jsonschema.validate(instance=news_data, schema=news_schema)

### boston (structured data, no categorical features, regression)

In [8]:
# Load the Boston housing Bunch and pass a plain-dict copy of it through
# lale.helpers.data_to_json (subsampling disabled); the result is what gets
# validated against the boston schema.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the scikit-learn version this notebook is pinned to.
boston_bunch = sklearn.datasets.load_boston()
assert isinstance(boston_bunch, sklearn.utils.Bunch)
boston_data = lale.helpers.data_to_json(
    {**boston_bunch}, subsample_array=False)
boston_keys = boston_data.keys()
print(boston_keys)
boston_n_rows = len(boston_data['data'])
print('n_samples: {}'.format(boston_n_rows))

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])
n_samples: 506


In [9]:
# JSON schema for the converted Boston housing dataset: 506 rows of 13
# features each, plus one regression target value (MEDV) per row.
boston_schema = {
  '$schema': 'http://json-schema.org/draft-04/schema#',
  'type': 'object',
  'properties': {
    'data': {
      'type': 'array',
      'minItems': 506, 'maxItems': 506,
      'items': {
        'type': 'array',
        'minItems': 13, 'maxItems': 13,
        # Positional validation: one schema per feature column, in the same
        # order as 'feature_names'. MEDV is the target, not a feature, so it
        # is described under 'target' rather than listed as a 14th column.
        'items': [
          { 'description': 'CRIM, per capita crime rate by town',
            'type': 'number', 'minimum': 0.0},
          { 'description':
              'ZN, proportion of residential land zoned for lots over '
              '25,000 sq.ft.',
            'type': 'number', 'minimum': 0.0, 'maximum': 100.0},
          { 'description':
              'INDUS, proportion of non-retail business acres per town',
            'type': 'number', 'minimum': 0.0, 'maximum': 100.0},
          { 'description':
              'CHAS, Charles River dummy variable '
              '(= 1 if tract bounds river; 0 otherwise)',
            'enum': [0, 1]},
          { 'description':
              'NOX, nitric oxides concentration (parts per 10 million)',
            'type': 'number', 'minimum': 0.0},
          { 'description': 'RM, average number of rooms per dwelling',
            'type': 'number', 'minimum': 0.0},
          { 'description':
              'AGE, proportion of owner-occupied units built prior to 1940',
            'type': 'number', 'minimum': 0.0, 'maximum': 100.0},
          { 'description':
              'DIS, weighted distances to five Boston employment centres',
            'type': 'number', 'minimum': 0.0},
          { 'description':
              'RAD, index of accessibility to radial highways',
            'type': 'number', 'minimum': 0.0},
          { 'description':
              'TAX, full-value property-tax rate per $10,000',
            'type': 'number', 'minimum': 0.0},
          { 'description':
              'PTRATIO, pupil-teacher ratio by town',
            'type': 'number', 'minimum': 0.0},
          { 'description':
              'B, 1000(Bk - 0.63)^2 where Bk is the proportion of '
              'blacks by town',
            'type': 'number', 'minimum': 0.0},
          { 'description': 'LSTAT, % lower status of the population',
            'type': 'number', 'minimum': 0.0, 'maximum': 100.0}]}},
    'target': {
      'type': 'array',
      'minItems': 506, 'maxItems': 506,
      'items': {
        'description':
          "MEDV, Median value of owner-occupied homes in $1000's",
        'type': 'number',
        'minimum': 5.0,
        'maximum': 50.0}},
    'feature_names': {
      'type': 'array',
      'minItems': 13, 'maxItems': 13,
      'items': [
        {'enum': ['CRIM']},
        {'enum': ['ZN']},
        {'enum': ['INDUS']},
        {'enum': ['CHAS']},
        {'enum': ['NOX']},
        {'enum': ['RM']},
        {'enum': ['AGE']},
        {'enum': ['DIS']},
        {'enum': ['RAD']},
        {'enum': ['TAX']},
        {'enum': ['PTRATIO']},
        {'enum': ['B']},
        {'enum': ['LSTAT']}]},
    'DESCR': {
      'type': 'string'},
    'filename': {
      'type': 'string'}}}
lale.helpers.validate_is_schema(boston_schema)

In [10]:
# Check the converted boston data against its schema; a violation raises.
jsonschema.validate(instance=boston_data, schema=boston_schema)

### mnist (image, from tensorflow datasets)

In [11]:
import tensorflow_datasets as tfds

# Downloaded files are stored in ~/tensorflow_datasets/mnist/1.0.0/.
# with_info=True makes the result a pair: the datasets at index 0 and the
# dataset info object at index 1.
mnist_data = tfds.load(name="mnist", with_info=True)
available_builders = tfds.list_builders()
print(available_builders)

I0709 09:40:59.892229 4531561920 dataset_builder.py:157] Overwrite dataset info from restored data version.
I0709 09:40:59.896569 4531561920 dataset_builder.py:193] Reusing dataset mnist (/Users/kakateus.ibm.com/tensorflow_datasets/mnist/1.0.0)
W0709 09:40:59.941802 4531561920 deprecation.py:323] From /Users/kakateus.ibm.com/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/control_flow_ops.py:423: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

['bair_robot_pushing_small', 'cats_vs_dogs', 'celeb_a', 'celeb_a_hq', 'cifar10', 'cifar100', 'coco2014', 'diabetic_retinopathy_detection', 'dummy_dataset_shared_generator', 'dummy_mnist', 'fashion_mnist', 'image_label_folder', 'imagenet2012', 'imdb_reviews', 'lm1b', 'lsun', 'mnist', 'moving_mnist', 'nsynth', 'omniglot', 'open_images_v4', 'quickdraw_bitmap', 'squad', 'starcraft_video', 'svhn_cropped', 'tf_flowers', 'wmt_translate_ende', 'wmt_translate_enfr']


In [12]:
import json

# mnist_data starts out as the (datasets, info) pair returned by tfds.load.
# Repackage it as a dict only while it is still that pair, so that re-running
# this cell (when mnist_data is already the dict built here) is a no-op
# instead of failing on mnist_data[0].
if not isinstance(mnist_data, dict):
    mnist_splits, mnist_info = mnist_data[0], mnist_data[1]
    mnist_data = {
      'data': mnist_splits,
      'info': json.loads(mnist_info.as_json)
    }
mnist_data

{'data': {'test': <DatasetV1Adapter shapes: {image: (28, 28, 1), label: ()}, types: {image: tf.uint8, label: tf.int64}>,
  'train': <DatasetV1Adapter shapes: {image: (28, 28, 1), label: ()}, types: {image: tf.uint8, label: tf.int64}>},
 'info': {'citation': '@article{lecun2010mnist,\n  title={MNIST handwritten digit database},\n  author={LeCun, Yann and Cortes, Corinna and Burges, CJ},\n  journal={ATT Labs [Online]. Available: http://yann. lecun. com/exdb/mnist},\n  volume={2},\n  year={2010}\n}\n',
  'description': 'The MNIST database of handwritten digits.',
  'downloadChecksums': {'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz': '8d422c7b0a1c1c79245a5bcf07fe86e33eeafee792b84584aec276f5a2dbc4e6',
   'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz': 'f7ae60f92e00ec6debd23a6088c31dbd2371eca3ffa0defaefb259924204aec6',
   'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz': '440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609',
   'http://y

In [13]:
def _mnist_split_schema(n_examples):
    """Return the JSON schema of one MNIST split with n_examples examples.

    Both 'image' and 'label' are arrays with exactly n_examples entries, so
    that every image has a matching label.
    """
    return {
      'type': 'object',
      'properties': {
        'image': {
          'type': 'array',
          'minItems': n_examples, 'maxItems': n_examples,
          # NOTE(review): tfds reports the per-example image shape as
          # (28, 28, 1); this schema describes 28x28 grids without the
          # trailing channel axis -- confirm how images get serialized.
          'items': {
            'type': 'array',
            'minItems': 28, 'maxItems': 28,
            'items': {
              'type': 'array',
              'minItems': 28, 'maxItems': 28,
              'items': {
                'type': 'number',
                'minimum': 0.0, 'maximum': 255.0}}}},
        'label': {
          # One digit label per example, parallel to 'image'; a bare integer
          # here would describe a single label for the whole split.
          'type': 'array',
          'minItems': n_examples, 'maxItems': n_examples,
          'items': {
            'type': 'integer',
            'minimum': 0, 'maximum': 9}}}}

# JSON schema for the MNIST data: a 10000-example test split, a
# 60000-example train split, and the dataset info with its exact citation.
mnist_schema = {
  '$schema': 'http://json-schema.org/draft-04/schema#',
  'type': 'object',
  'properties': {
    'data': {
      'type': 'object',
      'properties': {
        'test': _mnist_split_schema(10000),
        'train': _mnist_split_schema(60000)}},
    'info': {
      'type': 'object',
      'properties': {
        'citation': {'enum': [mnist_data['info']['citation']]}}}}}
lale.helpers.validate_is_schema(mnist_schema)