In [6]:
from typing import List, Dict, Optional, Any
from enum import Enum
from pydantic import AnyUrl, ValidationError, BaseModel
import os
from six.moves.urllib.error import HTTPError, URLError
import pandas as pd
from dlhub_sdk import DLHubClient
from mdf_forge import Forge
import json

class FoundryType(Enum):
    """Allowed values for a dataset's ``type`` field.

    Every member's value is the same lowercase string as its name, so a
    member can be looked up from plain text, e.g. ``FoundryType("tabular")``.
    """

    tabular = "tabular"
    files = "files"
    other = "other"
    
class FoundrySplit(BaseModel):
    """Placeholder model; it declares no fields of its own yet."""
        
class FoundryBase(BaseModel):
    """Foundry-specific description of a dataset.

    Every field carries a default, so the model can be created empty and
    populated from a metadata dictionary later.
    """

    # Entries naming the dataset's inputs and outputs
    # (presumably column/key names for tabular data -- TODO confirm).
    inputs: List = []
    outputs: List = []
    # Optional descriptions accompanying `inputs` / `outputs`.
    input_descriptions: Optional[List] = []
    output_descriptions: Optional[List] = []
    # Dataset kind. Declared Optional so the annotation agrees with the
    # None default.
    type: Optional[FoundryType] = None
    # URLs associated with the dataset.
    uri: Optional[List[AnyUrl]] = []
    # Hash string. The default is None (not an empty list) so that it agrees
    # with the declared string type.
    hash: Optional[str] = None
    references: Optional[List[str]] = []
    # Untyped slot for a loaded data object; None until something is assigned.
    dataframe: Optional[Any] = None

    class Config:
        arbitrary_types_allowed = True

class FoundryConfig(BaseModel):
    """Settings model: two file names, a boolean flag and a cache directory."""

    dataframe_file: Optional[str] = ""
    metadata_file: Optional[str] = ""
    # Explicit annotations: un-annotated attributes depend on pydantic's type
    # inference from the default value, which newer pydantic releases reject.
    # The inferred types (bool, str) and the field order are unchanged.
    from_file: bool = True
    local_cache_dir: str = ""
    
class Foundry(BaseModel):
    """Top-level metadata model combining `dc`, `foundry` and `config`."""

    # Free-form dictionary (presumably DataCite metadata -- TODO confirm).
    dc: Optional[Dict] = {}  # TODO: pydantic datacite?
    # Default to an empty FoundryBase instead of a plain dict. Defaults are
    # not validated, so a dict default would be handed out as-is and
    # attribute access such as `self.foundry.inputs` would fail.
    foundry: FoundryBase = FoundryBase()
    config: FoundryConfig = FoundryConfig(
        dataframe_file="foundry_dataframe.json",
        metadata_file="foundry_metadata.json",
        from_file=True,
        local_cache_dir="~/.foundry",
    )

    class Config:
        arbitrary_types_allowed = True

class FoundryClient(Foundry):
    """Foundry model extended with service clients and helper methods."""

    # Both clients are constructed once, when this class body is executed,
    # and serve as the class-level defaults for every instance.
    dlhub_client = DLHubClient()
    forge_client = Forge('mdf-test')

    def describe(self):
        """Print the `dc` metadata and the input/output descriptions."""
        print("DC:{}".format(self.dc))
        # The description lists are fields of the nested `foundry` model,
        # not of this model, so they are read through `self.foundry`.
        print("Inputs:{}".format(self.foundry.input_descriptions))
        print("Outputs:{}".format(self.foundry.output_descriptions))

    def from_file(self, file=None):
        """Create a new FoundryClient from a JSON metadata file.

        Args:
            file: Name of the JSON file; it is opened as "./<file>", i.e.
                relative to the current working directory. When None,
                `self.config.metadata_file` is used.

        Returns:
            A new FoundryClient built from the file's top-level JSON keys.
        """
        if file is None:
            file = self.config.metadata_file
        with open("./{}".format(file)) as fp:
            obj = json.load(fp)
        return FoundryClient(**obj)
            


In [17]:
# NOTE(review): this import rebinds the name `Foundry`, shadowing the class
# of the same name defined in the first cell of this notebook.
from foundry import Foundry
# NOTE(review): `external_data` is defined in a cell that appears further
# down in this notebook; that cell must be run first.
f = Foundry(**external_data)

In [20]:
# Display the `dataframe` attribute of `f` as the cell output.
# NOTE(review): in the models defined in this notebook `dataframe` is declared
# on FoundryBase, not on Foundry; `f` comes from the imported `foundry`
# package -- confirm that it exposes this attribute.
f.dataframe

In [15]:
# Example metadata dictionary with two top-level sections: "dc" and "dataset".
external_data = {
    "dc": {
        "titles": [
            {"title": "My Dataset Title"},
        ],
    },
    "dataset": {
        "inputs": ["a"],
        "outputs": ["c"],
        "type": "tabular",
        "uri": ["https://s3.amazonaws.com/keras-datasets/boston_housing.npz"],
        "hash": "asdaasdhahd87264283674",
        "references": ["@abc"],
    },
}

In [64]:
# Serialize `ds` to a JSON string, excluding the two client attributes.
# NOTE(review): `ds` must already exist in the kernel when this cell runs.
ds.json(exclude={"dlhub_client","forge_client"})

'{"dc": {}, "foundry": {"inputs": ["configuration", "magnetic_moment", "stability", "delta_e", "total_energy", "volume_pa", "composition", "dft_converged", "dft_cutoff_energy", "dft_exchange_correlation_functional", "crystal_structure_number_of_atoms", "crystal_structure_space_group", "crystal_structure_volume"], "outputs": ["bandgap"], "input_descriptions": [], "output_descriptions": [], "type": "tabular", "uri": [], "hash": [], "references": []}, "dataframe": null, "config": {"dataframe_file": "foundry_dataframe.json", "metadata_file": "foundry_metadata.json", "from_file": true, "local_cache_dir": "~/.foundry"}}'

In [16]:
# Build a default Foundry from the `foundry` package and call `get_data`.
# NOTE(review): the three arguments look like URL scheme, host and path --
# confirm against get_data's signature.
# NOTE(review): the output recorded for this cell is
# "NameError: name 'short_name' is not defined".
from foundry import Foundry
f=Foundry()
f.get_data("https","s3.amazonaws.com","/keras-datasets/boston_housing.npz")

Using TensorFlow backend.


NameError: name 'short_name' is not defined

In [14]:
# Same `get_data` call as in the cell above; relies on `f` already existing
# in the kernel.
f.get_data("https","s3.amazonaws.com","/keras-datasets/boston_housing.npz")

[0;31mSignature:[0m
[0mget_file[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfname[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0morigin[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muntar[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmd5_hash[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfile_hash[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcache_subdir[0m[0;34m=[0m[0;34m'datasets'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhash_algorithm[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mextract[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0marchive_format[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcache_dir[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Downloads a file from a URL if it not already in the cache.

By default the fil

In [2]:
# Flat metadata dictionary used to build a FoundryDataset.
external_data = {
    "inputs": ["a"],
    "outputs": ["c"],
    "type": "tabular",
    "uri": ["https://s3.amazonaws.com/keras-datasets/boston_housing.npz"],
    "hash": "asdaasdhahd87264283674",
}

# Build the dataset; if validation fails, print the error instead of raising.
try:
    ds = FoundryDataset(**external_data)
except ValidationError as validation_error:
    print(validation_error)


In [3]:
# Display the repr of `ds` as the cell output.
ds

FoundryDataset(inputs=['a'], outputs=['c'], type=<FoundryType.tabular: 'tabular'>, uri=[AnyUrl('https://s3.amazonaws.com/keras-datasets/boston_housing.npz', scheme='https', host='s3.amazonaws.com', tld='com', host_type='domain', path='/keras-datasets/boston_housing.npz')], hash='asdaasdhahd87264283674')

In [31]:
# Print the JSON schema of FoundryDataset, indented by two spaces.
print(FoundryDataset.schema_json(indent=2))

{
  "title": "FoundryDataset",
  "type": "object",
  "properties": {
    "inputs": {
      "title": "Inputs",
      "type": "array",
      "items": {}
    },
    "outputs": {
      "title": "Outputs",
      "type": "array",
      "items": {}
    },
    "type": {
      "title": "Type",
      "type": "string"
    }
  },
  "required": [
    "inputs",
    "outputs",
    "type"
  ]
}
