# EXPLORATIONS

In [117]:
import requests
import urllib3
import h5py
from src.ExplorerSB.project import Project
import src.ExplorerSB.constants as cn
import pandas as pd
import typing

# Reading a file from a URL

In [7]:
# This code retrieves a file
url = 'https://storage.googleapis.com/files.biosimulations.org/simulations/621d90b9b50991044c7a1ea6/contents/iYS854.xml'
r = requests.get(url, allow_redirects=True)
open('iYS854.xml', 'wb').write(r.content)

6273418

In [9]:
urllib3.util.parse_url(url)

Url(scheme='https', auth=None, host='storage.googleapis.com', port=None, path='/files.biosimulations.org/simulations/621d90b9b50991044c7a1ea6/contents/iYS854.xml', query=None, fragment=None)

In [13]:
def getUrlFile(file_url):
    splits = file_url.split("/")
    return splits[-1]

getUrlFile(url)

'iYS854.xml'

# HDF5 files

In [7]:
!ls ../local/cache/61fea483f499ccf25faafc4d/outputs

log.yml    reports.h5


In [8]:
path = "../local/cache/61fea483f499ccf25faafc4d/outputs/reports.h5"
fd5 = h5py.File(path, 'r')

In [9]:
dir(fd5)

['_MutableMapping__marker',
 '__abstractmethods__',
 '__bool__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_d',
 '_e',
 '_gcpl_crt_order',
 '_id',
 '_ipython_key_completions_',
 '_lapl',
 '_lcpl',
 '_libver',
 'attrs',
 'build_virtual_dataset',
 'clear',
 'close',
 'copy',
 'create_dataset',
 'create_dataset_like',
 'create_group',
 'create_virtual_dataset',
 'driver',
 'file',
 'filename',
 'flush',
 'get',
 'id',
 'items',
 'key

In [11]:
[t for t in fd5.items()]

[('simulation.sedml', <HDF5 group "/simulation.sedml" (3 members)>)]

In [12]:
fd5.keys()

<KeysViewHDF5 ['simulation.sedml']>

In [14]:
fd5["simulation.sedml"].keys()

<KeysViewHDF5 ['report_del_Cln2', 'report_del_Cln3', 'report_wt']>

In [23]:
dataset = fd5["simulation.sedml"]["report_wt"]
dataset.shape

(19, 101)

In [24]:
dataset[:,:]

array([[  0.,   1.,   2., ...,  98.,  99., 100.],
       [  1.,   0.,   1., ...,   1.,   1.,   1.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       ...,
       [  1.,   0.,   0., ...,   1.,   1.,   1.],
       [  0.,   0.,   0., ...,   1.,   1.,   1.],
       [  1.,   0.,   0., ...,   0.,   0.,   1.]])

In [27]:
dataset.attrs.keys()

<KeysViewHDF5 ['_type', 'sedmlDataSetDataTypes', 'sedmlDataSetIds', 'sedmlDataSetLabels', 'sedmlDataSetNames', 'sedmlDataSetShapes', 'sedmlId', 'sedmlName', 'uri']>

In [31]:
dataset.attrs['sedmlDataSetIds']

array(['wt_data_set_time', 'wt_data_set_Cln3', 'wt_data_set_SMBF',
       'wt_data_set_Cln2', 'wt_data_set_Clb5', 'wt_data_set_Yhp1',
       'wt_data_set_Clb2', 'wt_data_set_SFF', 'wt_data_set_Cdc20',
       'wt_data_set_FEAR', 'wt_data_set_MEN', 'wt_data_set_Cdc14',
       'wt_data_set_Swi5', 'wt_data_set_CKI', 'wt_data_set_Cdh1',
       'wt_data_set_S', 'wt_data_set_B', 'wt_data_set_M',
       'wt_data_set_CD'], dtype=object)

In [33]:
PROJECT_IDs = ["iYS854", "Yeast-cell-cycle-Irons-J-Theor-Biol-2009"]

In [37]:
projects = [Project(i) for i in PROJECT_IDs]
_ = [p.initialize() for p in projects]

[None, None]

In [43]:
dct = {cn.PROJECT_ID: [], "simulation.sedml": [], "report1_sedmlDataSetIds": []}

In [101]:
# Check that the files all have the same structure
def run():
    num_good = 0
    num_bad = 0
    generator = Project.iterateProjects()
    for project in generator:
        hd5_path = project.getH5FilePath()
        fd5 = h5py.File(hd5_path, 'r')
        # Find a sedml key
        for key in fd5.keys():
            selected_key = None
            if ".sedml" in key:
                selected_key = key
                break
        if selected_key is None:
            print("***No *.sedml in %s" % project.project_id)
            num_bad += 1
        else:
            datasets = list(fd5[selected_key].values())
            selected_dataset = None
            for dataset in datasets:
                if ("report" in dataset.name) or ("plot" in dataset.name):
                    selected_dataset = dataset
            if selected_dataset is None:
                print ("***No report or plot data set in %s" % (project.project_id))
                continue
            if not "sedmlDataSetIds" in selected_dataset.attrs.keys():
                print ("***No sedmlDataSetIds for %s/%s" % (project.project_id, selected_dataset.name))
                num_bad += 1
            else:
                #print(dataset.shape, len(dataset.attrs["sedmlDataSetIds"]))
                #print(dataset[:,:])
                num_good += 1
    return num_good, num_bad
                
#
run()

***No report or plot data set in Escherichia-coli-resource-allocation-Bulovic-Metab-Eng-2019
***No report or plot data set in RBC-metabolism-Bordbar-Cell-Syst-2015
***No report or plot data set in e_coli_core
***No report or plot data set in iAB_RBC_283
***No report or plot data set in iAF1260
***No report or plot data set in iAF1260b
***No report or plot data set in iAF987
***No report or plot data set in iAM_Pb448
***No report or plot data set in iAM_Pc455
***No report or plot data set in iAM_Pf480
***No report or plot data set in iAM_Pk459
***No report or plot data set in iAM_Pv461
***No report or plot data set in iAPECO1_1312
***No report or plot data set in iB21_1397
***No report or plot data set in iBWG_1329
***No report or plot data set in iE2348C_1286
***No report or plot data set in iEC042_1314
***No report or plot data set in iEC1344_C
***No report or plot data set in iEC1349_Crooks
***No report or plot data set in iEC1356_Bl21DE3
***No report or plot data set in iEC1364_W
**

(421, 15)

In [134]:
def getH5Dataframes(path:str):
    """
    Recursively searches a Biosimulations HDF5 file for datasets.
    
    Args:
        path: path to HDF5 file
        
    Returns:
        DataFrames
           name: name of the dataset
           columns: variables
           rows: instance
    """
    def findDataframes(item, group_names, dfs):
        """
        Recursively searches groups for datasets with sedmlDataSetIds.
        
        Args:
            item: Group/Dataset
            group_names: list-str
            dfs: list-DataFrame
        Returns:
            list-DataFrame
        """
        names = list(group_names)
        names.append(item.name)
        if "Dataset" in str(type(item)):
            # Encountered a leaf in the container graph
            if "sedmlDataSetIds" in item.attrs.keys():
                index = list(item.attrs["sedmlDataSetIds"])
                df = pd.DataFrame(item[:,:], index=index)
                df = df.T
                df.name = "--".join(names)
                import pdb; pdb.set_trace()
                dfs.append(df)
                return dfs
        else:
            for key in item.keys():
                new_names = list(names)
                new_names.append(key)
                import pdb; pdb.set_trace()
                return findDataframes(item[key], new_names, dfs)
    #  
    with h5py.File(path, 'r') as fd:
        return findDataframes(fd, [], [])
    
# TESTS
path = "../local/cache/61fea483f499ccf25faafc4d/outputs/reports.h5"
dfs = getH5Dataframes(path)

> [0;32m/var/folders/02/lb248q1j50dch2pthzkvxmpr0000gn/T/ipykernel_96163/311445158.py[0m(42)[0;36mfindDataframes[0;34m()[0m
[0;32m     40 [0;31m                [0mnew_names[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mkey[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     41 [0;31m                [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 42 [0;31m                [0;32mreturn[0m [0mfindDataframes[0m[0;34m([0m[0mitem[0m[0;34m[[0m[0mkey[0m[0;34m][0m[0;34m,[0m [0mnew_names[0m[0;34m,[0m [0mdfs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     43 [0;31m    [0;31m#[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     44 [0;31m    [0;32mwith[0m [0mh5py[0m[0;34m.[0m[0mFile[0m[0;34m([0m[0mpath[0m[0;34m,[0m [0;34m'r'[0m[0;34m)[0m [0;32mas[0m [0mfd[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  c


> [0;32m/var/folders/02/lb248q1j50dch2pthzkvxmpr0000gn/T/ipykernel_96163/311445158.py[0m(42)[0;36mfindDataframes[0;34m()[0m
[0;32m     40 [0;31m                [0mnew_names[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mkey[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     41 [0;31m                [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 42 [0;31m                [0;32mreturn[0m [0mfindDataframes[0m[0;34m([0m[0mitem[0m[0;34m[[0m[0mkey[0m[0;34m][0m[0;34m,[0m [0mnew_names[0m[0;34m,[0m [0mdfs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     43 [0;31m    [0;31m#[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     44 [0;31m    [0;32mwith[0m [0mh5py[0m[0;34m.[0m[0mFile[0m[0;34m([0m[0mpath[0m[0;34m,[0m [0;34m'r'[0m[0;34m)[0m [0;32mas[0m [0mfd[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  c


> [0;32m/var/folders/02/lb248q1j50dch2pthzkvxmpr0000gn/T/ipykernel_96163/311445158.py[0m(35)[0;36mfindDataframes[0;34m()[0m
[0;32m     33 [0;31m                [0mdf[0m[0;34m.[0m[0mname[0m [0;34m=[0m [0;34m"--"[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0mnames[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     34 [0;31m                [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 35 [0;31m                [0mdfs[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mdf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     36 [0;31m                [0;32mreturn[0m [0mdfs[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     37 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  df


     del_Cln2_data_set_time  del_Cln2_data_set_Cln3  del_Cln2_data_set_SMBF  \
0                       0.0                     1.0                     1.0   
1                       1.0                     0.0                     0.0   
2                       2.0                     0.0                     0.0   
3                       3.0                     1.0                     0.0   
4                       4.0                     1.0                     1.0   
..                      ...                     ...                     ...   
96                     96.0                     0.0                     0.0   
97                     97.0                     0.0                     0.0   
98                     98.0                     1.0                     0.0   
99                     99.0                     1.0                     0.0   
100                   100.0                     1.0                     0.0   

     del_Cln2_data_set_Cln2  del_Cln2_data_set_Clb5

ipdb>  c


In [136]:
len(dfs)

1

In [110]:
project_id = "modeldb-206364"
project = Project(project_id)
project.initialize()
hd5_path = project.getH5FilePath()
fd5 = h5py.File(hd5_path, 'r')
fd5.keys()

<KeysViewHDF5 ['XPP']>

In [126]:
type(fd5['XPP']['aEIF.sedml']['report'])

h5py._hl.dataset.Dataset

In [116]:
fd5["XPP"]["aEIF.sedml"]['report'].attrs['uri']

'XPP/aEIF.sedml/report'

In [None]:
def mkDataFrames(project_id):
    """
    Creates dataframes from the h5 file for the project.
    
    Args:
        project_id
        
    Returns:
        list-DataFrame
            name = key
            columns variable names
    """
    # Check that the filedds all have the same structure
    dfs = []
    hd5_path = project.getH5FilePath()
    fd5 = h5py.File(hd5_path, 'r')
    # Find a sedml key
    keys = [k for k in fd5.keys() if ".sedml" in k]
    for key in key:
        datasets = fd5[key]
        names = []
        for dataset in datasets:
            if "report" in dataset.name:
                df = pd.DataFrame(dataset[:, :])
                df = df.T
        if selected_dataset is None:
            print ("***No report data set in %s" % (project.project_id))
            continue
        if not "sedmlDataSetIds" in selected_dataset.attrs.keys():
            print ("***No sedmlDataSetIds for %s/%s" % (project.project_id, selected_dataset.name))
            num_bad += 1
        else:
            #print(dataset.shape, len(dataset.attrs["sedmlDataSetIds"]))
            #print(dataset[:,:])
                
#

In [84]:
df = pd.DataFrame

In [89]:
df.name = "aaa"