### This notebook provides some examples of how the stored metadata can be queried.

In [1]:
import pandas as pd
from datetime import datetime
from uuid import uuid4
from cmflib import cmfquery

##### Initialize the library and get all the stages in the pipeline
Point the library to the metadata file. <br>
The `get_pipeline_stages` call returns the names of the stages in the given pipeline.

In [2]:
# Location of the metadata file and the name of the pipeline to inspect.
# These are the only two values to edit when pointing the notebook at a
# different metadata store or pipeline.
MLMD_PATH = "/home/royann/example-get-started/mlmd"
PIPELINE_NAME = "Test-env"

# `query` is the handle used by every cell below.
query = cmfquery.CmfQuery(MLMD_PATH)

# Names of the stages recorded for the pipeline.
stages = query.get_pipeline_stages(PIPELINE_NAME)
print(stages)

['Prepare', 'Featurize', 'Train', 'Evaluate']


##### Query the executions in the `Prepare` stage

In [3]:
# Fetch the executions recorded for the 'Prepare' stage and print them.
executions = query.get_all_executions_in_stage('Prepare')
print(executions)

<ml_metadata.metadata_store.metadata_store.MetadataStore object at 0x7f82b0c6abe0>
   Context_ID Context_Type                            Execution  \
0           2      Prepare  ['src/prepare.py', 'data/data.xml']   

  Git_End_Commit                                         Git_Repo  \
0                 git@github.hpe.com:annmary-roy/crop-dataset.git   

  Git_Start_Commit Pipeline_Type  Pipeline_id  id  user-metadata1  
0                       Test-env            1   1  metadata_value  


##### Query the executions in the `Featurize` stage


In [4]:
# Fetch the executions recorded for the 'Featurize' stage and print them.
executions = query.get_all_executions_in_stage('Featurize')
print(executions)

<ml_metadata.metadata_store.metadata_store.MetadataStore object at 0x7f82b0c6abe0>
   Context_ID         Context_Type  \
0           3  Featurize-execution   

                                           Execution Git_End_Commit  \
0  ['src/featurization.py', 'data/prepared', 'dat...                  

                                          Git_Repo Git_Start_Commit  \
0  git@github.hpe.com:annmary-roy/crop-dataset.git                    

  Pipeline_Type  Pipeline_id  id  
0      Test-env            1   2  


In [5]:
# Query the executions in each stage.
# NOTE(review): this heading sits in a code cell, where it only runs as a
# comment; it should be a markdown cell.

In [6]:
# Fetch the executions recorded for the 'Train' stage and print them.
executions = query.get_all_executions_in_stage('Train')
print(executions)

<ml_metadata.metadata_store.metadata_store.MetadataStore object at 0x7f82b0c6abe0>
   Context_ID     Context_Type  \
0           4  Train-execution   

                                        Execution Git_End_Commit  \
0  ['src/train.py', 'data/features', 'model.pkl']                  

                                          Git_Repo Git_Start_Commit  \
0  git@github.hpe.com:annmary-roy/crop-dataset.git                    

  Pipeline_Type  Pipeline_id  id  
0      Test-env            1   3  


##### Query the executions in the `Evaluate` stage

In [7]:
# Fetch the executions recorded for the 'Evaluate' stage and print them.
executions = query.get_all_executions_in_stage('Evaluate')
print(executions)

<ml_metadata.metadata_store.metadata_store.MetadataStore object at 0x7f82b0c6abe0>
   Context_ID        Context_Type  \
0           5  Evaluate-execution   

                                           Execution Git_End_Commit  \
0  ['src/evaluate.py', 'model.pkl', 'data/feature...                  

                                          Git_Repo Git_Start_Commit  \
0  git@github.hpe.com:annmary-roy/crop-dataset.git                    

  Pipeline_Type  Pipeline_id  id  
0      Test-env            1   4  


##### Get all the artifacts of an execution.
<b>input parameter - execution_id</b><br>
<b>output parameter - artifacts</b><br>


In [8]:
# List every artifact linked to the execution with id 1. In the printed table
# the `event` column marks each artifact as INPUT or OUTPUT for that execution.
artifacts = query.get_all_artifacts_for_execution(1)
print(artifacts)

                                            Commit  \
0  commit d17409c492cc3631ac079499a6b9a2f81f34d04a   
1  commit 793d32d05844d13d18e6749b2f3202b64f4112cf   
2  commit a26b4dfff249bfa00bc6ef5a5f290345f5eaddb7   
3  commit 80e9ef9db02db5bf07f7d1393afa50991317d9bc   

                                            Remote  create_time_since_epoch  \
0                                              NaN            1645259153600   
1  /tmp/myremote/89/3d3766ae7b0951e6bac6d83a5d9c58            1645259155826   
2                                              NaN            1645259160057   
3                                              NaN            1645259161119   

    event                                         git_repo  id  \
0   INPUT  git@github.hpe.com:annmary-roy/crop-dataset.git   1   
1  OUTPUT  git@github.hpe.com:annmary-roy/crop-dataset.git   2   
2  OUTPUT  git@github.hpe.com:annmary-roy/crop-dataset.git   3   
3  OUTPUT  git@github.hpe.com:annmary-roy/crop-dataset.git   4   

  

In [9]:
# Widen pandas' column display so long artifact names print in full instead of
# being cut off with "...". This is a global pandas option, so it also applies
# to every cell that runs after this one.
pd.set_option("display.max_colwidth", 75)

# Artifacts of the execution with id 2; print only the columns of interest,
# one Series at a time.
artifacts = query.get_all_artifacts_for_execution(2)
for column in ("name", "event"):
    print(artifacts[column])

0    data/prepared/train.tsv:15316092c623e119923f797ff16976d4
1     data/prepared/test.tsv:429fa355ff389d290b5ededb39f7106c
2    data/features/train.pkl:300fa23207dd4af0fe77e1a1b33eb6e1
3     data/features/test.pkl:16de98c80d596309179763e87eb49bd4
Name: name, dtype: object
0     INPUT
1     INPUT
2    OUTPUT
3    OUTPUT
Name: event, dtype: object


#### Get all executions for an artifact (pass the artifact's full name as the input parameter)

In [10]:
# Reverse lookup: list the executions linked to one artifact. The argument is
# the artifact's full name as it appears in the `name` column of the artifact
# listings (a path followed by ':' and an identifier).
linked = query.get_all_executions_for_artifact("data/data.xml:a304afb96060aad90176268345e10355")
print(linked)

    Type  execution_id execution_name  pipeline    stage
0  INPUT             1                 Test-env  Prepare


#### Get all the parent artifacts of an artifact. Provides the artifact lineage chain

In [11]:
# Collect the parent artifacts of data/features/test.pkl. Note that the
# artifact is given by its path alone here, without the ":<identifier>" suffix
# seen in the `name` column.
linked = query.get_all_parent_artifacts("data/features/test.pkl")
print(linked)

                                            Commit  create_time_since_epoch  \
0  commit a26b4dfff249bfa00bc6ef5a5f290345f5eaddb7            1645259160057   
1  commit 80e9ef9db02db5bf07f7d1393afa50991317d9bc            1645259161119   
2  commit d17409c492cc3631ac079499a6b9a2f81f34d04a            1645259153600   

                                          git_repo  id  \
0  git@github.hpe.com:annmary-roy/crop-dataset.git   3   
1  git@github.hpe.com:annmary-roy/crop-dataset.git   4   
2  git@github.hpe.com:annmary-roy/crop-dataset.git   1   

   last_update_time_since_epoch  \
0                 1645259160057   
1                 1645259161119   
2                 1645259153600   

                                                       name     type  \
0  data/prepared/train.tsv:15316092c623e119923f797ff16976d4  Dataset   
1   data/prepared/test.tsv:429fa355ff389d290b5ededb39f7106c  Dataset   
2            data/data.xml:a304afb96060aad90176268345e10355  Dataset   

                    

#### Get all the child artifacts of an artifact. Provides the lineage chain in the downstream direction

In [12]:
linked = query.get_all_child_artifacts("data/features/train.pkl")

# Print the name, type and URI columns, each behind a "<Label> : " prefix.
# NOTE(review): to_string() yields one line per row, so when several child
# artifacts come back the prefix only precedes the first row (see the output
# below) - consider printing the label on its own line instead.
for label, column in (("Name", "name"), ("Type", "type"), ("URI", "uri")):
    print(label + " : " + linked[column].to_string(index=False, header=False))

Name :   model.pkl:19834dbd1c609a4d4fa188a6cae92944:3
metrics:aaae534e-915d-11ec-b106-89841b9859cd:4
Type :   Model
Metrics
URI :     19834dbd1c609a4d4fa188a6cae92944
aaae534e-915d-11ec-b106-89841b9859cd


In [13]:
# Get all the parent artifacts of an artifact. Provides the artifact lineage chain.
# NOTE(review): this heading sits in a code cell, where it only runs as a
# comment; it should be a markdown cell.

In [14]:
linked = query.get_all_parent_artifacts("model.pkl")

# Print each column under an upper-case heading; the values are rendered
# without the row index or the column header.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

NAME
data/features/train.pkl:300fa23207dd4af0fe77e1a1b33eb6e1
data/prepared/train.tsv:15316092c623e119923f797ff16976d4
 data/prepared/test.tsv:429fa355ff389d290b5ededb39f7106c
          data/data.xml:a304afb96060aad90176268345e10355
TYPE
Dataset
Dataset
Dataset
Dataset
URI
300fa23207dd4af0fe77e1a1b33eb6e1
15316092c623e119923f797ff16976d4
429fa355ff389d290b5ededb39f7106c
a304afb96060aad90176268345e10355


In [15]:
linked = query.get_all_parent_artifacts("data/prepared/test.tsv")

# Render values only: no row index and no column header.
values_only = {"index": False, "header": False}

print("Name : {}".format(linked["name"].to_string(**values_only)))
print("Type : {}".format(linked["type"].to_string(**values_only)))
print("URI : {}".format(linked["uri"].to_string(**values_only)))

Name : data/data.xml:a304afb96060aad90176268345e10355
Type : Dataset
URI : a304afb96060aad90176268345e10355


In [16]:
linked = query.get_all_child_artifacts("data/prepared/test.tsv")

# One heading line per column, followed by that column's values printed
# without the row index or the column header.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

NAME
data/features/train.pkl:300fa23207dd4af0fe77e1a1b33eb6e1
 data/features/test.pkl:16de98c80d596309179763e87eb49bd4
            model.pkl:19834dbd1c609a4d4fa188a6cae92944:3
          metrics:aaae534e-915d-11ec-b106-89841b9859cd:4
TYPE
Dataset
Dataset
  Model
Metrics
URI
    300fa23207dd4af0fe77e1a1b33eb6e1
    16de98c80d596309179763e87eb49bd4
    19834dbd1c609a4d4fa188a6cae92944
aaae534e-915d-11ec-b106-89841b9859cd


#### Get immediate child artifacts of an artifact. 

In [17]:
# Only the direct children of data/data.xml are requested here (one hop).
linked = query.get_one_hop_child_artifacts("data/data.xml")

# Heading first, then the column's values with no row index or column header.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

NAME
                slice-a:893d3766ae7b0951e6bac6d83a5d9c58
data/prepared/train.tsv:15316092c623e119923f797ff16976d4
 data/prepared/test.tsv:429fa355ff389d290b5ededb39f7106c
TYPE
Dataslice
  Dataset
  Dataset
URI
893d3766ae7b0951e6bac6d83a5d9c58
15316092c623e119923f797ff16976d4
429fa355ff389d290b5ededb39f7106c


In [18]:
# All child artifacts of data/data.xml, not just the immediate ones.
linked = query.get_all_child_artifacts("data/data.xml")

# Heading first, then the column's values with no row index or column header.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

NAME
                slice-a:893d3766ae7b0951e6bac6d83a5d9c58
data/prepared/train.tsv:15316092c623e119923f797ff16976d4
 data/prepared/test.tsv:429fa355ff389d290b5ededb39f7106c
data/features/train.pkl:300fa23207dd4af0fe77e1a1b33eb6e1
 data/features/test.pkl:16de98c80d596309179763e87eb49bd4
            model.pkl:19834dbd1c609a4d4fa188a6cae92944:3
          metrics:aaae534e-915d-11ec-b106-89841b9859cd:4
TYPE
Dataslice
  Dataset
  Dataset
  Dataset
  Dataset
    Model
  Metrics
URI
    893d3766ae7b0951e6bac6d83a5d9c58
    15316092c623e119923f797ff16976d4
    429fa355ff389d290b5ededb39f7106c
    300fa23207dd4af0fe77e1a1b33eb6e1
    16de98c80d596309179763e87eb49bd4
    19834dbd1c609a4d4fa188a6cae92944
aaae534e-915d-11ec-b106-89841b9859cd


In [19]:
# Artifacts linked to the execution with id 4.
linked = query.get_all_artifacts_for_execution(4)

# Heading first, then the column's values with no row index or column header.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

NAME
  model.pkl:19834dbd1c609a4d4fa188a6cae92944:3
metrics:aaae534e-915d-11ec-b106-89841b9859cd:4
TYPE
  Model
Metrics
URI
    19834dbd1c609a4d4fa188a6cae92944
aaae534e-915d-11ec-b106-89841b9859cd


### Replace the metrics name in the `get_artifact` call with the metrics name from the output of the previous cell

In [22]:
# Fetch the record of a single artifact by its full name. The name used here
# is the metrics artifact printed by the previous cell; presumably it differs
# between runs, so substitute the value from your own output.
artifacts = query.get_artifact("metrics:aaae534e-915d-11ec-b106-89841b9859cd:4")
print(artifacts)


   id     type                                   uri  \
0   8  Metrics  aaae534e-915d-11ec-b106-89841b9859cd   

                                             name  create_time_since_epoch  \
0  metrics:aaae534e-915d-11ec-b106-89841b9859cd:4            1645259202323   

   last_update_time_since_epoch  \
0                 1645259202323   

                                     metrics_name  avg_prec   roc_auc  
0  metrics:aaae534e-915d-11ec-b106-89841b9859cd:4  0.545596  0.958625  


In [23]:
# Parquet file to inspect. Edit this path to match where the example project
# lives on your machine.
# NOTE(review): presumably this is the file behind the `slice-a` Dataslice
# artifact listed in the earlier outputs - confirm.
DATASLICE_PATH = "/home/royann/example-get-started/slice-a"

# Load the file into a DataFrame and print its contents.
new_parquet_df = pd.read_parquet(DATASLICE_PATH)
print(new_parquet_df)

                                                  hash
Path                                                  
data/raw_data/1.xml   9f2519f71190d545eb23ba0d054db089
data/raw_data/18.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/19.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/21.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/27.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/35.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/37.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/42.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/44.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/47.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/58.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/60.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/63.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/66.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/67.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_data/75.xml  f257379463d3165ef51f0c80f50e6a7c
data/raw_d