### This Jupyter notebook provides examples of how the pipeline metadata stored with the CMF tracking layer can be queried with the CMF query layer.

In [1]:
import time

import pandas as pd
from cmflib import cmfquery
from cmflib.merger import parse_json_to_mlmd

##### Initialize the library and export the pipeline metadata
Point the library to the metadata file. <br>
The `dumptojson` call returns the metadata of the named pipeline (here `Test-env`), including its stages and executions, as a JSON string.

In [2]:
# Point the query layer at the metadata file, then export the metadata of
# one pipeline as a JSON string for the cells below.
mlmd_path = "./mlmd"
pipeline_name = "Test-env"

query = cmfquery.CmfQuery(mlmd_path)
json_str = query.dumptojson(pipeline_name)

##### Print the exported pipeline metadata (JSON)

In [3]:
# Show the JSON string that query.dumptojson returned for the pipeline.
print(json_str)

{"Pipeline": [{"create_time_since_epoch": 1666018207959, "custom_properties": {}, "id": 1, "last_update_time_since_epoch": 1666018207959, "name": "Test-env", "properties": {"Pipeline": "Test-env"}, "type": "", "type_id": 10, "stages": [{"create_time_since_epoch": 1666018207960, "custom_properties": {"user-metadata1": "metadata_value"}, "id": 2, "last_update_time_since_epoch": 1666018207960, "name": "Prepare", "properties": {"Pipeline_Stage": "Prepare"}, "type": "", "type_id": 11, "executions": [{"create_time_since_epoch": 1666018207974, "custom_properties": {"split": 0.2, "seed": 20170428}, "id": 1, "last_update_time_since_epoch": 1666018207974, "name": "", "properties": {"Context_ID": 2, "Context_Type": "Prepare", "Pipeline_id": 1, "Git_Start_Commit": "e68201001d9431895a32dbd87409e4eac0a2fd43", "Pipeline_Type": "Test-env", "Git_Repo": "/tmp/cmf/example_get_started/git_remote", "Execution": "['src/parse.py', 'artifacts/data.xml.gz', 'artifacts/parsed']", "Git_End_Commit": ""}, "type": 




In [4]:
# Time how long parse_json_to_mlmd takes on the exported pipeline JSON.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the call is running.
start = time.perf_counter()
parse_json_to_mlmd(json_str)
end = time.perf_counter()

# Elapsed time in seconds.
print(end - start)

<class 'list'>
<class 'dict'>
<class 'str'>
Prepare
Prepare
/tmp/cmf/example_get_started/git_remote/artifacts/data.xml.gz
/tmp/cmf/example_get_started/git_remote/artifacts/parsed/train.tsv
/tmp/cmf/example_get_started/git_remote/artifacts/parsed/test.tsv
Featurize
Featurize-execution
/tmp/cmf/example_get_started/git_remote/artifacts/parsed/train.tsv
/tmp/cmf/example_get_started/git_remote/artifacts/parsed/test.tsv
/tmp/cmf/example_get_started/git_remote/artifacts/features/train.pkl
/tmp/cmf/example_get_started/git_remote/artifacts/features/test.pkl
Train
Train-execution
/tmp/cmf/example_get_started/git_remote/artifacts/features/train.pkl
SKlearn
RandomForestClassifier
RandomForestClassifier:default
artifacts/model/model.pkl
<class 'str'>
9351b160e2e355b1412633cdfb0460c7
Evaluate
Evaluate-execution
SKlearn
RandomForestClassifier
RandomForestClassifier:default
9351b160e2e355b1412633cdfb0460c7
/tmp/cmf/example_get_started/git_remote/artifacts/features/test.pkl
0.1870732307434082


##### Query the executions in the `Featurize` stage


In [5]:
# List the executions recorded for one pipeline stage.
stage_name = 'Featurize'
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID         Context_Type  \
0           3  Featurize-execution   
1           3  Featurize-execution   

                                           Execution Git_End_Commit  \
0  ['/home/royann/env/lib/python3.6/site-packages...                  
1  ['src/featurize.py', 'artifacts/parsed', 'arti...                  

                                  Git_Repo  \
0  /tmp/cmf/example_get_started/git_remote   
1  /tmp/cmf/example_get_started/git_remote   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  id  \
0  e68201001d9431895a32dbd87409e4eac0a2fd43      Test-env            1   6   
1  e68201001d9431895a32dbd87409e4eac0a2fd43      Test-env            1   2   

   max_features  ngrams  
0          3000       2  
1          3000       2  


##### Query the executions in the `Train` stage

In [6]:
# List the executions recorded for one pipeline stage.
stage_name = 'Train'
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID     Context_Type  \
0           4  Train-execution   
1           4  Train-execution   

                                           Execution Git_End_Commit  \
0  ['/home/royann/env/lib/python3.6/site-packages...                  
1  ['src/train.py', 'artifacts/features', 'artifa...                  

                                  Git_Repo  \
0  /tmp/cmf/example_get_started/git_remote   
1  /tmp/cmf/example_get_started/git_remote   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  id  \
0  e68201001d9431895a32dbd87409e4eac0a2fd43      Test-env            1   7   
1  e68201001d9431895a32dbd87409e4eac0a2fd43      Test-env            1   3   

   min_split  n_est      seed  
0         64    100  20170428  
1         64    100  20170428  


##### Query the executions in the `Evaluate` stage

In [7]:
# List the executions recorded for one pipeline stage.
stage_name = 'Evaluate'
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID        Context_Type  \
0           5  Evaluate-execution   
1           5  Evaluate-execution   

                                           Execution Git_End_Commit  \
0  ['/home/royann/env/lib/python3.6/site-packages...                  
1  ['src/test.py', 'artifacts/model', 'artifacts/...                  

                                  Git_Repo  \
0  /tmp/cmf/example_get_started/git_remote   
1  /tmp/cmf/example_get_started/git_remote   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  id  
0  e68201001d9431895a32dbd87409e4eac0a2fd43      Test-env            1   8  
1  e68201001d9431895a32dbd87409e4eac0a2fd43      Test-env            1   4  


##### Get all the artifacts of an execution.
<b>input parameter - execution_id</b><br>
<b>output parameter - artifacts</b><br>


In [8]:
# Fetch and show the artifacts linked to a single execution, selected by id.
execution_id = 8
artifacts = query.get_all_artifacts_for_execution(execution_id)
print(artifacts)

                                              Commit  avg_prec commit  \
0  To track the changes with git, run:\n\n\tgit a...       NaN          
1  To track the changes with git, run:\n\n\tgit a...       NaN    NaN   
2                                                NaN  0.604054    NaN   

   create_time_since_epoch   event                                 git_repo  \
0            1666018289121   INPUT                                      NaN   
1            1666018240298   INPUT  /tmp/cmf/example_get_started/git_remote   
2            1666018289152  OUTPUT                                      NaN   

   id  last_update_time_since_epoch  \
0   8                 1666018289121   
1   5                 1666018240298   
2   9                 1666018289152   

                                     metrics_name model_framework  \
0                                             NaN         SKlearn   
1                                             NaN             NaN   
2  metrics:2ea59dbc-4e2b-1

In [9]:
# Fetch and show the artifacts linked to a single execution, selected by id.
execution_id = 4
artifacts = query.get_all_artifacts_for_execution(execution_id)
print(artifacts)

                                              Commit  avg_prec  \
0  To track the changes with git, run:\n\n\tgit a...       NaN   
1  To track the changes with git, run:\n\n\tgit a...       NaN   
2                                                NaN  0.604054   

   create_time_since_epoch   event                                 git_repo  \
0            1666018248751   INPUT                                      NaN   
1            1666018240298   INPUT  /tmp/cmf/example_get_started/git_remote   
2            1666018253989  OUTPUT                                      NaN   

   id  last_update_time_since_epoch  \
0   6                 1666018248751   
1   5                 1666018240298   
2   7                 1666018253989   

                                     metrics_name model_framework  \
0                                             NaN         SKlearn   
1                                             NaN             NaN   
2  metrics:19b022ce-4e2b-11ed-99a3-b47af137252e:4     

In [10]:
# Allow up to 75 characters per column so long artifact names are printed
# in full rather than being shortened with "...".
pd.set_option("display.max_colwidth", 75)
artifacts = query.get_all_artifacts_for_execution(2)

# Show only each artifact's name and its event value (INPUT / OUTPUT).
for column in ("name", "event"):
    print(artifacts[column])

0      artifacts/parsed/train.tsv:32b715ef0d71ff4c9e61f55b09c15e75
1       artifacts/parsed/test.tsv:6f597d341ceb7d8fbbe88859a892ef81
2    artifacts/features/train.pkl:c565b23737962d61ccf1122cb211fc37
3     artifacts/features/test.pkl:96af3114a0c204043ded8c419eab8dbe
Name: name, dtype: object
0     INPUT
1     INPUT
2    OUTPUT
3    OUTPUT
Name: event, dtype: object


#### Get all executions for an artifact (pass the artifact's full name as the input parameter)

In [11]:
# Look up the executions linked to one artifact, identified by its full name.
artifact_name = "artifacts/data.xml.gz:a304afb96060aad90176268345e10355"
linked = query.get_all_executions_for_artifact(artifact_name)
print(linked)

Empty DataFrame
Columns: []
Index: []


#### Get all the parent artifacts of an artifact. Provides the artifact lineage chain

In [None]:
# Fetch the parent artifacts of one artifact and show the raw result.
artifact_name = "artifacts/features/test.pkl"
linked = query.get_all_parent_artifacts(artifact_name)
print(linked)

#### Get all the child artifacts of an artifact. Provides the lineage chain in the downstream direction

In [None]:
# Fetch the child artifacts of train.pkl.
linked = query.get_all_child_artifacts("artifacts/features/train.pkl")

# Print the name, type and URI columns, each behind its own label.
for prefix, column in (("Name : ", "name"), ("Type : ", "type"), ("URI : ", "uri")):
    print(prefix + linked[column].to_string(index=False, header=False))

#### Get all the parent artifacts of an artifact. Provides the artifact lineage chain

In [None]:
# Fetch the parent artifacts of the model file.
linked = query.get_all_parent_artifacts("artifacts/model/model.pkl")

# Print a heading followed by the values of each column of interest.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

In [None]:
# Fetch the parent artifacts of test.tsv.
linked = query.get_all_parent_artifacts("artifacts/parsed/test.tsv")

# Print the name, type and URI columns, each behind its own label.
for prefix, column in (("Name : ", "name"), ("Type : ", "type"), ("URI : ", "uri")):
    print(prefix + linked[column].to_string(index=False, header=False))

In [None]:
# Fetch the child artifacts of test.tsv.
linked = query.get_all_child_artifacts("artifacts/parsed/test.tsv")

# Print a heading followed by the values of each column of interest.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

#### Get immediate child artifacts of an artifact. 

In [None]:
# Fetch only the immediate (one-hop) child artifacts of the raw data file.
linked = query.get_one_hop_child_artifacts("artifacts/data.xml.gz")

# Print a heading followed by the values of each column of interest.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

In [None]:
# Fetch all child artifacts of the raw data file.
linked = query.get_all_child_artifacts("artifacts/data.xml.gz")

# Print a heading followed by the values of each column of interest.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

In [None]:
# Fetch the artifacts linked to execution id 4.
linked = query.get_all_artifacts_for_execution(4)

# Print a heading followed by the values of each column of interest.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

### Replace the metrics name in the `get_artifact` call with the metrics name from the output of the previous cell

In [None]:
# Fetch the metrics artifact by name.
#
# The metrics name embeds a run-specific identifier, so a hand-copied literal
# goes stale whenever the pipeline is re-run. Resolve the name from the
# "metrics_name" column of the artifacts of execution 4 instead, and fall back
# to the literal below only when that column is missing or empty.
# NOTE(review): assumes get_artifact accepts the value stored in the
# "metrics_name" column -- confirm against the cmflib API.
FALLBACK_METRICS_NAME = "metrics:aaae534e-915d-11ec-b106-89841b9859cd:4"

execution_artifacts = query.get_all_artifacts_for_execution(4)
if "metrics_name" in execution_artifacts.columns:
    # Rows that are not metrics artifacts hold NaN in this column; skip them
    # along with any empty strings.
    metrics_names = [name for name in execution_artifacts["metrics_name"].dropna() if name]
else:
    metrics_names = []
metrics_name = metrics_names[0] if metrics_names else FALLBACK_METRICS_NAME

artifacts = query.get_artifact(metrics_name)
print(artifacts)


In [None]:
# Load the parquet data stored at ./slice-a and show it.
slice_path = "./slice-a"
new_parquet_df = pd.read_parquet(slice_path)
print(new_parquet_df)