# Model lineage using MLflow Registry and Neo4j data

In [6]:
# --- Configuration ---
# Connection settings for the notebook. The Neo4j settings can be overridden
# through environment variables of the same name; the literals below are only
# fallbacks so the notebook keeps working unchanged when nothing is set.
import os

MLFLOW_URI = "http://192.168.0.4:5000"   # MLflow API service
RUN_IDS = [
    "b5bef0c50ae048658dc41d1b7a9c4eb2", # sample run ids - the first one is an induced-drift run
    "909cd0309aaf43fc911591dbda0c81a3", # this one is a no-drift monitor run
]

# Neo4j (optional join)
NEO4J_URI  = os.environ.get("NEO4J_URI") or "bolt://192.168.0.5:7687"
NEO4J_USER = os.environ.get("NEO4J_USER") or "neo4j"
# NOTE(review): a credential is embedded in the notebook. Prefer exporting
# NEO4J_PASS in the environment, then rotate the password and drop this fallback.
NEO4J_PASS = os.environ.get("NEO4J_PASS") or "PotatoDTND12!"

print("Configured run_ids:", RUN_IDS)


Configured run_ids: ['b5bef0c50ae048658dc41d1b7a9c4eb2', '909cd0309aaf43fc911591dbda0c81a3']


In [7]:

# --- Imports ---
import os, json, pandas as pd, numpy as np
from mlflow.tracking import MlflowClient
import mlflow
from neo4j import GraphDatabase

# A non-empty MLFLOW_TRACKING_URI in the environment wins; otherwise fall back
# to the MLFLOW_URI configured in this notebook.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI") or MLFLOW_URI)

# The client is created after the tracking URI is set.
client = MlflowClient()
print("MLflow Tracking URI:", mlflow.get_tracking_uri())

# Shared driver used by the Neo4j helper functions below.
neo4j_driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USER, NEO4J_PASS),
)


MLflow Tracking URI: http://192.168.0.4:5000


In [8]:

# --- Helpers ---
def get_run_info(run_id: str):
    """Fetch one MLflow run and flatten it into a plain dict.

    Returns a dict with ``status == "ok"`` plus the run's info fields and its
    tags/params/metrics copied into dicts. If ``client.get_run`` raises for any
    reason, returns ``{"run_id", "status": "not_found", "error"}`` instead of
    propagating the exception.
    """
    try:
        run = client.get_run(run_id)
    except Exception as exc:
        return {"run_id": run_id, "status": "not_found", "error": str(exc)}

    info_fields = (
        "experiment_id",
        "run_name",
        "lifecycle_stage",
        "start_time",
        "end_time",
        "artifact_uri",
        "user_id",
    )
    data_sections = ("tags", "params", "metrics")

    summary = {"run_id": run_id, "status": "ok"}
    # Scalar attributes come straight from run.info, in a fixed key order.
    for field in info_fields:
        summary[field] = getattr(run.info, field)
    # Key/value collections from run.data are copied into plain dicts.
    for section in data_sections:
        summary[section] = dict(getattr(run.data, section))
    return summary

def get_registry_by_run(run_id: str):
    """Pick the registered model version that best represents *run_id*.

    Searches the model registry for versions created from the run and returns
    a dict with ``run_id``, ``model_name``, ``model_version``, ``stage`` and
    ``raw`` (name/version/stage of every match). A version that has a real
    stage assigned is preferred; otherwise the highest version number wins.

    When the search raises, or nothing matches, the name/version/stage fields
    are ``"n/a"`` and ``raw`` is empty; the failure case also carries an
    ``error`` message.
    """
    try:
        mvs = client.search_model_versions(f"run_id='{run_id}'")
    except Exception as e:
        return {"run_id": run_id, "model_name":"n/a", "model_version":"n/a", "stage":"n/a", "raw": [], "error": str(e)}
    if not mvs:
        return {"run_id": run_id, "model_name":"n/a", "model_version":"n/a", "stage":"n/a", "raw": []}

    # MLflow reports an unassigned stage as the literal string "None", which is
    # truthy. A plain truthiness check therefore treats every version as
    # staged, so compare against that sentinel explicitly.
    with_stage = [mv for mv in mvs if (mv.current_stage or "None") != "None"]
    if with_stage:
        chosen = with_stage[0]
    else:
        # No staged version: fall back to the highest version number.
        chosen = sorted(mvs, key=lambda x: int(x.version))[-1]
    return {
        "run_id": run_id,
        "model_name": chosen.name,
        "model_version": str(chosen.version),
        "stage": chosen.current_stage or "None",
        "raw": [dict(name=mv.name, version=str(mv.version), stage=mv.current_stage or "None") for mv in mvs],
    }

def get_all_model_versions(run_id: str):
    """Return every registered model version created from *run_id* as a DataFrame.

    Columns are ``run_id``, ``model_name``, ``model_version`` and ``stage``;
    the frame is empty (but keeps those columns) when nothing matches.

    If the registry search raises, a single placeholder row is returned with
    ``"n/a"`` fields and the message in an ``error`` column. Assigning the
    error onto a zero-row frame would drop the message, because there is no
    row to hold it.
    """
    columns = ["run_id", "model_name", "model_version", "stage"]
    try:
        mvs = client.search_model_versions(f"run_id='{run_id}'")
    except Exception as e:
        # Same placeholder shape as get_registry_by_run uses for failures.
        failure = {"run_id": run_id, "model_name": "n/a", "model_version": "n/a", "stage": "n/a", "error": str(e)}
        return pd.DataFrame([failure], columns=columns + ["error"])
    rows = [
        {"run_id": run_id, "model_name": mv.name, "model_version": str(mv.version), "stage": mv.current_stage or "None"}
        for mv in mvs
    ]
    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

def df_from_kv(run_id: str, kind: str, d: dict):
    """Turn a key/value mapping into a long-format DataFrame.

    Each item of *d* becomes one row with columns ``run_id``, ``type``
    (set to *kind*), ``key`` and ``value``. A falsy *d* (empty or ``None``)
    yields an empty frame with those four columns.
    """
    if d:
        records = []
        for key, value in d.items():
            records.append({"run_id": run_id, "type": kind, "key": key, "value": value})
        return pd.DataFrame(records)
    return pd.DataFrame([], columns=["run_id", "type", "key", "value"])

def neo4j_find_by_run_id(run_id: str):
    """Broad scan: find nodes with any property whose string form equals *run_id*.

    The query looks at every node regardless of label and is capped at 200
    results. Returns a DataFrame with one row per node: ``run_id``, ``labels``
    (the node's labels joined with ``:``), ``node_id`` and ``properties``
    (the node's properties as a dict).
    """
    cypher = """
MATCH (n)
WHERE any(k IN keys(n) WHERE toString(n[k]) = $rid)
RETURN labels(n) AS labels, id(n) AS id, n
LIMIT 200
"""
    with neo4j_driver.session() as session:
        # Records are consumed while the session is still open.
        found = [
            {
                "run_id": run_id,
                "labels": ":".join(record["labels"]),
                "node_id": record["id"],
                "properties": dict(record["n"]),
            }
            for record in session.run(cypher, rid=run_id)
        ]
        return pd.DataFrame(found)

def neo4j_targeted(run_id: str):
    """Targeted lookup: :Drift and :Scoring nodes whose ``mlflow_run_id`` is *run_id*.

    The two label-specific matches are combined with UNION and capped at 500
    results. Returns a DataFrame with one row per node: ``run_id``, ``label``
    ('Drift' or 'Scoring'), ``node_id`` and ``properties`` (the node's
    properties as a dict).
    """
    cypher = """
CALL {
  MATCH (d:Drift)   WHERE d.mlflow_run_id = $rid RETURN 'Drift' AS label, id(d) AS id, d AS n
  UNION
  MATCH (s:Scoring) WHERE s.mlflow_run_id = $rid RETURN 'Scoring' AS label, id(s) AS id, s AS n
}
RETURN label, id, n
LIMIT 500
"""
    with neo4j_driver.session() as session:
        # Records are consumed while the session is still open.
        found = [
            {
                "run_id": run_id,
                "label": record["label"],
                "node_id": record["id"],
                "properties": dict(record["n"]),
            }
            for record in session.run(cypher, rid=run_id)
        ]
        return pd.DataFrame(found)


In [9]:

# --- Pull everything ---
# Per-run accumulators. metrics_df / params_df / tags_df start out as lists of
# frames and are rebound to the concatenated DataFrames further down.
runs_info, models_pick, models_all = [], [], []
metrics_df, params_df, tags_df = [], [], []
neo_broad, neo_target = [], []

for rid in RUN_IDS:
    info = get_run_info(rid)
    runs_info.append(info)
    models_pick.append(get_registry_by_run(rid))

    # Only keep non-empty registry listings.
    mv_all = get_all_model_versions(rid)
    if len(mv_all) > 0:
        models_all.append(mv_all)

    # Metrics/params/tags exist only when the run itself was fetched.
    if info.get("status") == "ok":
        for kind, sink in (("metric", metrics_df), ("param", params_df), ("tag", tags_df)):
            sink.append(df_from_kv(rid, kind, info[kind + "s"]))

    neo_broad.append(neo4j_find_by_run_id(rid))
    neo_target.append(neo4j_targeted(rid))


def _concat_or_empty(frames, columns):
    """Stack *frames* into one DataFrame, or return an empty frame with *columns*."""
    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame(columns=columns)


_kv_columns = ["run_id", "type", "key", "value"]

runs_df = pd.json_normalize(runs_info)
models_pick_df = pd.json_normalize(models_pick)
models_all_df = _concat_or_empty(models_all, ["run_id", "model_name", "model_version", "stage"])
metrics_df = _concat_or_empty(metrics_df, _kv_columns)
params_df = _concat_or_empty(params_df, _kv_columns)
tags_df = _concat_or_empty(tags_df, _kv_columns)
neo_broad_df = _concat_or_empty(neo_broad, ["run_id", "labels", "node_id", "properties"])
neo_target_df = _concat_or_empty(neo_target, ["run_id", "label", "node_id", "properties"])

print("Done. Rows — runs:", len(runs_df), " pick:", len(models_pick_df), " all_versions:", len(models_all_df))




Done. Rows — runs: 2  pick: 2  all_versions: 0


In [10]:

# Display
from IPython.display import display

# Run overview and registry lookups; missing cells are blanked for readability.
print("=== Overview — Runs ===")
display(runs_df.fillna(""))

print("=== Registry — Chosen (by stage/most recent) ===")
display(models_pick_df.fillna(""))

print("=== Registry — All Versions ===")
display(models_all_df.fillna(""))

# Long-format key/value tables, ordered by run and then by key.
print("=== Metrics ===")
display(metrics_df.sort_values(["run_id","key"]).reset_index(drop=True))

print("=== Params ===")
display(params_df.sort_values(["run_id","key"]).reset_index(drop=True))

print("=== Tags ===")
display(tags_df.sort_values(["run_id","key"]).reset_index(drop=True))

# Neo4j results, limited to the first 200 rows of each frame.
print("=== Neo4j — Broad scan ===")
display(neo_broad_df.head(200))

print("=== Neo4j — Targeted (:Drift/:Scoring) ===")
display(neo_target_df.head(200))


=== Overview — Runs ===


Unnamed: 0,run_id,status,experiment_id,run_name,lifecycle_stage,start_time,end_time,artifact_uri,user_id,tags.mlflow.user,...,tags.registered_model_name,tags.registered_model_version,tags.registered_model_stage,tags.registered_model_matched_at,tags.registered_model_match_rule,metrics.amount_drift_divergence,metrics.from_pagerank_drift_divergence,metrics.to_pagerank_drift_divergence,metrics.from_centrality_drift_divergence,metrics.to_centrality_drift_divergence
0,b5bef0c50ae048658dc41d1b7a9c4eb2,ok,3,drift_check_1745426672,active,1745426672596,1745427867205,/var/lib/mlflow/artifacts/3/b5bef0c50ae048658d...,root,root,...,isolation_forest_model,9,Archived,1760845858338,approx_time,1.938317,1.671995,1.663804,1.533817,1.617337
1,909cd0309aaf43fc911591dbda0c81a3,ok,3,drift_check_1744848323,active,1745383063779,1745383065662,/var/lib/mlflow/artifacts/3/909cd0309aaf43fc91...,root,root,...,isolation_forest_model,7,Archived,1760845858399,approx_time,0.000533,0.029624,0.030708,0.057445,0.048


=== Registry — Chosen (by stage/most recent) ===


Unnamed: 0,run_id,model_name,model_version,stage,raw
0,b5bef0c50ae048658dc41d1b7a9c4eb2,,,,[]
1,909cd0309aaf43fc911591dbda0c81a3,,,,[]


=== Registry — All Versions ===


Unnamed: 0,run_id,model_name,model_version,stage


=== Metrics ===


Unnamed: 0,run_id,type,key,value
0,909cd0309aaf43fc911591dbda0c81a3,metric,amount_drift_divergence,0.000533
1,909cd0309aaf43fc911591dbda0c81a3,metric,from_centrality_drift_divergence,0.057445
2,909cd0309aaf43fc911591dbda0c81a3,metric,from_pagerank_drift_divergence,0.029624
3,909cd0309aaf43fc911591dbda0c81a3,metric,to_centrality_drift_divergence,0.048
4,909cd0309aaf43fc911591dbda0c81a3,metric,to_pagerank_drift_divergence,0.030708
5,b5bef0c50ae048658dc41d1b7a9c4eb2,metric,amount_drift_divergence,1.938317
6,b5bef0c50ae048658dc41d1b7a9c4eb2,metric,from_centrality_drift_divergence,1.533817
7,b5bef0c50ae048658dc41d1b7a9c4eb2,metric,from_pagerank_drift_divergence,1.671995
8,b5bef0c50ae048658dc41d1b7a9c4eb2,metric,to_centrality_drift_divergence,1.617337
9,b5bef0c50ae048658dc41d1b7a9c4eb2,metric,to_pagerank_drift_divergence,1.663804


=== Params ===


Unnamed: 0,run_id,type,key,value


=== Tags ===


Unnamed: 0,run_id,type,key,value
0,909cd0309aaf43fc911591dbda0c81a3,tag,mlflow.runName,drift_check_1744848323
1,909cd0309aaf43fc911591dbda0c81a3,tag,mlflow.source.name,monitor.py
2,909cd0309aaf43fc911591dbda0c81a3,tag,mlflow.source.type,LOCAL
3,909cd0309aaf43fc911591dbda0c81a3,tag,mlflow.user,root
4,909cd0309aaf43fc911591dbda0c81a3,tag,registered_model_match_rule,approx_time
5,909cd0309aaf43fc911591dbda0c81a3,tag,registered_model_matched_at,1760845858399
6,909cd0309aaf43fc911591dbda0c81a3,tag,registered_model_name,isolation_forest_model
7,909cd0309aaf43fc911591dbda0c81a3,tag,registered_model_stage,Archived
8,909cd0309aaf43fc911591dbda0c81a3,tag,registered_model_version,7
9,b5bef0c50ae048658dc41d1b7a9c4eb2,tag,drift_detected,True


=== Neo4j — Broad scan ===


Unnamed: 0,run_id,labels,node_id,properties
0,b5bef0c50ae048658dc41d1b7a9c4eb2,Drift,1628994,"{'is_drift_test': True, 'detected_at': 2025-04..."
1,b5bef0c50ae048658dc41d1b7a9c4eb2,Drift,1628995,"{'is_drift_test': True, 'detected_at': 2025-04..."
2,b5bef0c50ae048658dc41d1b7a9c4eb2,Drift,1628996,"{'is_drift_test': True, 'detected_at': 2025-04..."
3,b5bef0c50ae048658dc41d1b7a9c4eb2,Drift,1628997,"{'is_drift_test': True, 'detected_at': 2025-04..."
4,b5bef0c50ae048658dc41d1b7a9c4eb2,Drift,1628998,"{'is_drift_test': True, 'detected_at': 2025-04..."


=== Neo4j — Targeted (:Drift/:Scoring) ===


Unnamed: 0,run_id,label,node_id,properties
0,b5bef0c50ae048658dc41d1b7a9c4eb2,Drift,1628994,"{'is_drift_test': True, 'detected_at': 2025-04..."
1,b5bef0c50ae048658dc41d1b7a9c4eb2,Drift,1628995,"{'is_drift_test': True, 'detected_at': 2025-04..."
2,b5bef0c50ae048658dc41d1b7a9c4eb2,Drift,1628996,"{'is_drift_test': True, 'detected_at': 2025-04..."
3,b5bef0c50ae048658dc41d1b7a9c4eb2,Drift,1628997,"{'is_drift_test': True, 'detected_at': 2025-04..."
4,b5bef0c50ae048658dc41d1b7a9c4eb2,Drift,1628998,"{'is_drift_test': True, 'detected_at': 2025-04..."
