# 07 Visualize Lineage

This notebook scans the pipeline's log files for `FILE_READ` and `FILE_WRITTEN` entries and uses them to build and visualize a Directed Acyclic Graph (DAG) of the entire data science pipeline.

In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
import re
from graphviz import Digraph
from ds_logger import start_logging, end_logging

# Announce this notebook to the project logger before doing any work.
# NOTE(review): start_logging is imported from src/ds_logger; what it records
# (and where) is not visible from this notebook — confirm against ds_logger.
notebook_description = "Visualizes the project's data and notebook lineage as a DAG."
start_logging(notebook_name='07_visualize_lineage.ipynb', notebook_description=notebook_description)

In [2]:
def parse_logs_for_lineage(log_dir):
    """Collect lineage edges from every ``.log`` file directly inside ``log_dir``.

    A log line containing ``FILE_READ: <path>`` yields the edge
    ``(file, notebook)``; a line containing ``FILE_WRITTEN: <path>`` yields
    ``(notebook, file)``. Only the base name of ``<path>`` is kept. The
    notebook name is the log file's stem up to the first underscore, with
    ``.ipynb`` appended (``01_load_data.log`` -> ``01.ipynb``).

    Args:
        log_dir: Directory whose ``*.log`` files are scanned (not recursive).

    Returns:
        set[tuple[str, str]]: Unique ``(source, target)`` edges; empty when no
        log file contains a usable entry.

    Raises:
        OSError: If ``log_dir`` cannot be listed or a log file cannot be opened.
    """
    read_pattern = re.compile(r'FILE_READ: (.*)')
    write_pattern = re.compile(r'FILE_WRITTEN: (.*)')
    edges = set()

    for entry in os.listdir(log_dir):
        log_file = os.path.join(log_dir, entry)
        # Skip non-log entries and directories that merely happen to end in ".log".
        if not entry.endswith('.log') or not os.path.isfile(log_file):
            continue

        # Drop the extension first so "setup.log" maps to "setup.ipynb"
        # rather than "setup.log.ipynb".
        notebook_name = os.path.splitext(entry)[0].split('_')[0] + '.ipynb'

        # errors='replace' keeps one undecodable byte from aborting the whole
        # scan; the markers themselves are plain ASCII and survive intact.
        with open(log_file, 'r', errors='replace') as f:
            for line in f:
                # A read entry takes precedence when a line carries both markers.
                match = read_pattern.search(line)
                is_read = match is not None
                if not is_read:
                    match = write_pattern.search(line)
                    if match is None:
                        continue

                # Strip so trailing whitespace does not create distinct nodes.
                filepath = os.path.basename(match.group(1).strip())
                if not filepath:
                    # Empty capture or a path ending in "/": nothing to name a node after.
                    continue

                if is_read:
                    edges.add((filepath, notebook_name))
                else:
                    edges.add((notebook_name, filepath))
    return edges

def visualize_lineage(edges, output_path):
    """Render lineage edges as a PNG graph with Graphviz.

    Nodes whose name ends in ``.ipynb`` are drawn as rounded light-blue boxes;
    all other nodes are drawn as light-grey ellipses.

    Args:
        edges: Iterable of ``(start, end)`` node-name string pairs.
        output_path: Output path without extension; the image is written to
            ``<output_path>.png``.
    """
    # Sort so the generated DOT source (and therefore the layout) does not
    # depend on set iteration order, which varies between interpreter runs.
    ordered_edges = sorted(set(edges))
    node_names = sorted({name for edge in ordered_edges for name in edge})

    # Graphviz treats ":" inside edge endpoints as a port/compass separator,
    # so use synthetic colon-free ids and carry the real name in the label.
    node_ids = {name: f'n{index}' for index, name in enumerate(node_names)}

    dot = Digraph(comment='Data Science Pipeline')

    for name in node_names:
        if name.endswith('.ipynb'):
            dot.node(node_ids[name], label=name, shape='box',
                     style='rounded,filled', fillcolor='lightblue')
        else:
            dot.node(node_ids[name], label=name, shape='ellipse',
                     style='filled', fillcolor='lightgrey')

    for start, end in ordered_edges:
        dot.edge(node_ids[start], node_ids[end])

    # cleanup=True removes the intermediate DOT source after rendering.
    dot.render(output_path, format='png', cleanup=True)
    print(f'Lineage graph saved to {output_path}.png')

# Build the lineage graph from the logs and render it.
# Both paths are relative, so they resolve against the kernel's working directory.
log_directory = '../logs'
lineage_edges = parse_logs_for_lineage(log_directory)
# No extension here: visualize_lineage renders PNG and reports "<path>.png".
graph_output_path = '../reports/pipeline_lineage'
visualize_lineage(lineage_edges, graph_output_path)

Lineage graph saved to ../reports/pipeline_lineage.png


In [3]:
# Close out this notebook's logging, passing the generated image name as the result.
# NOTE(review): end_logging comes from src/ds_logger; how `results` is stored is not visible here — confirm.
end_logging(results={'graph_generated': 'pipeline_lineage.png'})