# In [8]:
# generate_flowchart.py

from graphviz import Digraph

def create_flowchart(output_filename='WhtedMinHash_FlowChart', file_format='png', *, view=True):
    """
    Create an annotated flowchart of the genome sequencing and read mapping process.

    The chart is built as a Graphviz directed graph: a linear genome
    preparation stage, followed by a per-read loop (k-mer generation,
    frequency counting, weighted MinHash sketching, coarse mapping and a
    match check) that feeds back into the "Iterate Over Each Read" node.

    Args:
        output_filename (str): The name of the output file without extension.
        file_format (str): The format of the output file (e.g., 'png', 'pdf').
        view (bool): Passed through to ``Digraph.render``. When True (the
            default, matching the previous hard-coded behaviour) the rendered
            file is opened in the system viewer; pass False for headless or
            batch use where no viewer is available.

    Returns:
        None
    """
    # Initialize a directed graph
    dot = Digraph(comment='Genome Sequencing and Read Mapping Workflow', format=file_format)

    # Attributes shared by every node style
    font_attrs = {'fontname': 'Helvetica', 'fontsize': '10'}

    # Regular processing steps
    node_attrs = {'shape': 'box', 'style': 'rounded,filled', 'color': 'lightblue2', **font_attrs}
    # Decision points
    decision_attrs = {'shape': 'diamond', 'style': 'filled', 'color': 'lightgoldenrod', **font_attrs}
    # Start and end terminals
    start_end_attrs = {'shape': 'oval', 'style': 'filled', 'color': 'lightgreen', **font_attrs}

    # (node id, label, style) in declaration order. The order is kept the same
    # as before because Graphviz layout can depend on declaration order.
    nodes = [
        ('Start', 'Start', start_end_attrs),
        # Genome preparation
        ('LoadGenome', 'Load Genome Sequence\n(Input: genome_path, file_fmt)', node_attrs),
        ('GenKmersGenome',
         'Generate k-mers from Genome\n(Input: genome_seq, k_size, k_stride, include_revcom)',
         node_attrs),
        ('CreateHash', 'Create Hash Table\n(Input: kmers from genome)', node_attrs),
        ('MinimizeHash', 'Minimize Hash Table\n(Action: Remove overrepresented k-mers)', node_attrs),
        ('LoadReads',
         'Load Reads and Ground Truth Data\n(Input: fastq_path, ground_truth_file)',
         node_attrs),
        ('IterateReads', 'Iterate Over Each Read\n(Action: Process each read)', node_attrs),
        # Sub-steps for iterating over each read
        ('GenKmersRead',
         'Generate k-mers from Read\n(Input: read_seq, k_size, k_stride, include_revcom)',
         node_attrs),
        ('CountFreq', 'Count k-mer Frequencies\n(Action: Frequency count of k-mers)', node_attrs),
        ('GenSketch',
         'Generate Weighted MinHash Sketch\n(Input: kmer_freq, WeightedMinHash instance)',
         node_attrs),
        ('CoarseMap', 'Coarse Mapping\n(Input: sketch_read, genome_hash_table)', node_attrs),
        ('CheckMatch', 'Check for Matches\n(Action: Validate mapping)', decision_attrs),
        ('LogResults', 'Log Results\n(Action: Record mapping status)', node_attrs),
        # End node
        ('End', 'End', start_end_attrs),
    ]
    for node_id, label, attrs in nodes:
        dot.node(node_id, label, **attrs)

    # (tail, head, extra edge attributes) in declaration order
    edges = [
        # Genome preparation pipeline
        ('Start', 'LoadGenome', {}),
        ('LoadGenome', 'GenKmersGenome', {}),
        ('GenKmersGenome', 'CreateHash', {}),
        ('CreateHash', 'MinimizeHash', {}),
        ('MinimizeHash', 'LoadReads', {}),
        ('LoadReads', 'IterateReads', {}),
        # Per-read processing
        ('IterateReads', 'GenKmersRead', {}),
        ('GenKmersRead', 'CountFreq', {}),
        ('CountFreq', 'GenSketch', {}),
        ('GenSketch', 'CoarseMap', {}),
        ('CoarseMap', 'CheckMatch', {}),
        # Decision: if match found, log results; else, continue iterating
        ('CheckMatch', 'LogResults', {'label': 'Match Found', 'color': 'green'}),
        ('CheckMatch', 'IterateReads', {'label': 'No Match', 'color': 'red'}),
        # After logging, iterate to next read
        ('LogResults', 'IterateReads', {}),
        # After all reads are processed, go to End
        ('IterateReads', 'End', {'label': 'All Reads Processed', 'style': 'dashed'}),
    ]
    for tail, head, attrs in edges:
        dot.edge(tail, head, **attrs)

    # Render the flowchart; `view` decides whether a viewer is launched afterwards
    dot.render(output_filename, view=view)
    print(f"Flowchart saved as {output_filename}.{file_format}")

# Script entry point: when this file is executed directly, build and render
# the flowchart using the default output name and format.
if __name__ == "__main__":
    create_flowchart()


# Output: Flowchart saved as WhtedMinHash_FlowChart.png
