# MKrep: Microbial Genomics Analysis Dashboard

**Interactive analysis tool for antimicrobial resistance and virulence profiling**

This dashboard provides a user-friendly interface for analyzing bacterial genomic data; no programming knowledge is required.

In [None]:
# Import required libraries
import os
import sys
import io
import base64
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output, Javascript

# Import analysis modules (we'll import them dynamically later)
# These are imported from the main repository scripts

print("‚úì Libraries loaded successfully")

In [None]:
# Configuration and Global Variables

# Registry of the analyses offered in the dropdown, keyed by a short identifier.
# Each entry holds:
#   name           - label shown in the dropdown and in the info panel
#   description    - one-line summary shown in the info panel
#   required_files - upload names that must all be present before the Run
#                    button is enabled (see check_required_files)
#   optional_files - upload names listed in the info panel as optional
#   script         - file name of the standalone script for this analysis
#                    (not read anywhere in this section)
# NOTE(review): 'Cluster_MIC_AMR_Viruelnce.py' and
# 'Phylgenetic_clustering_2025_03_21.py' look misspelled; presumably they match
# the real file names in the repository -- confirm before changing them.
ANALYSIS_TYPES = {
    'cluster': {
        'name': 'Cluster Analysis',
        'description': 'K-Modes clustering for MIC, AMR, and virulence data',
        'required_files': ['MIC.csv', 'AMR_genes.csv', 'Virulence.csv'],
        'optional_files': [],
        'script': 'Cluster_MIC_AMR_Viruelnce.py'
    },
    'mdr': {
        'name': 'MDR Analysis',
        'description': 'Multi-drug resistance pattern analysis with network approach',
        'required_files': ['resistance_data.csv'],
        'optional_files': [],
        'script': 'MDR_2025_04_15.py'
    },
    'network': {
        'name': 'Network Analysis',
        'description': 'Statistical network analysis of feature associations',
        'required_files': ['MGE.csv', 'MIC.csv', 'MLST.csv', 'Plasmid.csv', 'Serotype.csv', 'Virulence.csv', 'AMR_genes.csv'],
        'optional_files': [],
        'script': 'Network_Analysis_2025_06_26.py'
    },
    'phylo': {
        'name': 'Phylogenetic Clustering',
        'description': 'Tree-based clustering with binary trait analysis',
        'required_files': ['tree.newick', 'MIC.csv', 'AMR_genes.csv', 'Virulence.csv'],
        'optional_files': ['MLST.csv', 'Serotype.csv'],
        'script': 'Phylgenetic_clustering_2025_03_21.py'
    },
    'strepsuis': {
        'name': 'Streptococcus suis Analysis',
        'description': 'Specialized analysis for S. suis with phylogenetic integration',
        'required_files': ['tree.newick', 'MIC.csv', 'AMR_genes.csv', 'Virulence.csv'],
        'optional_files': ['MLST.csv', 'Serotype.csv'],
        'script': 'StrepSuisPhyloCluster_2025_08_11.py'
    }
}

# Global storage for uploaded files: maps the uploaded file name to the path
# it was saved under (filled in by save_uploaded_file).
uploaded_files = {}
# Uploads are written here and report paths are built under it. The path is
# relative, so it resolves against the kernel's current working directory.
output_folder = "dashboard_output"
os.makedirs(output_folder, exist_ok=True)

print("‚úì Configuration loaded")

In [None]:
# Helper Functions

def validate_binary_data(df, filename):
    """Check that a table has a ``Strain_ID`` column and only binary values.

    Every column other than ``Strain_ID`` may contain only ``0``/``1``,
    either as numbers or as the strings ``'0'``/``'1'``. Anything else,
    including missing values, marks the column as non-binary.

    Args:
        df: pandas DataFrame to validate.
        filename: Name used as the prefix of error messages.

    Returns:
        ``(True, [])`` when the table is valid, otherwise ``(False, [message])``
        where the message names at most the first five offending columns.
    """
    if 'Strain_ID' not in df.columns:
        return False, [f"{filename}: Missing 'Strain_ID' column"]

    data_cols = [col for col in df.columns if col != 'Strain_ID']
    if not data_cols:
        # Only the identifier column is present: nothing to check.
        return True, []

    non_binary = df[data_cols].apply(lambda col: ~col.isin([0, 1, '0', '1'])).any()
    bad_cols = non_binary[non_binary].index.tolist()
    if bad_cols:
        # Column labels are not guaranteed to be strings (e.g. integer
        # headers), so convert them before joining.
        shown = ', '.join(str(col) for col in bad_cols[:5])
        return False, [f"{filename}: Non-binary values in columns: {shown}"]

    return True, []

def save_uploaded_file(file_widget, filename):
    """Validate one uploaded file and store it in ``output_folder``.

    CSV files are validated in memory *before* anything is written, so a
    rejected upload can never overwrite a previously accepted file that is
    still registered in ``uploaded_files``.

    Args:
        file_widget: The ``value`` of a ``FileUpload`` widget. Both shapes are
            accepted: a dict keyed by file name (ipywidgets 7) or a sequence
            of dicts carrying ``'name'`` and ``'content'`` (ipywidgets 8).
        filename: Name of the upload to process.

    Returns:
        ``(True, [])`` on success, otherwise ``(False, [error messages])``.
        Failures of any kind are reported through the return value, not
        raised.
    """
    try:
        if isinstance(file_widget, dict):
            content = file_widget[filename]['content']
        else:
            matches = [item['content'] for item in file_widget
                       if item['name'] == filename]
            if not matches:
                raise KeyError(filename)
            content = matches[-1]

        # The name comes from the client; keep only its final component so
        # the file cannot be written outside the output folder.
        safe_name = os.path.basename(filename.replace('\\', '/'))
        if not safe_name:
            return False, [f"Error processing {filename}: invalid file name"]

        # Validate if CSV (from memory, before touching the disk)
        if filename.endswith('.csv'):
            df = pd.read_csv(io.BytesIO(content))
            is_valid, errors = validate_binary_data(df, filename)
            if not is_valid:
                return False, errors

        # Save to disk
        filepath = os.path.join(output_folder, safe_name)
        with open(filepath, 'wb') as f:
            f.write(content)

        uploaded_files[filename] = filepath
        return True, []

    except Exception as e:
        # UI boundary: report the problem to the caller instead of raising.
        return False, [f"Error processing {filename}: {str(e)}"]

def create_download_link(filepath, link_text):
    """Build an HTML download link that embeds a file as a base64 data URI.

    The whole file is read into memory and inlined into the returned markup.

    Args:
        filepath: Path of the file to offer for download.
        link_text: Markup placed inside the ``<a>`` element; inserted as-is.

    Returns:
        An ``<a download=...>`` element as a string, or a ``<p>`` element
        saying the file was not found when ``filepath`` does not exist.
    """
    import html  # local import: only needed for escaping here

    if not os.path.exists(filepath):
        return f"<p>File not found: {html.escape(str(filepath))}</p>"

    # Escape the name so quotes or angle brackets cannot break the attribute.
    filename = html.escape(os.path.basename(filepath), quote=True)
    with open(filepath, 'rb') as f:
        b64 = base64.b64encode(f.read()).decode()

    return f'<a download="{filename}" href="data:application/octet-stream;base64,{b64}" target="_blank">{link_text}</a>'

print("‚úì Helper functions defined")

In [None]:
# UI Components

# Title banner (static HTML)
title_html = widgets.HTML(
    value="""
    <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); 
                padding: 30px; border-radius: 10px; color: white; margin-bottom: 20px;'>
        <h1 style='margin: 0; font-size: 2.5em;'>üß¨ MKrep Analysis Dashboard</h1>
        <p style='margin: 10px 0 0 0; font-size: 1.2em;'>Interactive tool for microbial genomics analysis</p>
    </div>
    """
)

# Analysis selection: each option is a (label, value) pair, so the widget
# shows the human-readable name while .value is the ANALYSIS_TYPES key.
analysis_dropdown = widgets.Dropdown(
    options=[(info['name'], key) for key, info in ANALYSIS_TYPES.items()],
    description='Analysis Type:',
    style={'description_width': '150px'},
    layout=widgets.Layout(width='500px')
)

# Info panel for the selected analysis; its content is written by
# update_analysis_info.
analysis_info = widgets.HTML(value="")

def update_analysis_info(change):
    """Render the info panel for the analysis named in ``change['new']``.

    Args:
        change: Mapping with a ``'new'`` entry holding an ``ANALYSIS_TYPES``
            key, as delivered by the dropdown's value observer.
    """
    spec = ANALYSIS_TYPES[change['new']]

    required_markup = "<br>".join(f"‚úì {name}" for name in spec['required_files'])
    if spec['optional_files']:
        optional_markup = "<br>".join(f"‚óã {name}" for name in spec['optional_files'])
    else:
        # No optional inputs for this analysis.
        optional_markup = "None"

    analysis_info.value = f"""
    <div style='background: #f8f9fa; padding: 15px; border-radius: 5px; border-left: 4px solid #667eea;'>
        <h3 style='margin-top: 0;'>{spec['name']}</h3>
        <p><strong>Description:</strong> {spec['description']}</p>
        <p><strong>Required Files:</strong><br>{required_markup}</p>
        <p><strong>Optional Files:</strong><br>{optional_markup}</p>
    </div>
    """

# Re-render the info panel whenever the selected analysis changes, and render
# it once now for the initial selection.
analysis_dropdown.observe(update_analysis_info, names='value')
update_analysis_info({'new': analysis_dropdown.value})  # Initialize

# File upload widget: accepts several files at once, restricted in the file
# picker to CSV and Newick extensions.
file_upload = widgets.FileUpload(
    accept='.csv,.newick,.nwk',
    multiple=True,
    description='Upload Files:',
    style={'description_width': '150px'},
    layout=widgets.Layout(width='500px')
)

# Per-file success/error messages; written by handle_file_upload.
upload_status = widgets.HTML(value="")

def handle_file_upload(change):
    """Save and validate every uploaded file, then report the outcome.

    Works with both shapes of ``FileUpload.value``: a dict keyed by file name
    (ipywidgets 7) and a tuple of dicts carrying ``'name'`` (ipywidgets 8).
    One status line per file or error is written to ``upload_status``, and
    the Run button state is refreshed afterwards.

    Args:
        change: Change notification from the widget observer (unused; the
            current widget value is read directly).
    """
    import html  # local import: only needed for escaping here

    uploads = file_upload.value
    if not uploads:
        return

    # Normalise to {file name: file info} so the rest is version-independent.
    if isinstance(uploads, dict):
        files_by_name = uploads
    else:
        files_by_name = {item['name']: item for item in uploads}

    messages = []
    for filename in files_by_name:
        success, errors = save_uploaded_file(files_by_name, filename)
        if success:
            # File names come from the client: escape before embedding in HTML.
            messages.append(f"<span style='color: green;'>‚úì {html.escape(filename)} uploaded successfully</span>")
        else:
            messages.extend(
                f"<span style='color: red;'>‚úó {html.escape(error)}</span>"
                for error in errors
            )

    upload_status.value = "<br>".join(messages)

    # Update run button state
    check_required_files()

# Process uploads as soon as the widget's value changes.
file_upload.observe(handle_file_upload, names='value')

# Parameters section heading
parameters_html = widgets.HTML(
    value="<h3 style='margin-top: 30px;'>Analysis Parameters</h3>"
)

# Common parameters (read by run_analysis_wrapper for every analysis type)
bootstrap_iterations = widgets.IntSlider(
    value=500,
    min=100,
    max=2000,
    step=100,
    description='Bootstrap Iterations:',
    style={'description_width': '150px'},
    layout=widgets.Layout(width='500px')
)

fdr_alpha = widgets.FloatSlider(
    value=0.05,
    min=0.01,
    max=0.10,
    step=0.01,
    description='FDR Alpha:',
    style={'description_width': '150px'},
    layout=widgets.Layout(width='500px')
)

# Seed passed to np.random.seed by the analysis functions.
random_seed = widgets.IntText(
    value=42,
    description='Random Seed:',
    style={'description_width': '150px'},
    layout=widgets.Layout(width='250px')
)

# Analysis-specific parameters, intended for one analysis type each. Their
# values are nevertheless always included in the params dict built by
# run_analysis_wrapper.
max_clusters = widgets.IntSlider(
    value=8,
    min=2,
    max=15,
    step=1,
    description='Max Clusters:',
    style={'description_width': '150px'},
    layout=widgets.Layout(width='500px')
)

mdr_threshold = widgets.IntSlider(
    value=3,
    min=2,
    max=5,
    step=1,
    description='MDR Threshold:',
    style={'description_width': '150px'},
    layout=widgets.Layout(width='500px')
)

# Run button: starts disabled; check_required_files enables it once every
# required file for the selected analysis has been uploaded.
run_button = widgets.Button(
    description='‚ñ∂ Run Analysis',
    button_style='success',
    icon='play',
    layout=widgets.Layout(width='200px', height='50px'),
    disabled=True
)

# Progress and output: progress_output captures what the analysis prints,
# results_output holds the HTML summary with download links.
progress_output = widgets.Output()
results_output = widgets.HTML()

print("‚úì UI components created")

In [None]:
# Analysis execution functions

def check_required_files():
    """Enable the Run button only when no required upload is missing.

    Looks up the required file names of the currently selected analysis and
    compares them with the names registered in ``uploaded_files``.

    Returns:
        True when every required file has been uploaded, otherwise False.
    """
    selected = analysis_dropdown.value
    missing = [
        name
        for name in ANALYSIS_TYPES[selected]['required_files']
        if name not in uploaded_files
    ]
    ready = not missing
    run_button.disabled = not ready
    return ready

def run_cluster_analysis(params):
    """Run the cluster-analysis step of the dashboard (placeholder).

    Loads the three uploaded CSV files, prints their dimensions and the
    chosen parameters, and returns the paths the reports would be written to.
    No clustering is performed here and the report files are not created by
    this function; the real analysis script still has to be wired in.

    Args:
        params: Dict with the keys ``'random_seed'``, ``'max_clusters'``,
            ``'bootstrap_iterations'`` and ``'fdr_alpha'``.

    Returns:
        Dict with ``'html_file'``, ``'excel_file'`` (timestamped paths under
        ``output_folder``) and ``'summary'`` (one-line description).

    Raises:
        KeyError: If one of the required files is not in ``uploaded_files``.
    """
    print("\n" + "="*60)
    print("CLUSTER ANALYSIS")
    print("="*60)

    # Make the parent directory importable for the analysis scripts, but only
    # once: inserting unconditionally grows sys.path on every run.
    repo_root = os.path.abspath('../')
    if repo_root not in sys.path:
        sys.path.insert(0, repo_root)

    # Set random seed (seeds NumPy's global generator)
    np.random.seed(params['random_seed'])

    print("\n‚úì Loading data files...")
    print("  - MIC.csv")
    print("  - AMR_genes.csv")
    print("  - Virulence.csv")

    # Load data
    mic_df = pd.read_csv(uploaded_files['MIC.csv'])
    amr_df = pd.read_csv(uploaded_files['AMR_genes.csv'])
    vir_df = pd.read_csv(uploaded_files['Virulence.csv'])

    # Feature counts exclude one column per table (the Strain_ID column that
    # upload validation requires); the strain count is taken from MIC.csv.
    print("\n‚úì Data loaded successfully")
    print(f"  - Number of strains: {len(mic_df)}")
    print(f"  - MIC features: {len(mic_df.columns)-1}")
    print(f"  - AMR features: {len(amr_df.columns)-1}")
    print(f"  - Virulence features: {len(vir_df.columns)-1}")

    print("\n‚öô Analysis parameters:")
    print(f"  - Max clusters: {params['max_clusters']}")
    print(f"  - Bootstrap iterations: {params['bootstrap_iterations']}")
    print(f"  - FDR alpha: {params['fdr_alpha']}")
    print(f"  - Random seed: {params['random_seed']}")

    print("\n‚ñ∂ Running clustering analysis...")
    print("  This may take several minutes depending on data size.")

    # Note: For full implementation, we would import and run the actual
    # analysis script here. For now only the output paths are computed.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    html_file = os.path.join(output_folder, f"cluster_analysis_report_{timestamp}.html")
    excel_file = os.path.join(output_folder, f"cluster_analysis_report_{timestamp}.xlsx")

    print("\n‚úì Analysis completed successfully!")
    print("\nüìä Results:")
    print(f"  - HTML report: {os.path.basename(html_file)}")
    print(f"  - Excel report: {os.path.basename(excel_file)}")
    print("  - PNG charts saved in: png_charts/")

    return {
        'html_file': html_file,
        'excel_file': excel_file,
        'summary': f"Clustered {len(mic_df)} strains across 3 data categories"
    }

def run_mdr_analysis(params):
    """Run the MDR-analysis step of the dashboard (placeholder).

    Loads the uploaded resistance table, prints its dimensions and the chosen
    parameters, and returns the paths the reports would be written to. No MDR
    analysis is performed here and the report files are not created by this
    function; the real analysis script still has to be wired in.

    Args:
        params: Dict with the keys ``'random_seed'``, ``'mdr_threshold'`` and
            ``'bootstrap_iterations'``.

    Returns:
        Dict with ``'html_file'``, ``'excel_file'`` (timestamped paths under
        ``output_folder``) and ``'summary'`` (one-line description).

    Raises:
        KeyError: If ``resistance_data.csv`` is not in ``uploaded_files``.
    """
    print("\n" + "="*60)
    print("MDR ANALYSIS")
    print("="*60)

    # Make the parent directory importable for the analysis scripts, but only
    # once: inserting unconditionally grows sys.path on every run.
    repo_root = os.path.abspath('../')
    if repo_root not in sys.path:
        sys.path.insert(0, repo_root)
    np.random.seed(params['random_seed'])

    print("\n‚úì Loading resistance data...")
    res_df = pd.read_csv(uploaded_files['resistance_data.csv'])

    # The feature count excludes one column (the Strain_ID column that upload
    # validation requires).
    print("\n‚úì Data loaded successfully")
    print(f"  - Number of strains: {len(res_df)}")
    print(f"  - Resistance features: {len(res_df.columns)-1}")

    print("\n‚öô Analysis parameters:")
    print(f"  - MDR threshold: {params['mdr_threshold']} classes")
    print(f"  - Bootstrap iterations: {params['bootstrap_iterations']}")

    print("\n‚ñ∂ Running MDR analysis...")

    # Only the output paths are computed here; nothing is written.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    html_file = os.path.join(output_folder, f"mdr_analysis_report_{timestamp}.html")
    excel_file = os.path.join(output_folder, f"mdr_analysis_report_{timestamp}.xlsx")

    print("\n‚úì Analysis completed successfully!")
    print("\nüìä Results:")
    print(f"  - HTML report: {os.path.basename(html_file)}")
    print(f"  - Excel report: {os.path.basename(excel_file)}")

    return {
        'html_file': html_file,
        'excel_file': excel_file,
        'summary': f"Analyzed MDR patterns in {len(res_df)} strains"
    }

def run_analysis_wrapper(button):
    """Run the selected analysis in response to a click on the Run button.

    Output printed by the analysis is captured in ``progress_output``. The
    button is disabled and relabelled while the analysis runs and restored
    afterwards, whether or not the analysis succeeded. Errors are printed
    (with traceback) instead of being raised.

    Args:
        button: The clicked button, as passed by ``on_click`` (unused).
    """
    with progress_output:
        clear_output()

        try:
            # Disable button during analysis
            run_button.disabled = True
            run_button.description = '‚è≥ Running...'

            # Remove the previous run's result panel so a failed or
            # unimplemented run is not shown next to an old success message.
            results_output.value = ""

            # Get parameters
            params = {
                'bootstrap_iterations': bootstrap_iterations.value,
                'fdr_alpha': fdr_alpha.value,
                'random_seed': random_seed.value,
                'max_clusters': max_clusters.value,
                'mdr_threshold': mdr_threshold.value
            }

            # Run selected analysis; only these two are implemented here.
            analysis_type = analysis_dropdown.value
            runners = {
                'cluster': run_cluster_analysis,
                'mdr': run_mdr_analysis,
            }
            runner = runners.get(analysis_type)

            if runner is None:
                print(f"\n‚ö† {ANALYSIS_TYPES[analysis_type]['name']} is not yet implemented in this demo.")
                print("Please use the standalone Python scripts for full functionality.")
                result = None
            else:
                result = runner(params)

            # Display results
            if result:
                display_results(result)

        except Exception as e:
            # UI boundary: show the problem in the output area, do not raise.
            print(f"\n‚ùå Error: {str(e)}")
            import traceback
            traceback.print_exc()

        finally:
            # Re-enable button
            run_button.disabled = False
            run_button.description = '‚ñ∂ Run Analysis'

def display_results(result):
    """Show the result panel: summary text plus links to both reports.

    Args:
        result: Dict with ``'html_file'``, ``'excel_file'`` and ``'summary'``
            as returned by the analysis functions.
    """
    report_links = {
        'html': create_download_link(result['html_file'], 'üìÑ Download HTML Report'),
        'excel': create_download_link(result['excel_file'], 'üìä Download Excel Report'),
    }

    results_output.value = f"""
    <div style='background: #d4edda; padding: 20px; border-radius: 5px; 
                border-left: 4px solid #28a745; margin-top: 20px;'>
        <h3 style='color: #155724; margin-top: 0;'>‚úì Analysis Completed Successfully!</h3>
        <p><strong>Summary:</strong> {result['summary']}</p>
        <div style='margin-top: 15px;'>
            <p><strong>Download Reports:</strong></p>
            <p>{report_links['html']}</p>
            <p>{report_links['excel']}</p>
        </div>
        <p style='margin-top: 15px; font-size: 0.9em; color: #666;'>
            Reports include interactive visualizations, statistical results, and detailed methodology.
        </p>
    </div>
    """

# Start the selected analysis when the Run button is clicked.
run_button.on_click(run_analysis_wrapper)

# The set of required files depends on the selected analysis, so the Run
# button state must be recomputed when the selection changes, not only when
# files are uploaded.
analysis_dropdown.observe(lambda change: check_required_files(), names='value')

print("‚úì Analysis functions defined")

In [None]:
# Display the dashboard

# Instructions
instructions = widgets.HTML(
    value="""
    <div style='background: #fff3cd; padding: 15px; border-radius: 5px; 
                border-left: 4px solid #ffc107; margin-bottom: 20px;'>
        <h3 style='margin-top: 0;'>üìã Instructions</h3>
        <ol>
            <li><strong>Select Analysis Type:</strong> Choose the type of analysis you want to perform</li>
            <li><strong>Upload Files:</strong> Upload all required CSV files (check file format requirements)</li>
            <li><strong>Adjust Parameters:</strong> Modify analysis parameters if needed (defaults are recommended)</li>
            <li><strong>Run Analysis:</strong> Click the "Run Analysis" button and wait for completion</li>
            <li><strong>Download Results:</strong> Download HTML and Excel reports when analysis is complete</li>
        </ol>
        <p><strong>Data Format Requirements:</strong></p>
        <ul>
            <li>All CSV files must have a <code>Strain_ID</code> column</li>
            <li>All data values must be binary: <code>0</code> (absence) or <code>1</code> (presence)</li>
            <li>No missing values allowed - encode as 0 or 1</li>
        </ul>
    </div>
    """
)

# Layout: widgets are displayed top to bottom in the order of these calls.
display(title_html)
display(instructions)

display(widgets.HTML("<h2>1. Select Analysis Type</h2>"))
display(analysis_dropdown)
display(analysis_info)

display(widgets.HTML("<h2 style='margin-top: 30px;'>2. Upload Data Files</h2>"))
display(file_upload)
display(upload_status)

display(widgets.HTML("<h2 style='margin-top: 30px;'>3. Configure Parameters</h2>"))
display(bootstrap_iterations)
display(fdr_alpha)
display(random_seed)
# The analysis-specific sliders are part of the layout too; which one is
# visible is controlled by show_specific_params below.
display(max_clusters)
display(mdr_threshold)

# Show analysis-specific parameters conditionally
def show_specific_params(change):
    """Show only the parameter slider used by the selected analysis.

    ``max_clusters`` is read by the cluster analysis and ``mdr_threshold`` by
    the MDR analysis; for every other analysis type both are hidden.

    Args:
        change: Mapping with a ``'new'`` entry holding an ``ANALYSIS_TYPES``
            key, as delivered by the dropdown's value observer.
    """
    selected = change['new']
    # A layout display of 'none' hides the widget; None restores the default.
    max_clusters.layout.display = None if selected == 'cluster' else 'none'
    mdr_threshold.layout.display = None if selected == 'mdr' else 'none'

analysis_dropdown.observe(show_specific_params, names='value')
show_specific_params({'new': analysis_dropdown.value})  # Initialize

display(widgets.HTML("<h2 style='margin-top: 30px;'>4. Run Analysis</h2>"))
display(run_button)
display(progress_output)
display(results_output)

# Footer
footer = widgets.HTML(
    value="""
    <div style='margin-top: 50px; padding: 20px; background: #f8f9fa; 
                border-radius: 5px; text-align: center;'>
        <p style='margin: 0;'>
            <strong>MKrep</strong> - Comprehensive Bioinformatics Analysis Pipeline<br>
            <a href='https://github.com/MK-vet/MKrep' target='_blank'>GitHub Repository</a> | 
            <a href='https://github.com/MK-vet/MKrep/issues' target='_blank'>Report Issues</a><br>
            Version 1.0.0 | MIT License
        </p>
    </div>
    """
)
display(footer)