In [6]:
from Bio import AlignIO
import os
from pathlib import Path

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# Clustal-format alignment that will be rendered.
msa_file = "/home/hp/nayanika/github/GPX6/analysis/alignment/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num"
# Upper bound on the number of alignment rows drawn (counted from the top).
max_sequences = 3
# The HTML report is written next to the input file, with a suffix appended.
output_html = f"{msa_file}_paper_ready.html"

# Background colours handed out, in order, to the highlighted positions.
# When there are more positions than colours, colours are reused cyclically.
color_palette = [
    "#E41A1C",
    "#377EB8",
    "#4DAF4A",
    "#984EA3",
    "#FF7F00",
    "#FFFF33",
    "#A65628",
    "#F781BF",
    "#1B9E77",
    "#D95F02",
    "#7570B3",
    "#66A61E",
    "#E31A1C",
    "#1F78B4",
    "#33A02C",
    "#FB9A99",
    "#CAB2D6",
    "#FDBF6F",
]

# 1-based alignment columns to highlight; the order decides which colour
# each column receives.
positions_to_mark = [
    3, 4, 48, 52, 47, 99, 54, 177, 144, 178, 74,
    143, 139, 87, 142, 102, 104, 107, 24, 60, 181, 173,
]

def validate_file(filepath):
    """Confirm that *filepath* exists on disk.

    Args:
        filepath: Path of the alignment file to check.

    Returns:
        True when the path exists.

    Raises:
        FileNotFoundError: If nothing exists at *filepath*.
    """
    if os.path.exists(filepath):
        return True
    raise FileNotFoundError(f"MSA file not found: {filepath}")

def get_position_color_map(positions=None, palette=None):
    """Assign a palette colour to every position that should be highlighted.

    Colours are handed out in the order the positions are listed. When there
    are more positions than colours the palette wraps around, so colours
    repeat.

    Args:
        positions: Iterable of positions to colour. Defaults to the
            module-level ``positions_to_mark``.
        palette: Sequence of CSS colour strings. Defaults to the
            module-level ``color_palette``.

    Returns:
        A dict mapping each position to its colour string. If a position is
        listed more than once, the colour of its last occurrence wins.

    Raises:
        ValueError: If there are positions to colour but the palette is empty.
    """
    if positions is None:
        positions = positions_to_mark
    if palette is None:
        palette = color_palette

    # Materialise once so any iterable is accepted and can be tested for
    # emptiness.
    positions = list(positions)
    if positions and not palette:
        # Without this guard the modulo below fails with an unhelpful
        # ZeroDivisionError.
        raise ValueError("color palette is empty; cannot assign highlight colors")

    palette_size = len(palette)
    return {
        pos: palette[index % palette_size]
        for index, pos in enumerate(positions)
    }

def _residue_span(position, aa, position_colors):
    """Return the HTML <span> for one alignment character."""
    is_gap = aa == "-"
    if position in position_colors:
        color = position_colors[position]
        if is_gap:
            return f'<span class="gap highlighted" style="background-color: {color}; opacity: 0.7;">{aa}</span>'
        return f'<span class="aa highlighted" style="background-color: {color}; color: white; font-weight: bold;">{aa}</span>'
    if is_gap:
        return f"<span class='gap'>{aa}</span>"
    return f"<span class='aa'>{aa}</span>"


def format_aligned_sequence(aligned_seq, position_colors):
    """Format aligned sequence with highlighting.

    Every character of the row is wrapped in its own ``<span>``. Characters
    whose 1-based index in the aligned row is a key of *position_colors* get
    an inline background colour; "-" characters are rendered with the gap
    classes instead of the residue classes.

    Args:
        aligned_seq: One alignment row as a string, with "-" marking gaps.
        position_colors: Mapping of 1-based column index to a CSS colour.

    Returns:
        The concatenated span markup, or "" for an empty row.
    """
    # Collect the pieces and join once rather than growing a string with +=
    # for every character.
    return "".join(
        _residue_span(position, aa, position_colors)
        for position, aa in enumerate(aligned_seq, start=1)
    )

def get_sequence_stats(alignment):
    """Summarise the dimensions of an alignment.

    Args:
        alignment: Object supporting ``len()`` and providing a
            ``get_alignment_length()`` method.

    Returns:
        A dict with ``num_sequences`` (the value of ``len(alignment)``) and
        ``alignment_length`` (the value of ``get_alignment_length()``).
    """
    row_count = len(alignment)
    column_count = alignment.get_alignment_length()
    return dict(num_sequences=row_count, alignment_length=column_count)

def generate_html_content(alignment, stats, position_colors, max_rows=None):
    """Generate a compact, single-page HTML with no scrolling.

    Args:
        alignment: Indexable collection of records, each with ``id`` and
            ``seq`` attributes.
        stats: Dict containing ``alignment_length`` (number of columns),
            used to size the sequence font.
        position_colors: Mapping of 1-based column index to CSS colour,
            passed through to ``format_aligned_sequence``.
        max_rows: Maximum number of records to render, taken from the top.
            Defaults to the module-level ``max_sequences``.

    Returns:
        The complete HTML document as a string.
    """
    # Local import keeps this function self-contained.
    from html import escape

    if max_rows is None:
        max_rows = max_sequences

    # A zero-column alignment would put a division by zero inside calc(),
    # which invalidates the whole declaration.
    columns = max(1, stats['alignment_length'])

    # Font sizing: calc() cannot multiply two lengths, so an expression such
    # as "12px * 50vw / N" is rejected and the font size never scales.
    # Instead, share the width left after the name column, margin and padding
    # (roughly 280px) between the columns, subtract the per-character
    # letter-spacing and padding (about 3px), and divide by the approximate
    # width-to-size ratio of a monospace glyph (0.6). The 16px cap keeps short
    # alignments from being blown up.
    page_head = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>MSA Paper Figure</title>
    <style>
        /* Full page layout */
        html, body {{
            margin: 0;
            padding: 0;
            height: 100%;
            overflow: hidden; /* Prevent scrolling */
            background: white;
            font-family: 'Courier New', monospace;
        }}

        .msa-container {{
            display: flex;
            flex-direction: column;
            align-items: center;
            justify-content: center;
            height: 100%;
            width: 100%;
            padding: 10px;
            box-sizing: border-box;
        }}

        h2 {{
            font-size: 22px;
            margin-bottom: 10px;
            color: #2c3e50;
        }}

        .sequence-row {{
            white-space: nowrap;
            display: flex;
            align-items: center;
            justify-content: center;
        }}

        /* Sequence name styling */
        .seq-name {{
            display: inline-block;
            width: 200px;
            font-weight: bold;
            margin-right: 10px;
            font-size: 14px;
            color: #2c3e50;
            text-align: right;
        }}

        /* Sequence area */
        .sequence {{
            font-family: 'Courier New', monospace;
            letter-spacing: 1px;
            font-size: min(16px, calc(((100vw - 280px) / {columns} - 3px) / 0.6));
            /* Dynamically scales with alignment length */
            display: inline-block;
        }}

        .aa {{
            color: #2c3e50;
            font-weight: 600;
            padding: 0 1px;
        }}

        .gap {{
            color: #95a5a6;
            opacity: 0.6;
            padding: 0 1px;
        }}

        .highlighted {{
            border-radius: 2px;
            padding: 0 2px;
        }}

        /* Make everything shrink nicely to fit */
        @media print {{
            body {{
                overflow: visible;
            }}
        }}
    </style>
</head>
<body>
    <div class="msa-container">
        <h2>Multiple Sequence Alignment</h2>
"""

    # Only display top N sequences
    rows = []
    for i in range(min(len(alignment), max_rows)):
        record = alignment[i]
        seq_name = record.id if len(record.id) <= 30 else record.id[:27] + "..."
        # Escape after truncating so an entity is never cut in half.
        safe_name = escape(seq_name)
        formatted_seq = format_aligned_sequence(str(record.seq), position_colors)
        rows.append(f"""        <div class="sequence-row">
            <span class="seq-name">{safe_name}</span>
            <span class="sequence">{formatted_seq}</span>
        </div>
""")

    page_tail = """
    </div>
</body>
</html>
"""
    return page_head + "".join(rows) + page_tail

def main():
    """Render the configured alignment to an HTML file.

    Checks that the input exists, parses it as a Clustal alignment, builds
    the highlighted HTML page and writes it to ``output_html``. Progress and
    a short summary are printed along the way. Any failure is reported
    (message plus traceback) and then re-raised to the caller.
    """
    try:
        validate_file(msa_file)
        print(f"Reading alignment from: {msa_file}")

        msa = AlignIO.read(msa_file, "clustal")
        summary = get_sequence_stats(msa)

        colors_by_position = get_position_color_map()
        print(f"Highlighting {len(positions_to_mark)} positions")

        page = generate_html_content(msa, summary, colors_by_position)

        # Write the finished page in one go, as UTF-8.
        Path(output_html).write_text(page, encoding="utf-8")

        print(f"Paper-ready HTML alignment generated: {output_html}")
        rows_shown = min(summary['num_sequences'], max_sequences)
        print(f"Sequences displayed: {rows_shown}")
        print(f"Alignment length: {summary['alignment_length']}")

    except Exception as exc:
        # Report the failure here, then let it propagate unchanged.
        print(f"Error: {exc}")
        import traceback
        traceback.print_exc()
        raise


if __name__ == "__main__":
    main()


Reading alignment from: /home/hp/nayanika/github/GPX6/analysis/alignment/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num
Highlighting 22 positions
Paper-ready HTML alignment generated: /home/hp/nayanika/github/GPX6/analysis/alignment/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num_paper_ready.html
Sequences displayed: 2
Alignment length: 197
