In [3]:
from Bio import AlignIO
import os
from pathlib import Path

# Configuration
# Path of the alignment file; main() parses it with AlignIO.read(..., "clustal").
msa_file = "/home/hp/nayanika/github/GPX6/analysis/structure/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num"
# Only the first `max_sequences` records of the alignment are rendered in the HTML;
# position highlighting is still applied to every rendered sequence.
max_sequences = 3
# The HTML report path is the input path with a fixed suffix appended.
output_html = msa_file + "_clean_aligned.html"

# Color palette and positions to highlight
# Colors are handed out in list order by get_position_color_map() and wrap around
# (index modulo palette length) when there are more positions than colors.
color_palette = [
    "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33",
    "#A65628", "#F781BF", "#1B9E77", "#D95F02", "#7570B3", "#66A61E",
    "#E31A1C", "#1F78B4", "#33A02C", "#FB9A99", "#CAB2D6", "#FDBF6F"
]
# 1-based alignment column numbers to highlight; they are compared against
# `i + 1` of each column index, so gaps count as columns.
positions_to_mark = [3, 4, 48, 52, 47, 99, 54, 177, 144, 178, 74, 143, 139, 87, 142, 102, 104, 107, 24, 60, 181, 173]

def validate_file(filepath):
    """Check that the MSA input exists, is a regular file and can be read.

    Args:
        filepath: Path to the alignment file.

    Returns:
        True when every check passes.

    Raises:
        FileNotFoundError: If the path does not exist, or exists but is not a
            regular file (for example a directory).
        PermissionError: If the file exists but is not readable by the
            current process.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"MSA file not found: {filepath}")
    # A directory passes the existence check but cannot be parsed as an
    # alignment, so reject it here with a clear message.
    if not os.path.isfile(filepath):
        raise FileNotFoundError(f"MSA path is not a regular file: {filepath}")
    if not os.access(filepath, os.R_OK):
        raise PermissionError(f"MSA file is not readable: {filepath}")
    return True

def get_position_color_map():
    """Pair every entry of ``positions_to_mark`` with a palette color.

    Colors are taken from ``color_palette`` in list order; the palette index
    wraps around (modulo its length) once all colors have been used. If a
    position is listed more than once, the color of its last occurrence wins.

    Returns:
        dict mapping each position to its color string.
    """
    palette_size = len(color_palette)
    return {
        position: color_palette[index % palette_size]
        for index, position in enumerate(positions_to_mark)
    }

def format_aligned_sequence(aligned_seq, position_colors):
    """Render one aligned sequence as a run of HTML ``<span>`` elements.

    Every alignment column (residue or gap) becomes exactly one span. Columns
    whose 1-based index is a key of ``position_colors`` get an inline
    background color and the ``highlighted`` class.

    Args:
        aligned_seq: The aligned sequence as a string; ``"-"`` marks a gap.
        position_colors: Mapping of 1-based column number to a CSS color.

    Returns:
        The concatenated HTML for all columns, or ``""`` for an empty sequence.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    spans = []
    for position, aa in enumerate(aligned_seq, start=1):
        is_gap = aa == "-"
        if position in position_colors:
            color = position_colors[position]
            if is_gap:
                spans.append(
                    f'<span class="gap highlighted" style="background-color: {color}; opacity: 0.7;">{aa}</span>'
                )
            else:
                spans.append(
                    f'<span class="aa highlighted" style="background-color: {color}; color: white; font-weight: bold;">{aa}</span>'
                )
        elif is_gap:
            spans.append(f"<span class='gap'>{aa}</span>")
        else:
            spans.append(f"<span class='aa'>{aa}</span>")
    return "".join(spans)

def get_sequence_stats(alignment):
    """Summarize an alignment: record count, width and ungapped lengths.

    Args:
        alignment: An alignment object that supports ``len()``, iteration over
            records exposing ``.seq``, and ``get_alignment_length()``.

    Returns:
        dict with the keys ``num_sequences``, ``alignment_length``,
        ``max_non_gap_length``, ``min_non_gap_length`` and
        ``avg_non_gap_length``. The three length statistics are 0 when the
        alignment contains no records.
    """
    # Ungapped length of each record = number of characters that are not "-".
    non_gap_lengths = [
        sum(1 for aa in str(record.seq) if aa != "-")
        for record in alignment
    ]

    return {
        'num_sequences': len(alignment),
        'alignment_length': alignment.get_alignment_length(),
        'max_non_gap_length': max(non_gap_lengths, default=0),
        # default=0 keeps an empty alignment from reporting float('inf').
        'min_non_gap_length': min(non_gap_lengths, default=0),
        'avg_non_gap_length': (
            sum(non_gap_lengths) / len(non_gap_lengths) if non_gap_lengths else 0
        ),
    }

def generate_position_ruler(alignment_length, position_colors):
    """Build the HTML ruler row shown above the sequences.

    One span is emitted per alignment column (1-based). Every 10th column
    shows its number modulo 100, every other 5th column shows a dot, and the
    remaining columns are blank. Columns present in ``position_colors`` are
    drawn on that color instead: as a numbered tick on multiples of 10,
    otherwise as a triangle marker.

    Args:
        alignment_length: Number of columns in the alignment.
        position_colors: Mapping of 1-based column number to a CSS color.

    Returns:
        The ruler as an HTML string wrapped in ``<div class="ruler">``.
    """

    def cell(column):
        # Span for a single column; highlighted columns take precedence.
        on_tick = column % 10 == 0
        if column in position_colors:
            shade = position_colors[column]
            if on_tick:
                return f'<span class="ruler-tick highlighted" style="background-color: {shade}; color: white; font-weight: bold;">{column % 100}</span>'
            return f'<span class="ruler-highlight" style="background-color: {shade};">▲</span>'
        if on_tick:
            return f'<span class="ruler-tick">{column % 100}</span>'
        if column % 5 == 0:
            return '<span class="ruler-minor">·</span>'
        return '<span class="ruler-space"> </span>'

    pieces = ['<div class="ruler">\n', '<span class="seq-name">Position</span>']
    pieces.extend(cell(column) for column in range(1, alignment_length + 1))
    pieces.append('\n</div>\n')
    return "".join(pieces)

def generate_legend(position_colors):
    """Build the HTML legend listing each highlighted position and its color.

    Args:
        position_colors: Mapping of position number to a CSS color.

    Returns:
        An HTML string: a ``<div class="legend">`` holding one
        ``legend-item`` per position, in ascending position order.
    """
    # One swatch + label entry per position, lowest position first.
    entries = [
        '<div class="legend-item">\n'
        f'<span class="legend-color" style="background-color: {position_colors[pos]};"></span>\n'
        f'<span class="legend-label">Position {pos}</span>\n'
        '</div>\n'
        for pos in sorted(position_colors)
    ]

    opening = (
        '<div class="legend">\n'
        '<h3>Highlighted Positions</h3>\n'
        '<div class="legend-items">\n'
    )
    closing = '</div>\n' + '</div>\n'
    return opening + "".join(entries) + closing

def generate_html_content(alignment, stats, position_colors):
    """Assemble the complete HTML report for the alignment.

    The page consists of a header, a statistics bar, the position legend, a
    position ruler, one row per displayed sequence (at most ``max_sequences``)
    and a footer naming the input and output files.

    Sequence identifiers and file names are HTML-escaped before being placed
    in the page, so characters such as ``&``, ``<`` or ``"`` cannot break the
    markup or the ``title`` attribute. The highlighted-position count and the
    footer list are derived from ``position_colors`` (the mapping actually
    used for rendering) so they always match what is drawn.

    Args:
        alignment: Indexable alignment whose records expose ``.id`` and ``.seq``.
        stats: dict as returned by ``get_sequence_stats``; the keys
            ``num_sequences``, ``alignment_length`` and ``avg_non_gap_length``
            are read.
        position_colors: Mapping of 1-based column number to a CSS color.

    Returns:
        The full HTML document as a single string.
    """
    # Local import keeps this block self-contained; only the stdlib is needed.
    from html import escape

    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>MSA Clean Alignment with Position Highlighting</title>
    <style>
        * {{
            box-sizing: border-box;
        }}
        
        body {{ 
            font-family: 'Courier New', monospace; 
            background: #f8f9fa;
            margin: 0;
            padding: 20px; 
            font-size: 14px;
            line-height: 1.6;
        }}
        
        .container {{
            max-width: 100%;
            margin: 0 auto;
            background: white;
            border-radius: 8px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
            overflow: hidden;
        }}
        
        .header {{
            background: linear-gradient(135deg, #4a90e2 0%, #357abd 100%);
            color: white;
            padding: 20px;
            text-align: center;
        }}
        
        .header h1 {{
            margin: 0 0 10px 0;
            font-size: 24px;
            font-weight: 600;
        }}
        
        .stats {{
            background: #f8f9fa;
            padding: 15px 20px;
            border-bottom: 1px solid #dee2e6;
            display: flex;
            justify-content: space-around;
            flex-wrap: wrap;
        }}
        
        .stat-item {{
            text-align: center;
            margin: 5px;
        }}
        
        .stat-value {{
            font-size: 20px;
            font-weight: bold;
            color: #4a90e2;
        }}
        
        .stat-label {{
            font-size: 12px;
            color: #6c757d;
            text-transform: uppercase;
        }}
        
        .legend {{
            background: #f8f9fa;
            padding: 20px;
            border-bottom: 1px solid #dee2e6;
        }}
        
        .legend h3 {{
            margin: 0 0 15px 0;
            color: #495057;
            font-size: 16px;
        }}
        
        .legend-items {{
            display: flex;
            flex-wrap: wrap;
            gap: 10px;
        }}
        
        .legend-item {{
            display: flex;
            align-items: center;
            margin: 2px 8px;
        }}
        
        .legend-color {{
            width: 16px;
            height: 16px;
            border-radius: 3px;
            margin-right: 6px;
            border: 1px solid #ccc;
        }}
        
        .legend-label {{
            font-size: 12px;
            color: #495057;
            font-weight: 500;
        }}
        
        .msa-container {{
            padding: 20px;
            overflow-x: auto;
            background: #ffffff;
        }}
        
        .sequence-row {{
            margin: 8px 0;
            white-space: nowrap;
            font-family: 'Courier New', monospace;
            display: flex;
            align-items: center;
        }}
        
        .seq-name {{
            display: inline-block;
            width: 300px;
            font-weight: bold;
            margin-right: 20px;
            padding: 8px 12px;
            background: linear-gradient(135deg, #e9ecef 0%, #dee2e6 100%);
            color: #495057;
            border-radius: 4px;
            overflow: hidden;
            text-overflow: ellipsis;
            white-space: nowrap;
            font-size: 13px;
            flex-shrink: 0;
        }}
        
        .sequence {{
            font-family: 'Courier New', monospace;
            letter-spacing: 1px;
            font-size: 14px;
            flex-grow: 1;
        }}
        
        .aa {{
            color: #2c3e50;
            font-weight: 600;
            padding: 2px 1px;
            border-radius: 2px;
        }}
        
        .gap {{
            color: #95a5a6;
            opacity: 0.6;
            font-weight: 300;
            padding: 2px 1px;
            border-radius: 2px;
        }}
        
        .highlighted {{
            border-radius: 3px;
            padding: 2px 3px;
            margin: 0 1px;
            box-shadow: 0 1px 2px rgba(0,0,0,0.1);
        }}
        
        .ruler {{
            margin: 15px 0;
            border-bottom: 2px solid #dee2e6;
            padding-bottom: 8px;
            font-size: 11px;
            display: flex;
            align-items: center;
        }}
        
        .ruler-tick {{
            color: #495057;
            font-weight: bold;
            font-size: 10px;
            width: 15px;
            text-align: center;
            padding: 1px;
            border-radius: 2px;
        }}
        
        .ruler-minor {{
            color: #6c757d;
            font-weight: normal;
            width: 15px;
            text-align: center;
        }}
        
        .ruler-space {{
            width: 15px;
            text-align: center;
        }}
        
        .ruler-highlight {{
            width: 15px;
            text-align: center;
            font-size: 8px;
            color: white;
            font-weight: bold;
        }}
        
        .footer {{
            background: #f8f9fa;
            padding: 15px 20px;
            border-top: 1px solid #dee2e6;
            font-size: 12px;
            color: #6c757d;
            text-align: center;
        }}
        
        /* Responsive design */
        @media (max-width: 768px) {{
            .seq-name {{
                width: 200px;
                font-size: 11px;
            }}
            
            .sequence {{
                font-size: 12px;
                letter-spacing: 0.5px;
            }}
            
            body {{
                font-size: 12px;
                padding: 10px;
            }}
            
            .legend-items {{
                gap: 5px;
            }}
            
            .legend-item {{
                margin: 1px 4px;
            }}
        }}
        
        /* Improved alignment with monospace consistency */
        .msa {{
            font-family: 'Courier New', 'Lucida Console', monospace;
            font-size: 14px;
            line-height: 1.8;
        }}
        
        /* Enhanced visual hierarchy */
        .sequence-row:nth-child(even) {{
            background: rgba(74, 144, 226, 0.02);
        }}
        
        .sequence-row:hover {{
            background: rgba(74, 144, 226, 0.05);
        }}
        
        /* Highlight effects */
        .highlighted:hover {{
            transform: scale(1.05);
            transition: transform 0.1s ease;
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>Multiple Sequence Alignment</h1>
            <p>Clean Alignment View with Position Highlighting - Top {max_sequences} Sequences</p>
        </div>
        
        <div class="stats">
            <div class="stat-item">
                <div class="stat-value">{min(stats['num_sequences'], max_sequences)}</div>
                <div class="stat-label">Sequences Shown</div>
            </div>
            <div class="stat-item">
                <div class="stat-value">{stats['alignment_length']}</div>
                <div class="stat-label">Alignment Length</div>
            </div>
            <div class="stat-item">
                <div class="stat-value">{stats['avg_non_gap_length']:.0f}</div>
                <div class="stat-label">Avg. Seq Length</div>
            </div>
            <div class="stat-item">
                <div class="stat-value">{len(position_colors)}</div>
                <div class="stat-label">Highlighted Positions</div>
            </div>
        </div>"""

    # Add legend
    html_content += generate_legend(position_colors)

    html_content += """
        <div class="msa-container">
            <div class="msa">"""

    # Add position ruler
    html_content += generate_position_ruler(stats['alignment_length'], position_colors)

    # Only the first `max_sequences` records are rendered.
    sequences_to_show = min(len(alignment), max_sequences)

    for i in range(sequences_to_show):
        record = alignment[i]
        # Truncate long identifiers for display; the full id stays in the tooltip.
        seq_name = record.id
        if len(seq_name) > 45:
            seq_name = seq_name[:42] + "..."

        formatted_seq = format_aligned_sequence(str(record.seq), position_colors)
        html_content += '                <div class="sequence-row">\n'
        # Escape after truncating so an entity is never cut in half.
        html_content += f'                    <span class="seq-name" title="{escape(record.id)}">{escape(seq_name)}</span>\n'
        html_content += f'                    <span class="sequence">{formatted_seq}</span>\n'
        html_content += '                </div>\n'

    html_content += f"""            </div>
        </div>
        
        <div class="footer">
            <strong>File:</strong> {escape(Path(msa_file).name)}
            <br>
            <strong>Showing:</strong> {sequences_to_show} of {stats['num_sequences']} sequences
            <br>
            <strong>Highlighted Positions:</strong> {', '.join(map(str, sorted(position_colors)))}
            <br>
            <strong>Generated:</strong> {escape(Path(output_html).name)}
        </div>
    </div>
</body>
</html>"""

    return html_content

def main():
    """Read the configured alignment, write the HTML report and print a summary.

    Uses the module-level configuration (``msa_file``, ``output_html``,
    ``max_sequences``, ``positions_to_mark``). Progress and a final summary
    are printed to stdout.

    Raises:
        Exception: Any error raised while validating, parsing or writing is
            reported on stdout and then re-raised unchanged.
    """
    try:
        # Validate input file
        validate_file(msa_file)
        print(f"✅ Reading alignment from: {msa_file}")

        # Read the alignment
        alignment = AlignIO.read(msa_file, "clustal")
        print(f"✅ Loaded {len(alignment)} sequences")

        # Get alignment statistics
        stats = get_sequence_stats(alignment)
        print(f"✅ Alignment length: {stats['alignment_length']}")
        print(f"✅ Will show top {max_sequences} sequences for better alignment")

        # Create position color mapping
        position_colors = get_position_color_map()
        print(f"✅ Highlighting {len(positions_to_mark)} positions with colors")

        # Generate HTML content
        print("🔄 Generating clean HTML content with position highlighting...")
        html_content = generate_html_content(alignment, stats, position_colors)

        # Write HTML file
        with open(output_html, "w", encoding="utf-8") as f:
            f.write(html_content)

        print(f"✅ Clean HTML file generated: {output_html}")
        print(f"📊 File size: {os.path.getsize(output_html):,} bytes")

        print("\n" + "="*60)
        print("CLEAN ALIGNMENT SUMMARY:")
        print("="*60)
        print(f"📁 Input file: {Path(msa_file).name}")
        print(f"📄 Output file: {Path(output_html).name}")
        print(f"🔢 Total sequences available: {stats['num_sequences']}")
        print(f"👁️ Sequences displayed: {min(stats['num_sequences'], max_sequences)}")
        print(f"📏 Alignment length: {stats['alignment_length']}")
        print(f"🎨 Highlighted positions: {len(positions_to_mark)}")
        print(f"🔗 Open the HTML file in your browser to view the clean alignment!")

        # Show first few sequence names
        print(f"\n📋 Sequences being displayed:")
        shown = min(len(alignment), max_sequences)
        for i in range(shown):
            print(f"  {i+1}. {alignment[i].id}")

        # Show highlighted positions. Read the colors from the same mapping the
        # HTML was rendered with, so the console listing cannot disagree with
        # the page (and no per-position list search is needed).
        print(f"\n🎨 Highlighted positions with colors:")
        for pos in sorted(position_colors):
            print(f"  Position {pos}: {position_colors[pos]}")

    except Exception as e:
        # Top-level boundary: report the failure, then let it propagate.
        print(f"❌ Error: {e}")
        raise

if __name__ == "__main__":
    main()

✅ Reading alignment from: /home/hp/nayanika/github/GPX6/analysis/structure/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num
✅ Loaded 3 sequences
✅ Alignment length: 197
✅ Will show top 3 sequences for better alignment
✅ Highlighting 22 positions with colors
🔄 Generating clean HTML content with position highlighting...
✅ Clean HTML file generated: /home/hp/nayanika/github/GPX6/analysis/structure/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num_clean_aligned.html
📊 File size: 39,499 bytes

CLEAN ALIGNMENT SUMMARY:
📁 Input file: clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num
📄 Output file: clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num_clean_aligned.html
🔢 Total sequences available: 3
👁️ Sequences displayed: 3
📏 Alignment length: 197
🎨 Highlighted positions: 22
🔗 Open the HTML file in your browser to view the clean alignment!

📋 Sequences being displayed:
  1. protein_chainX
  2. protein_chainA
  3. protein_chainY

🎨 Highlighted positions wi