In [2]:
from Bio import AlignIO
import os
from pathlib import Path

# --- Run configuration -------------------------------------------------------
# Clustal-format alignment to read.
msa_file = "/home/hp/nayanika/github/GPX6/analysis/structure/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num"

# Positions to highlight; color_aligned_sequence() treats them as 1-based
# positions counted over each sequence's non-gap characters.
positions_to_mark = [
    3, 4, 48, 52, 47, 99, 54, 177, 144, 178, 74,
    143, 139, 87, 142, 102, 104, 107, 24, 60, 181, 173,
]

# The report is written next to the input, with a fixed suffix appended.
output_html = f"{msa_file}_enhanced_colored.html"

# Hex colours used as highlight backgrounds (18 entries).
color_palette = [
    "#E41A1C", "#377EB8", "#4DAF4A",
    "#984EA3", "#FF7F00", "#FFFF33",
    "#A65628", "#F781BF", "#1B9E77",
    "#D95F02", "#7570B3", "#66A61E",
    "#E31A1C", "#1F78B4", "#33A02C",
    "#FB9A99", "#CAB2D6", "#FDBF6F",
]

# Pair every position with a colour in listing order. The palette is repeated
# enough times to cover all positions, so colours wrap around once the palette
# is exhausted (zip stops at the last position).
position_colors = dict(
    zip(
        positions_to_mark,
        color_palette * (len(positions_to_mark) // len(color_palette) + 1),
    )
)

def validate_file(filepath):
    """Validate that the MSA file exists, is a regular file, and is readable.

    Args:
        filepath: Path to the alignment file.

    Returns:
        True when every check passes.

    Raises:
        FileNotFoundError: If nothing exists at ``filepath``, or the path
            exists but is not a regular file (for example a directory).
        PermissionError: If the file exists but the current process is not
            allowed to read it.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"MSA file not found: {filepath}")
    # A directory also "exists"; reject it here so the failure is reported
    # by validation instead of later, when the path is opened for parsing.
    if not os.path.isfile(filepath):
        raise FileNotFoundError(f"MSA path is not a regular file: {filepath}")
    if not os.access(filepath, os.R_OK):
        raise PermissionError(f"MSA file is not readable: {filepath}")
    return True

def color_aligned_sequence(aligned_seq, mark_positions, color_dict):
    """Render one aligned sequence as HTML, highlighting selected positions.

    Positions are counted over non-gap characters only: the first character
    that is not ``"-"`` is position 1, and gaps never advance the counter.

    Args:
        aligned_seq: The aligned sequence as a string, with ``"-"`` for gaps.
        mark_positions: Iterable of 1-based positions to highlight.
        color_dict: Mapping from position to the background colour to use.

    Returns:
        An HTML fragment. Highlighted characters are wrapped in
        ``<span class='highlight'>``, gaps in ``<span class='gap'>``, and all
        other characters are emitted unchanged.

    Raises:
        KeyError: If a position in ``mark_positions`` is reached but has no
            entry in ``color_dict``.
    """
    # Build the lookup set once so each residue costs a constant-time test
    # rather than a scan of the whole position list.
    marked = frozenset(mark_positions)
    pieces = []
    residue_index = 0  # 1-based count of non-gap characters seen so far

    for symbol in aligned_seq:
        if symbol == "-":
            pieces.append(f"<span class='gap'>{symbol}</span>")
            continue

        residue_index += 1
        if residue_index not in marked:
            pieces.append(symbol)
            continue

        color = color_dict[residue_index]
        # White text is used on every background except the yellow entry,
        # which gets black text instead.
        text_color = "black" if color == "#FFFF33" else "white"
        pieces.append(
            f"<span class='highlight' style='background-color:{color}; color:{text_color};'>{symbol}</span>"
        )

    # Join once at the end instead of growing a string inside the loop.
    return "".join(pieces)

def get_sequence_stats(alignment):
    """Collect basic size statistics for an alignment.

    Args:
        alignment: An object that supports ``len()``, iteration over records
            exposing a ``seq`` attribute, and ``get_alignment_length()``.

    Returns:
        A dict with the keys ``num_sequences``, ``alignment_length``,
        ``max_non_gap_length``, ``min_non_gap_length`` and
        ``avg_non_gap_length``. The three length figures count the characters
        of each sequence other than ``"-"``. For an alignment with no records
        they are all reported as zero.
    """
    non_gap_lengths = [
        sum(1 for aa in str(record.seq) if aa != "-") for record in alignment
    ]

    stats = {
        'num_sequences': len(alignment),
        'alignment_length': alignment.get_alignment_length(),
        'max_non_gap_length': 0,
        'min_non_gap_length': 0,
        'avg_non_gap_length': 0,
    }

    # With no records there is nothing to aggregate; keep the zero defaults
    # instead of dividing by zero or leaving the minimum at infinity.
    if non_gap_lengths:
        stats['max_non_gap_length'] = max(non_gap_lengths)
        stats['min_non_gap_length'] = min(non_gap_lengths)
        stats['avg_non_gap_length'] = sum(non_gap_lengths) / len(non_gap_lengths)

    return stats

def generate_position_ruler(alignment_length, mark_positions, position_colors):
    """Build the HTML for a tick ruler spanning the alignment columns.

    One cell is produced per column. Columns whose 0-based index is a
    multiple of ten get a ``|`` tick; every other column gets a blank spacer.
    A tick is drawn in the "marked" style when the column's 1-based number is
    in ``mark_positions``, using the colour stored for it in
    ``position_colors``.

    NOTE(review): only tick columns (1, 11, 21, ...) are ever compared with
    ``mark_positions``, and the comparison uses alignment column numbers.
    Confirm that this is the intended behaviour before enabling the ruler.
    """
    spacer = '<span class="ruler-space"> </span>'
    plain_tick = '<span class="ruler-tick">|</span>'

    cells = [
        '<div class="ruler">\n',
        '<span class="seq-name">Position Ruler</span>',
    ]

    for column in range(alignment_length):
        if column % 10 != 0:
            cells.append(spacer)
            continue

        label = column + 1  # 1-based number of this column
        if label in mark_positions:
            tint = position_colors[label]
            cells.append(
                f'<span class="ruler-marked" style="background-color:{tint};">|</span>'
            )
        else:
            cells.append(plain_tick)

    cells.append('\n</div>\n')
    return "".join(cells)

def generate_html_content(alignment, positions_to_mark, position_colors, stats):
    """Generate the complete HTML page for the highlighted alignment.

    The page consists of a header, a statistics panel, a colour legend with
    one entry per highlighted position, one row per sequence, and a footer.

    Args:
        alignment: Iterable of records exposing ``id`` and ``seq``.
        positions_to_mark: Positions to highlight in every sequence.
        position_colors: Mapping from position to its background colour.
        stats: Dict providing ``num_sequences``, ``alignment_length`` and
            ``avg_non_gap_length`` for the statistics panel.

    Returns:
        The full HTML document as a single string.

    Note:
        The footer reads the module-level ``msa_file`` and ``output_html``
        names. Record ids and file names are HTML-escaped before being
        inserted, so characters such as ``<``, ``&`` or ``"`` in them cannot
        break the markup.
    """
    # Imported locally so the function carries its own dependency.
    import html

    sorted_positions = sorted(positions_to_mark)
    position_list = ', '.join(map(str, sorted_positions))

    page_head = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>MSA Highlighted Positions</title>
    <style>
        * {{
            box-sizing: border-box;
        }}
        
        body {{ 
            font-family: 'Courier New', monospace; 
            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            margin: 0;
            padding: 20px; 
            font-size: 12px;
            line-height: 1.4;
        }}
        
        .container {{
            max-width: 100%;
            margin: 0 auto;
            background: white;
            border-radius: 10px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
            overflow: hidden;
        }}
        
        .header {{
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 20px;
            text-align: center;
        }}
        
        .header h1 {{
            margin: 0 0 10px 0;
            font-size: 24px;
        }}
        
        .stats {{
            background: #f8f9fa;
            padding: 15px;
            border-bottom: 1px solid #dee2e6;
        }}
        
        .stats-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 10px;
        }}
        
        .stat-item {{
            background: white;
            padding: 10px;
            border-radius: 5px;
            border-left: 4px solid #667eea;
        }}
        
        .legend {{
            margin: 20px;
            padding: 15px;
            background: #f8f9fa;
            border-radius: 8px;
            border: 1px solid #dee2e6;
        }}
        
        .legend h3 {{
            margin-top: 0;
            color: #495057;
        }}
        
        .legend-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fill, minmax(150px, 1fr));
            gap: 8px;
            margin-top: 10px;
        }}
        
        .legend-item {{
            padding: 5px 10px;
            color: white;
            font-weight: bold;
            border-radius: 5px;
            text-align: center;
            font-size: 11px;
        }}
        
        .msa-container {{
            padding: 20px;
            overflow-x: auto;
        }}
        
        .sequence-row {{
            margin: 2px 0;
            white-space: nowrap;
            font-family: 'Courier New', monospace;
        }}
        
        .seq-name {{
            display: inline-block;
            width: 250px;
            font-weight: bold;
            margin-right: 15px;
            padding: 2px 5px;
            background: #e9ecef;
            color: #495057;
            border-radius: 3px;
            vertical-align: top;
            overflow: hidden;
            text-overflow: ellipsis;
            white-space: nowrap;
        }}
        
        .sequence {{
            font-family: 'Courier New', monospace;
            letter-spacing: 0.5px;
        }}
        
        .highlight {{
            font-weight: bold;
            padding: 1px 2px;
            border-radius: 2px;
            margin: 0 1px;
        }}
        
        .gap {{
            color: #6c757d;
            opacity: 0.7;
        }}
        
        .ruler {{
            margin: 10px 0;
            border-bottom: 2px solid #dee2e6;
            padding-bottom: 5px;
        }}
        
        .ruler-tick {{
            color: #495057;
            font-weight: bold;
        }}
        
        .ruler-marked {{
            color: white;
            font-weight: bold;
            padding: 0 1px;
            border-radius: 2px;
        }}
        
        .ruler-space {{
            color: transparent;
        }}
        
        .footer {{
            background: #f8f9fa;
            padding: 15px 20px;
            border-top: 1px solid #dee2e6;
            font-size: 11px;
            color: #6c757d;
        }}
        
        @media (max-width: 768px) {{
            .seq-name {{
                width: 150px;
                font-size: 10px;
            }}
            
            body {{
                font-size: 10px;
            }}
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>Multiple Sequence Alignment</h1>
            <p>Highlighted Mutant Positions Analysis</p>
        </div>
        
        <div class="stats">
            <h3>Alignment Statistics</h3>
            <div class="stats-grid">
                <div class="stat-item">
                    <strong>Sequences:</strong> {stats['num_sequences']}
                </div>
                <div class="stat-item">
                    <strong>Alignment Length:</strong> {stats['alignment_length']}
                </div>
                <div class="stat-item">
                    <strong>Positions Highlighted:</strong> {len(positions_to_mark)}
                </div>
                <div class="stat-item">
                    <strong>Avg. Sequence Length:</strong> {stats['avg_non_gap_length']:.1f}
                </div>
            </div>
        </div>

        <div class="legend">
            <h3>Highlighted Positions Legend</h3>
            <div class="legend-grid">"""

    # Fragments are collected in a list and joined once at the end.
    parts = [page_head]

    # Legend: one swatch per highlighted position, in ascending order.
    for pos in sorted_positions:
        color = position_colors[pos]
        # Black text on the yellow swatch, white on every other colour.
        text_color = "white" if color != "#FFFF33" else "black"
        parts.append(
            f'                <div class="legend-item" style="background-color:{color}; color:{text_color};">Position {pos}</div>\n'
        )

    parts.append("""            </div>
        </div>

        <div class="msa-container">
            <div class="msa">""")

    # The position ruler is currently not rendered; the call is left here,
    # disabled, exactly as in the original:
    # parts.append(generate_position_ruler(stats['alignment_length'], positions_to_mark, position_colors))

    # One row per sequence: a (possibly shortened) name plus coloured residues.
    for record in alignment:
        # Names longer than 40 characters are cut to 37 plus an ellipsis; the
        # full id stays available through the title attribute.
        display_id = record.id[:40] if len(record.id) <= 40 else record.id[:37] + "..."
        # Escape after truncating so an entity is never cut in half.
        seq_name = html.escape(display_id)
        full_id = html.escape(record.id)
        colored_seq = color_aligned_sequence(str(record.seq), positions_to_mark, position_colors)
        parts.append('                <div class="sequence-row">\n')
        parts.append(f'                    <span class="seq-name" title="{full_id}">{seq_name}</span>\n')
        parts.append(f'                    <span class="sequence">{colored_seq}</span>\n')
        parts.append('                </div>\n')

    # File names are escaped too, since they are inserted as element text.
    source_name = html.escape(Path(msa_file).name)
    report_name = html.escape(Path(output_html).name)

    parts.append(f"""            </div>
        </div>
        
        <div class="footer">
            <strong>Summary:</strong> 
            {len(positions_to_mark)} positions highlighted: {position_list}
            <br>
            <strong>File:</strong> {source_name}
            <br>
            <strong>Generated:</strong> {report_name}
        </div>
    </div>
</body>
</html>""")

    return "".join(parts)

def main():
    """Run the whole pipeline: read the alignment, write the HTML, report.

    Progress and a short verification summary are printed to stdout. Any
    exception is reported with a one-line message and then re-raised.
    """
    try:
        # --- Load -----------------------------------------------------------
        validate_file(msa_file)
        print(f"✅ Reading alignment from: {msa_file}")

        alignment = AlignIO.read(msa_file, "clustal")
        print(f"✅ Loaded {len(alignment)} sequences")

        # --- Describe -------------------------------------------------------
        stats = get_sequence_stats(alignment)
        shortest = stats['min_non_gap_length']
        longest = stats['max_non_gap_length']
        print(f"✅ Alignment length: {stats['alignment_length']}")
        print(f"✅ Sequence lengths: {shortest}-{longest} (avg: {stats['avg_non_gap_length']:.1f})")

        # Warn when the largest requested position is beyond every sequence.
        max_position = max(positions_to_mark)
        if longest < max_position:
            print(f"⚠️  WARNING: Position {max_position} exceeds maximum sequence length ({longest})")

        # --- Render and save ------------------------------------------------
        print("🔄 Generating HTML content...")
        page = generate_html_content(alignment, positions_to_mark, position_colors, stats)

        with open(output_html, "w", encoding="utf-8") as out:
            out.write(page)

        print(f"✅ Enhanced HTML file generated: {output_html}")
        print(f"📊 File size: {os.path.getsize(output_html):,} bytes")

        # --- Verify ---------------------------------------------------------
        banner = "=" * 60
        print("\n" + banner)
        print("VERIFICATION SUMMARY:")
        print(banner)

        # Gather every sequence too short to contain the largest position.
        too_short = []
        for number, record in enumerate(alignment, start=1):
            residue_count = len(str(record.seq).replace("-", ""))
            if residue_count < max_position:
                too_short.append((number, record.id[:30], residue_count))

        # Only the first three are listed individually; the rest are counted.
        for number, short_id, residue_count in too_short[:3]:
            print(f"⚠️  Sequence {number} ({short_id}): only {residue_count} positions")

        remaining = len(too_short) - 3
        if remaining > 0:
            print(f"⚠️  ... and {remaining} more sequences with similar issues")

        print(f"\n📍 Positions to highlight: {sorted(positions_to_mark)}")
        print(f"🎨 Colors used: {len(set(position_colors.values()))}")
        print("🔗 Open the HTML file in your browser to view the results!")

    except Exception as e:
        # Report briefly, then let the original exception propagate.
        print(f"❌ Error: {e}")
        raise

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

✅ Reading alignment from: /home/hp/nayanika/github/GPX6/analysis/structure/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num
✅ Loaded 3 sequences
✅ Alignment length: 197
✅ Sequence lengths: 195-197 (avg: 196.0)
🔄 Generating HTML content...
✅ Enhanced HTML file generated: /home/hp/nayanika/github/GPX6/analysis/structure/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num_enhanced_colored.html
📊 File size: 14,376 bytes

VERIFICATION SUMMARY:

📍 Positions to highlight: [3, 4, 24, 47, 48, 52, 54, 60, 74, 87, 99, 102, 104, 107, 139, 142, 143, 144, 173, 177, 178, 181]
🎨 Colors used: 18
🔗 Open the HTML file in your browser to view the results!
