In [1]:
from Bio import AlignIO
import os
from pathlib import Path

# Configuration
msa_file = "/home/hp/nayanika/github/GPX6/analysis/alignment/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num"
# Remove position highlighting - keep only 3 sequences for better alignment
max_sequences = 3
output_html = msa_file + "_clean_aligned.html"

# Color palette and positions to highlight
color_palette = [
    "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33",
    "#A65628", "#F781BF", "#1B9E77", "#D95F02", "#7570B3", "#66A61E",
    "#E31A1C", "#1F78B4", "#33A02C", "#FB9A99", "#CAB2D6", "#FDBF6F"
]
positions_to_mark = [3, 4, 48, 52, 47, 99, 54, 177, 144, 178, 74, 143, 139, 87, 142, 102, 104, 107, 24, 60, 181, 173]

def validate_file(filepath):
    """Validate if the MSA file exists and is readable."""
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"MSA file not found: {filepath}")
    return True

def get_position_color_map():
    """Create a mapping of positions to colors."""
    position_colors = {}
    for i, pos in enumerate(positions_to_mark):
        color_idx = i % len(color_palette)
        position_colors[pos] = color_palette[color_idx]
    return position_colors

def format_aligned_sequence(aligned_seq, position_colors):
    """
    Format aligned sequence with proper spacing, gap handling, and position highlighting.
    """
    result = ""
    for i, aa in enumerate(aligned_seq):
        position = i + 1  # 1-based position
        
        if position in position_colors:
            # Highlighted position
            color = position_colors[position]
            if aa != "-":
                result += f'<span class="aa highlighted" style="background-color: {color}; color: white; font-weight: bold;">{aa}</span>'
            else:
                result += f'<span class="gap highlighted" style="background-color: {color}; opacity: 0.7;">{aa}</span>'
        else:
            # Regular position
            if aa != "-":
                result += f"<span class='aa'>{aa}</span>"
            else:
                result += f"<span class='gap'>{aa}</span>"
    return result

def get_sequence_stats(alignment):
    """Get basic statistics about the alignment."""
    stats = {
        'num_sequences': len(alignment),
        'alignment_length': alignment.get_alignment_length(),
        'max_non_gap_length': 0,
        'min_non_gap_length': float('inf'),
        'avg_non_gap_length': 0
    }
    
    non_gap_lengths = []
    for record in alignment:
        non_gap_count = sum(1 for aa in str(record.seq) if aa != "-")
        non_gap_lengths.append(non_gap_count)
        stats['max_non_gap_length'] = max(stats['max_non_gap_length'], non_gap_count)
        stats['min_non_gap_length'] = min(stats['min_non_gap_length'], non_gap_count)
    
    stats['avg_non_gap_length'] = sum(non_gap_lengths) / len(non_gap_lengths) if non_gap_lengths else 0
    return stats

def generate_html_content(alignment, stats, position_colors):
    """Generate the simplified HTML content with only colored sequences."""
    
    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>MSA Sequences</title>
    <style>
        * {{
            box-sizing: border-box;
        }}
        
        body {{ 
            font-family: 'Courier New', monospace; 
            background: white;
            margin: 0;
            padding: 20px; 
            font-size: 14px;
            line-height: 1.6;
        }}
        
        .msa-container {{
            padding: 0;
            overflow-x: auto;
            background: #ffffff;
        }}
        
        .sequence-row {{
            margin: 8px 0;
            white-space: nowrap;
            font-family: 'Courier New', monospace;
            display: flex;
            align-items: center;
        }}
        
        .seq-name {{
            display: inline-block;
            width: 300px;
            font-weight: bold;
            margin-right: 20px;
            padding: 8px 12px;
            background: #f8f9fa;
            color: #495057;
            border-radius: 4px;
            overflow: hidden;
            text-overflow: ellipsis;
            white-space: nowrap;
            font-size: 13px;
            flex-shrink: 0;
        }}
        
        .sequence {{
            font-family: 'Courier New', monospace;
            letter-spacing: 1px;
            font-size: 14px;
            flex-grow: 1;
        }}
        
        .aa {{
            color: #2c3e50;
            font-weight: 600;
            padding: 2px 1px;
            border-radius: 2px;
        }}
        
        .gap {{
            color: #95a5a6;
            opacity: 0.6;
            font-weight: 300;
            padding: 2px 1px;
            border-radius: 2px;
        }}
        
        .highlighted {{
            border-radius: 3px;
            padding: 2px 3px;
            margin: 0 1px;
            box-shadow: 0 1px 2px rgba(0,0,0,0.1);
        }}
        
        /* Responsive design */
        @media (max-width: 768px) {{
            .seq-name {{
                width: 200px;
                font-size: 11px;
            }}
            
            .sequence {{
                font-size: 12px;
                letter-spacing: 0.5px;
            }}
            
            body {{
                font-size: 12px;
                padding: 10px;
            }}
        }}
        
        /* Improved alignment with monospace consistency */
        .msa {{
            font-family: 'Courier New', 'Lucida Console', monospace;
            font-size: 14px;
            line-height: 1.8;
        }}
        
        /* Enhanced visual hierarchy */
        .sequence-row:nth-child(even) {{
            background: rgba(248, 249, 250, 0.5);
        }}
        
        .sequence-row:hover {{
            background: rgba(74, 144, 226, 0.02);
        }}
        
        /* Highlight effects */
        .highlighted:hover {{
            transform: scale(1.05);
            transition: transform 0.1s ease;
        }}
    </style>
</head>
<body>
    <div class="msa-container">
        <div class="msa">"""
    
    # Process only the first few sequences for better alignment
    sequences_to_show = min(len(alignment), max_sequences)
    
    for i in range(sequences_to_show):
        record = alignment[i]
        seq_name = record.id
        if len(seq_name) > 45:
            seq_name = seq_name[:42] + "..."
        
        formatted_seq = format_aligned_sequence(str(record.seq), position_colors)
        html_content += f'            <div class="sequence-row">\n'
        html_content += f'                <span class="seq-name" title="{record.id}">{seq_name}</span>\n'
        html_content += f'                <span class="sequence">{formatted_seq}</span>\n'
        html_content += f'            </div>\n'

    html_content += f"""        </div>
    </div>
</body>
</html>"""
    
    return html_content

def main():
    """Main function to generate the simplified HTML MSA visualization."""
    try:
        # Validate input file
        validate_file(msa_file)
        print(f"✅ Reading alignment from: {msa_file}")
        
        # Read the alignment
        alignment = AlignIO.read(msa_file, "clustal")
        print(f"✅ Loaded {len(alignment)} sequences")
        
        # Get alignment statistics
        stats = get_sequence_stats(alignment)
        print(f"✅ Alignment length: {stats['alignment_length']}")
        print(f"✅ Will show top {max_sequences} sequences")
        
        # Create position color mapping
        position_colors = get_position_color_map()
        print(f"✅ Highlighting {len(positions_to_mark)} positions with colors")
        
        # Generate HTML content
        print("🔄 Generating simplified HTML content...")
        html_content = generate_html_content(alignment, stats, position_colors)
        
        # Write HTML file
        with open(output_html, "w", encoding="utf-8") as f:
            f.write(html_content)
        
        print(f"✅ Simplified HTML file generated: {output_html}")
        print(f"📊 File size: {os.path.getsize(output_html):,} bytes")
        
        print("\n" + "="*60)
        print("SIMPLIFIED ALIGNMENT SUMMARY:")
        print("="*60)
        print(f"📁 Input file: {Path(msa_file).name}")
        print(f"📄 Output file: {Path(output_html).name}")
        print(f"👁️ Sequences displayed: {min(stats['num_sequences'], max_sequences)}")
        print(f"📏 Alignment length: {stats['alignment_length']}")
        print(f"🎨 Highlighted positions: {len(positions_to_mark)}")
        print(f"🔗 Open the HTML file in your browser to view the sequences!")
        
        # Show first few sequence names
        print(f"\n📋 Sequences being displayed:")
        for i in range(min(len(alignment), max_sequences)):
            print(f"  {i+1}. {alignment[i].id}")
        
    except Exception as e:
        print(f"❌ Error: {e}")
        raise

if __name__ == "__main__":
    main()

✅ Reading alignment from: /home/hp/nayanika/github/GPX6/analysis/alignment/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num
✅ Loaded 3 sequences
✅ Alignment length: 197
✅ Will show top 3 sequences
✅ Highlighting 22 positions with colors
🔄 Generating simplified HTML content...
✅ Simplified HTML file generated: /home/hp/nayanika/github/GPX6/analysis/alignment/clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num_clean_aligned.html
📊 File size: 23,642 bytes

SIMPLIFIED ALIGNMENT SUMMARY:
📁 Input file: clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num
📄 Output file: clustalo-I20250513-124120-0721-98866692-p1m.aln-clustal_num_clean_aligned.html
👁️ Sequences displayed: 3
📏 Alignment length: 197
🎨 Highlighted positions: 22
🔗 Open the HTML file in your browser to view the sequences!

📋 Sequences being displayed:
  1. protein_mousecys
  2. protein_ancestor
  3. protein_humansec
