In [1]:
#!pip install biopython
#        OR
#!pip3 install biopython

In [2]:
import os
from Bio import SeqIO, AlignIO
from Bio.Align import MultipleSeqAlignment
import queue

def parse_pdb(pdb_file):
    """Collect the HELIX and SHEET records of a PDB-format text file.

    Lines beginning with ``HELIX`` or ``SHEET`` are sliced at fixed character
    offsets to obtain the first residue number, the last residue number and
    the chain identifier. All other lines are ignored.

    Args:
        pdb_file: Path of the file to read.

    Returns:
        A ``(helix_records, sheet_records)`` tuple of lists. Each entry is a
        dict with integer ``'beg_label_seq_id'`` / ``'end_label_seq_id'`` and
        a string ``'label_asym_id'``.

    Raises:
        ValueError: If a residue-number field of a matching line is not an
            integer (for example because the line is too short).
    """
    helix_records, sheet_records = [], []

    with open(pdb_file, 'r') as handle:
        for entry in handle:
            # Select the destination list and the column layout for this record type.
            if entry.startswith("HELIX"):
                bucket, beg_cols, end_cols, chain_col = helix_records, slice(21, 25), slice(33, 37), 19
            elif entry.startswith("SHEET"):
                bucket, beg_cols, end_cols, chain_col = sheet_records, slice(22, 26), slice(33, 37), 21
            else:
                continue

            bucket.append({
                'beg_label_seq_id': int(entry[beg_cols].strip()),
                'end_label_seq_id': int(entry[end_cols].strip()),
                'label_asym_id': entry[chain_col].strip(),
            })

    return helix_records, sheet_records

def map_secondary_structure_to_alignment(alignment, secondary_structure_dict):
    """Build one list of background colours per aligned sequence.

    For every record, the text after the last ':' of ``record.id`` is looked
    up in ``secondary_structure_dict``. Non-gap characters are numbered from 1
    and receive the helix colour, the sheet colour (helix takes precedence) or
    white. Gap characters ('-') are first emitted as "-" and afterwards
    replaced by the neighbouring colour when the entries directly before and
    after the gap run are identical.

    Args:
        alignment: Iterable of records exposing ``id`` and ``seq``.
        secondary_structure_dict: Mapping of key -> ``(helix_records,
            sheet_records)``; each record is a dict with
            ``'beg_label_seq_id'`` and ``'end_label_seq_id'``.

    Returns:
        A list with one list per record, each as long as that record's
        sequence. Records whose key is missing from the mapping yield a list
        consisting only of "-".
    """
    helix_color = "#f4b899"
    sheet_color = "#87cfea"
    plain_color = "#ffffff"

    def covers(position, spans):
        # Bounds are converted lazily, span by span, stopping at the first hit.
        bounds = (
            (int(span['beg_label_seq_id']), int(span['end_label_seq_id']))
            for span in spans
        )
        return any(low <= position <= high for low, high in bounds)

    def fill_gaps(track):
        # Recolour each run of "-" whose left and right neighbours carry the same value.
        pos, size = 0, len(track)
        while pos < size:
            if track[pos] != '-':
                pos += 1
                continue
            gap_start = pos
            while pos < size and track[pos] == '-':
                pos += 1
            left = track[gap_start - 1] if gap_start > 0 else None
            right = track[pos] if pos < size else None
            if left is not None and left == right:
                track[gap_start:pos] = [left] * (pos - gap_start)

    tracks = []
    for record in alignment:
        key = record.id.split(':')[-1]  # portion of the id after the last ':'
        if key not in secondary_structure_dict:
            tracks.append(["-"] * len(record.seq))
            continue

        helices, sheets = secondary_structure_dict[key]
        track = []
        ordinal = 0  # 1-based count of non-gap characters seen so far
        for symbol in record.seq:
            if symbol == '-':
                track.append("-")
                continue
            ordinal += 1
            if covers(ordinal, helices):
                track.append(helix_color)
            elif covers(ordinal, sheets):
                track.append(sheet_color)
            else:
                track.append(plain_color)
        tracks.append(track)

    for track in tracks:
        fill_gaps(track)

    return tracks

def aln_to_html(aln_file, html_file, ss_annotations):
    """Render a Clustal alignment file as an HTML page with coloured residues.

    The file is read twice: once as raw text lines (to preserve its layout)
    and once through ``AlignIO.read`` to obtain the sequence names. Lines
    whose first whitespace-delimited token is exactly a sequence name are
    treated as sequence lines; every non-space character of their alignment
    column is wrapped in a ``<span>`` whose background colour is the next
    unused entry of that sequence's annotation list ('white' once the list is
    exhausted). All other lines are copied, with runs of '*', ':' and '.'
    wrapped in ``star`` / ``colon`` / ``dot`` spans.

    Args:
        aln_file: Path of the Clustal alignment to render.
        html_file: Path of the HTML file to write (overwritten).
        ss_annotations: One list of CSS colour values per sequence, in the
            order the sequences are returned by ``AlignIO.read``.

    Returns:
        None. The HTML document is written to ``html_file``.

    Raises:
        IndexError: If ``ss_annotations`` has fewer entries than the
            alignment has sequences.
    """
    with open(aln_file) as aln_handle:
        aln_lines = aln_handle.readlines()

    alignment = AlignIO.read(aln_file, "clustal")
    sequence_names = [record.id for record in alignment]

    # One iterator per sequence name: each sequence line consumes the next
    # colours of its own sequence, so blocks are coloured in file order.
    annotation_iters = {}
    for index, name in enumerate(sequence_names):
        annotation_iters[name] = iter(ss_annotations[index])

    html_content = """
    <html>
    <head>
        <title>Alignment</title>
        <style>
            body { font-family: monospace; white-space: pre; }
            .star { color: red; }
            .colon { color: blue; }
            .dot { color: green; }
        </style>
    </head>
    <body>
    """

    css_class_by_char = {'*': 'star', ':': 'colon', '.': 'dot'}

    def format_group(char, group):
        # Wrap a run of identical conservation symbols in its CSS class.
        text = ''.join(group)
        css_class = css_class_by_char.get(char)
        if css_class is None:
            return text
        return "<span class='" + css_class + "'>" + text + "</span>"

    body_parts = []
    for line in aln_lines:
        # Match on the whole first token, not on a prefix: a prefix test
        # confuses "seq1" with "seq10" and lets a short name capture the
        # "CLUSTAL ..." header line.
        tokens = line.split(maxsplit=1)
        first_token = tokens[0] if tokens and not line[0].isspace() else None

        if first_token is not None and first_token in annotation_iters:
            line_sequence_name = first_token
            name_part = line[:len(first_token)]
            rest_part = line[len(first_token):]

            # Separate the optional trailing residue count from the alignment
            # column. The leading padding is kept in both cases so that the
            # residues stay aligned with the conservation line.
            body = rest_part.rstrip()
            pieces = body.rsplit(maxsplit=1)
            if len(pieces) == 2 and pieces[1].isdigit():
                alignment_chars, char_count_part = pieces
            else:
                alignment_chars, char_count_part = body, ""

            colours = annotation_iters[line_sequence_name]
            formatted_line = []
            for char in alignment_chars:
                if char != ' ':
                    bg_color = next(colours, 'white')
                    formatted_line.append(f"<span style='background-color: {bg_color}'>{char}</span>")
                else:
                    formatted_line.append(char)

            # The residue count is appended without a background colour.
            formatted_line.append(f" {char_count_part}")

            body_parts.append(
                f"<div id='{line_sequence_name}'>{name_part}{''.join(formatted_line)}</div>\n"
            )
        else:
            # Header, blank and conservation lines: group runs of identical
            # characters and colour the conservation symbols.
            formatted_line = []
            current_char = line[0] if line else ""
            current_group = [current_char]
            for char in line[1:]:
                if char == current_char:
                    current_group.append(char)
                else:
                    formatted_line.append(format_group(current_char, current_group))
                    current_char = char
                    current_group = [char]
            formatted_line.append(format_group(current_char, current_group))

            body_parts.append(f"<div>{''.join(formatted_line)}</div>\n")

    html_content += ''.join(body_parts)
    html_content += """
    </body>
    </html>
    """

    with open(html_file, "w") as output_handle:
        output_handle.write(html_content)

### Replace the paths below with your PDB folder and your alignment file

In [3]:
# Directory that is scanned for *.pdb files; replace the placeholder with a real path
pdb_files_dir = "Replace_Me_With_PDB Folder"

# Path of the multiple sequence alignment; it is parsed as Clustal format (e.g. a .aln file)
alignment_file = "Replace_Me_With_ALN_File"

In [5]:
# List all PDB files in the directory
pdb_files = [os.path.join(pdb_files_dir, f) for f in os.listdir(pdb_files_dir) if f.endswith('.pdb')]

# Initialize a dictionary to store secondary structure annotations
secondary_structure_dict = {}

# Loop through each PDB file and parse the secondary structure information
for pdb_file in pdb_files:
    helix_records, sheet_records = parse_pdb(pdb_file)
    # Use the PDB file name without its extension as the key. splitext only
    # removes the final ".pdb", so stems that contain dots are kept intact.
    pdb_name = os.path.splitext(os.path.basename(pdb_file))[0]
    secondary_structure_dict[pdb_name] = (helix_records, sheet_records)

# Read the alignment
alignment = MultipleSeqAlignment(SeqIO.parse(alignment_file, "clustal"))

# Extract PDB file names from the alignment
pdb_file_names = [seq.id for seq in alignment]

# Verify that each sequence has a corresponding entry in the secondary structure dictionary.
# The lookup key is the part of the id after the last ':', which is the same key that
# map_secondary_structure_to_alignment uses, so the warning matches what is actually coloured.
missing_files = [name for name in pdb_file_names if name.split(':')[-1] not in secondary_structure_dict]
if missing_files:
    print(f"Warning: The following PDB files are missing or could not be parsed: {missing_files}")

# Map secondary structure annotations to the alignment
ss_annotations = map_secondary_structure_to_alignment(alignment, secondary_structure_dict)

# Generate the HTML representation.
# Prefer the pre-annotated alignment copy when it exists; otherwise fall back to the
# configured alignment file instead of failing on a path the user never set.
html_source_file = "alignment_with_staranddot_and_charactercount.aln"
if not os.path.exists(html_source_file):
    html_source_file = alignment_file

output_file = "OutputALNformat.html"
aln_to_html(html_source_file, output_file, ss_annotations)