In [1]:
def parse_clustal_alignment(alignment_text, *, name_prefix='protein_'):
    """Parse interleaved CLUSTAL alignment text into full-length sequences.

    Each sequence row has the form ``<name> <fragment> [<count>]``. A name
    occurs once per alignment block, so its fragments are concatenated in
    order of appearance to rebuild the complete aligned sequence.

    Args:
        alignment_text: The alignment as a single string.
        name_prefix: Keep only rows whose name starts with this prefix.
            Defaults to ``'protein_'``. Pass ``''`` to accept every name;
            in that case only lines starting with ``CLUSTAL`` or a space
            are filtered out, so other header lines would be read as rows.

    Returns:
        Dict mapping sequence name to its concatenated aligned sequence
        (gap characters included), in order of first appearance. Empty
        when no matching rows are found.
    """
    fragments = {}
    for line in alignment_text.strip().split('\n'):
        # Skip the header row and any line beginning with a space
        # (e.g. the conservation row, whose name column is blank).
        if line.startswith(('CLUSTAL', ' ')):
            continue
        parts = line.split()
        # A usable row needs a name and a fragment; blank lines yield no
        # parts, and a trailing residue count (parts[2]) is ignored.
        if len(parts) < 2 or not parts[0].startswith(name_prefix):
            continue
        fragments.setdefault(parts[0], []).append(parts[1])
    # Join once per sequence instead of growing a string block by block.
    return {name: ''.join(pieces) for name, pieces in fragments.items()}

# Alignment symbols excluded from identity statistics: the gap character and
# the residue codes this analysis treats as unknown. The same set applies to
# both sequences so that comparisons are symmetric.
_SKIPPED_SYMBOLS = frozenset('-XU')

def _count_matches(seq1, seq2):
    """Return ``(matches, compared)`` over columns where neither side is skipped."""
    if len(seq1) != len(seq2):
        raise ValueError("Sequences must be aligned (same length)")

    matches = 0
    compared = 0
    for a, b in zip(seq1, seq2):
        if a in _SKIPPED_SYMBOLS or b in _SKIPPED_SYMBOLS:
            continue
        compared += 1
        if a == b:
            matches += 1
    return matches, compared

def _percent_identity(matches, compared):
    """Convert match counts to a percentage; 0.0 when nothing was comparable."""
    if compared == 0:
        return 0.0
    return (matches / compared) * 100

def calculate_identity(seq1, seq2):
    """Calculate percentage identity between two aligned sequences.

    Columns where either sequence holds a gap (``-``) or an unknown residue
    (``X`` or ``U``) are ignored, so the result does not depend on argument
    order.

    Args:
        seq1: First aligned sequence.
        seq2: Second aligned sequence; must have the same length as ``seq1``.

    Returns:
        Identity as a float in the range 0-100. Returns 0.0 when no column
        is comparable (including two empty sequences).

    Raises:
        ValueError: If the sequences differ in length.
    """
    return _percent_identity(*_count_matches(seq1, seq2))

def find_closest_to_ancestor(sequences):
    """Determine whether the mouse or the human sequence is closer to the ancestor.

    Args:
        sequences: Dict of aligned sequences containing the keys
            ``'protein_ancestor'``, ``'protein_mousecys'`` and
            ``'protein_humansec'``.

    Returns:
        Dict with ``'mouse_identity'`` and ``'human_identity'`` (percent
        identity to the ancestor), ``'mouse_differences'`` and
        ``'human_differences'`` (mismatching comparable columns), and
        ``'closer_sequence'``: ``'human'``, ``'mouse'``, or ``'tie'`` when
        both identities are equal.

    Raises:
        ValueError: If any of the three sequences is missing or empty, or
            if a sequence is not the same length as the ancestor.
    """
    ancestor = sequences.get('protein_ancestor', '')
    if not ancestor:
        raise ValueError("Ancestor sequence not found")

    stats = {}
    for label, key in (('mouse', 'protein_mousecys'), ('human', 'protein_humansec')):
        sequence = sequences.get(key, '')
        # Fail with a specific message rather than letting the length check
        # report a missing sequence as a misaligned one.
        if not sequence:
            raise ValueError(f"{label.capitalize()} sequence not found")
        matches, compared = _count_matches(ancestor, sequence)
        # Every compared column is either a match or a difference.
        stats[label] = (_percent_identity(matches, compared), compared - matches)

    mouse_identity, mouse_diff = stats['mouse']
    human_identity, human_diff = stats['human']

    if human_identity > mouse_identity:
        closer = 'human'
    elif mouse_identity > human_identity:
        closer = 'mouse'
    else:
        # Equal identities favor neither sequence.
        closer = 'tie'

    return {
        'mouse_identity': mouse_identity,
        'human_identity': human_identity,
        'mouse_differences': mouse_diff,
        'human_differences': human_diff,
        'closer_sequence': closer,
    }

# Interleaved alignment of the mouse, ancestral and human protein sequences
alignment = """
protein_mousecys    PQKSKVDXNKGVTGTVYEYGANTIDGGEFVNFQQYAGKXILFVNVASFCGLTATYPELNT	60
protein_ancestor    SQKMKMDCYKGVTGTIYEYGALTLNGEEYIQFKQYAGKHVLFINVATY-GLTAQYPELNA	59
protein_humansec    PQNRKVDXNKGVTGTIYEYGALTLNGEEYIQFKQFAGKXVLFVNVAAYUGLAAQYPELNA	60

protein_mousecys    LQEELKPFNVTVLGFPCNQFGKQEPGKNSEILLGLKYVRPGGGYVPNFQLFEKGDVNGDN	120
protein_ancestor    LQEELKPFGVVLLGFPCNQFGKQEPGKNSEILSGLKYVRPGGGFVPNFQLFEKGDVNGEK	119
protein_humansec    LQEELKNFGVIVLAFPCNQFGKQEPGTNSEILLGLKYVCPGSGFVPSFQLFEKGDVNGEK	120

protein_mousecys    EQKVFSFLKNSXPPTSELFGSPEXLFWDPMKVXDIRWNFEKFLVGPDGVPVMRWFXXTPV	180
protein_ancestor    EQKVFTFLKNSCPPTSDLLGSPKQLFWEPMKVHDIRWNFEKFLVGPDGVPVMRWFHRASV	179
protein_humansec    EQKVFTFLKNSXPPTSDLLGSSSQLFWEPMKVXDIRWNFEKFLVGPDGVPVMXWFXQAPV	180

protein_mousecys    RIVQSDIMEYLNQTS--	195
protein_ancestor    STVKSDILEYLKQFTPE	196
protein_humansec    STVKSDILEYLKQFNTX	197
"""

# Parse the alignment, then compare both descendants against the ancestor
sequences = parse_clustal_alignment(alignment)
results = find_closest_to_ancestor(sequences)

# Emit the whole report in one write; the leading "\n" on two of the entries
# produces the blank separator lines between the report sections.
print("\n".join((
    f"Mouse sequence identity to ancestor: {results['mouse_identity']:.2f}%",
    f"Human sequence identity to ancestor: {results['human_identity']:.2f}%",
    f"\nMouse differences from ancestor: {results['mouse_differences']}",
    f"Human differences from ancestor: {results['human_differences']}",
    f"\nCloser sequence to ancestor: {results['closer_sequence'].upper()}",
)))

Mouse sequence identity to ancestor: 78.61%
Human sequence identity to ancestor: 87.30%

Mouse differences from ancestor: 40
Human differences from ancestor: 24

Closer sequence to ancestor: HUMAN
