In [2]:
#!/usr/bin/env python3
"""
Extract protein sequences from PDB files and save as individual FASTA files
Configured for your specific paths
"""

import os
from pathlib import Path
from collections import defaultdict

class PDBSequenceExtractor:
    """Extract protein sequences from PDB files.

    Sequences are rebuilt from the coordinate section: one residue is
    emitted per C-alpha atom, keyed by chain, residue number and insertion
    code, and the residues of each chain are ordered by residue number.
    """

    # Standard amino acid three-letter to one-letter code
    AA_CODES = {
        'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
        'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
        'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
        'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
        'MSE': 'M',  # Selenomethionine
        'SEC': 'U',  # Selenocysteine
        'PYL': 'O',  # Pyrrolysine
    }

    # Modified residues that may be stored as HETATM records even though they
    # are part of the polymer chain. Only these are accepted from HETATM
    # lines, so free amino-acid ligands and ions (e.g. calcium, whose atom
    # name is also "CA") are not mistaken for chain residues.
    HETATM_RESIDUES = frozenset({'MSE', 'SEC', 'PYL'})

    def parse_pdb_file(self, pdb_file):
        """
        Parse a single PDB file and extract sequences.

        Args:
            pdb_file: Path to PDB file

        Returns:
            dict: {chain_id: sequence}. A blank chain identifier is reported
            as 'A'. An empty dict is returned (and a message printed) when
            the file cannot be read.
        """
        # chain_id -> list of (res_num, insertion_code, one_letter_code)
        chains = defaultdict(list)
        # chain_id -> set of (res_num, insertion_code) already recorded
        seen_residues = defaultdict(set)

        try:
            with open(pdb_file, 'r') as f:
                for line in f:
                    is_atom = line.startswith('ATOM')
                    if not is_atom and not line.startswith('HETATM'):
                        continue

                    # Only process CA atoms (one per residue)
                    if line[12:16].strip() != 'CA':
                        continue

                    # Skip anything that is not a known amino acid
                    res_name = line[17:20].strip()
                    if res_name not in self.AA_CODES:
                        continue

                    # From HETATM records keep only modified chain residues
                    if not is_atom and res_name not in self.HETATM_RESIDUES:
                        continue

                    # Parse the residue number before recording anything so
                    # a malformed line leaves no trace in seen_residues.
                    try:
                        res_num = int(line[22:26])
                    except ValueError:
                        continue

                    chain_id = line[21:22].strip()
                    # Column 27 is the insertion code: residues 52 and 52A
                    # share a number but are distinct residues.
                    ins_code = line[26:27].strip()

                    # Avoid duplicate residues (alternate locations,
                    # repeated models)
                    residue_key = (res_num, ins_code)
                    if residue_key in seen_residues[chain_id]:
                        continue

                    seen_residues[chain_id].add(residue_key)
                    chains[chain_id].append(
                        (res_num, ins_code, self.AA_CODES[res_name])
                    )

        except FileNotFoundError:
            print(f"Error: File not found - {pdb_file}")
            return {}
        except Exception as e:
            # Best-effort batch tool: report the problem and carry on.
            print(f"Error reading {pdb_file}: {e}")
            return {}

        # Sort by residue number (then insertion code) and join
        result = {}
        for chain_id, residues in chains.items():
            residues.sort(key=lambda r: (r[0], r[1]))
            result[chain_id if chain_id else 'A'] = ''.join(r[2] for r in residues)

        return result

    def write_single_fasta(self, sequence, output_file, header, line_width=60):
        """
        Write a single sequence to FASTA format.

        Args:
            sequence: Protein sequence string
            output_file: Output FASTA file path
            header: FASTA header (without >)
            line_width: Maximum number of sequence characters per line
                (default 60)

        Raises:
            ValueError: If line_width is smaller than 1.
        """
        if line_width < 1:
            raise ValueError("line_width must be at least 1")

        with open(output_file, 'w') as f:
            f.write(f">{header}\n")
            # Write the sequence in fixed-width lines
            for start in range(0, len(sequence), line_width):
                f.write(f"{sequence[start:start + line_width]}\n")


def main(pdb_directory=None, output_directory=None):
    """
    Convert every ``*.pdb`` file in a directory into FASTA files.

    A PDB file with a single chain is written to ``<name>.fasta``; a file
    with several chains produces one ``<name>_chain<ID>.fasta`` per chain.
    Progress and a final summary are printed to stdout.

    Args:
        pdb_directory: Directory containing the PDB files. Defaults to the
            original hard-coded MOUSE mutant directory.
        output_directory: Directory that receives the FASTA files; created
            if missing. Defaults to the original hard-coded alignment
            directory.
    """
    # Fall back to the original locations when no path is supplied
    if pdb_directory is None:
        pdb_directory = "/home/hp/results/MOUSE/mutant_pdbs"
    if output_directory is None:
        output_directory = "/home/hp/nayanika/github/GPX6/analysis/alignment/MOUSE"
    pdb_directory = Path(pdb_directory)
    output_directory = Path(output_directory)

    # Create output directory if it doesn't exist
    output_directory.mkdir(parents=True, exist_ok=True)

    # Initialize extractor
    extractor = PDBSequenceExtractor()

    # Get all PDB files
    pdb_files = sorted(pdb_directory.glob("*.pdb"))

    if not pdb_files:
        print(f"ERROR: No PDB files found in {pdb_directory}")
        print("Please check if the directory exists and contains .pdb files")
        return

    print(f"Found {len(pdb_files)} PDB files")
    print(f"Extracting sequences and saving to {output_directory}")
    print("=" * 70)

    success_count = 0
    failed_files = []

    for pdb_file in pdb_files:
        # Extract sequences
        sequences = extractor.parse_pdb_file(pdb_file)

        if not sequences:
            print(f"❌ No sequences found in {pdb_file.name}")
            failed_files.append(pdb_file.name)
            continue

        # Get base name without extension
        base_name = pdb_file.stem
        single_chain = len(sequences) == 1

        for chain_id, sequence in sequences.items():
            header = f"{base_name}_chain{chain_id}"
            if single_chain:
                # Single chain - the file carries just the base name
                output_file = output_directory / f"{base_name}.fasta"
                extractor.write_single_fasta(sequence, output_file, header)
                print(f"✓ {pdb_file.name:30s} → {output_file.name:35s} ({len(sequence):3d} residues)")
            else:
                # Multiple chains - one file per chain identifier
                output_file = output_directory / f"{header}.fasta"
                extractor.write_single_fasta(sequence, output_file, header)
                print(f"✓ {pdb_file.name:30s} chain {chain_id} → {output_file.name:30s} ({len(sequence):3d} residues)")

        # Counted once per PDB file, regardless of the number of chains
        success_count += 1

    print("=" * 70)
    print("\n📊 Summary:")
    print(f"  Successfully processed: {success_count}/{len(pdb_files)} files")
    if failed_files:
        print(f"  Failed files: {', '.join(failed_files)}")
    print(f"\n💾 Output saved to: {output_directory}")
    if failed_files:
        # Do not claim full success when some inputs yielded no sequence
        print(f"\nDone! FASTA files were created for {success_count} of {len(pdb_files)} PDB files.")
    else:
        print("\nDone! All FASTA files have been created.")


# Run the extraction only when this file is executed directly, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

Found 20 PDB files
Extracting sequences and saving to /home/hp/nayanika/github/GPX6/analysis/alignment/MOUSE
✓ GPX6_level01.pdb               → GPX6_level01.fasta                  (188 residues)
✓ GPX6_level02.pdb               → GPX6_level02.fasta                  (188 residues)
✓ GPX6_level03.pdb               → GPX6_level03.fasta                  (188 residues)


✓ GPX6_level04.pdb               → GPX6_level04.fasta                  (188 residues)
✓ GPX6_level05.pdb               → GPX6_level05.fasta                  (188 residues)
✓ GPX6_level06.pdb               → GPX6_level06.fasta                  (188 residues)
✓ GPX6_level07.pdb               → GPX6_level07.fasta                  (188 residues)
✓ GPX6_level08.pdb               → GPX6_level08.fasta                  (189 residues)
✓ GPX6_level09.pdb               → GPX6_level09.fasta                  (190 residues)
✓ GPX6_level10.pdb               → GPX6_level10.fasta                  (191 residues)
✓ GPX6_level11.pdb               → GPX6_level11.fasta                  (191 residues)
✓ GPX6_level12.pdb               → GPX6_level12.fasta                  (191 residues)
✓ GPX6_level13.pdb               → GPX6_level13.fasta                  (191 residues)
✓ GPX6_level14.pdb               → GPX6_level14.fasta                  (191 residues)
‚úì GPX6_l