In [1]:
import os

def extract_sequence_from_atom(pdb_file):
    """Collect the residue sequence of every chain from the ATOM records of a PDB file.

    Each residue is reported once per chain, in order of first appearance.
    A residue is identified by its name, its sequence number and its
    insertion code (fixed column 27), so inserted residues such as 52 and
    52A are kept apart even when they share a residue name.

    Args:
        pdb_file: Path of the PDB-formatted text file to read.

    Returns:
        dict mapping chain ID (str, possibly empty when the chain column is
        blank) to a list of three-letter residue names. An empty dict is
        returned when the file cannot be read or holds no usable ATOM records.
        Read errors are reported with ``print`` rather than raised.
    """
    try:
        with open(pdb_file, 'r') as file:
            lines = file.readlines()
    except FileNotFoundError:
        print(f"File not found: {pdb_file}")
        return {}
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return {}

    sequence_dict = {}
    seen_residues = {}
    for line in lines:
        if not line.startswith("ATOM"):
            continue

        record = line.rstrip("\r\n")
        # A record that ends before the residue-number field (columns 23-26)
        # cannot be interpreted; skip it instead of raising IndexError.
        if len(record) < 26:
            continue

        chain_id = record[21].strip()
        residue_name = record[17:20].strip()
        number_field = record[22:26].strip()
        try:
            residue_number = int(number_field)
        except ValueError:
            # Not a plain integer: keep the raw text so the residue is still
            # distinguished from its neighbours instead of aborting the parse.
            residue_number = number_field
        # Column 27 holds the insertion code; it is blank for most residues
        # and may be missing entirely on short lines (slice yields "").
        insertion_code = record[26:27].strip()

        if chain_id not in sequence_dict:
            sequence_dict[chain_id] = []
            seen_residues[chain_id] = set()

        residue_key = (residue_name, residue_number, insertion_code)
        if residue_key not in seen_residues[chain_id]:
            sequence_dict[chain_id].append(residue_name)
            seen_residues[chain_id].add(residue_key)

    return sequence_dict

def main(pdb_file_path="original_humancys.pdb"):
    """Print the per-chain residue sequence found in a PDB file.

    Args:
        pdb_file_path: Path of the PDB file to read. Defaults to
            "original_humancys.pdb", so calling ``main()`` with no
            arguments reads that file.

    Returns:
        None. All results and diagnostics are written to stdout.
    """
    # Bail out early when the path does not point at a regular file.
    if not os.path.isfile(pdb_file_path):
        print(f"Invalid file path: {pdb_file_path}")
        return

    print(f"Reading file: {pdb_file_path}")
    sequence = extract_sequence_from_atom(pdb_file_path)

    # An empty dict means either a read failure or no ATOM records.
    if not sequence:
        print("No ATOM records found or file could not be read.")
        return

    for chain, residues in sequence.items():
        print(f"Chain {chain}: {' '.join(residues)}")

# Entry-point guard: call main() only when this code is executed directly
# (as a script or as a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Reading file: original_humancys.pdb
Chain X: PRO GLN ASN ARG LYS VAL ASP CYX ASN LYS GLY VAL THR GLY THR ILE TYR GLU TYR GLY ALA LEU THR LEU ASN GLY GLU GLU TYR ILE GLN PHE LYS GLN PHE ALA GLY LYS HID VAL LEU PHE VAL ASN VAL ALA ALA TYR CYS GLY LEU ALA ALA GLN TYR PRO GLU LEU ASN ALA LEU GLN GLU GLU LEU LYS ASN PHE GLY VAL ILE VAL LEU ALA PHE PRO CYS ASN GLN PHE GLY LYS GLN GLU PRO GLY THR ASN SER GLU ILE LEU LEU GLY LEU LYS TYR VAL CYS PRO GLY SER GLY PHE VAL PRO SER PHE GLN LEU PHE GLU LYS GLY ASP VAL ASN GLY GLU LYS GLU GLN LYS VAL PHE THR PHE LEU LYS ASN SER CYX PRO PRO THR SER ASP LEU LEU GLY SER SER SER GLN LEU PHE TRP GLU PRO MET LYS VAL HID ASP ILE ARG TRP ASN PHE GLU LYS PHE LEU VAL GLY PRO ASP GLY VAL PRO VAL MET HID TRP PHE HID GLN ALA PRO VAL SER THR VAL LYS SER ASP ILE LEU GLU TYR LEU LYS GLN PHE ASN THR HIE
