# Clean PDBs

In [None]:
# Load the download log and pick out every line containing the text "Failed".
log_path = os.path.join(BASE_DIR, "cifdownload.log")
with open(log_path, "r") as log_file:
    # Iterating a text file yields the same newline-terminated lines as readlines().
    lines = list(log_file)

fails = [entry for entry in lines if "Failed" in entry]
# Bare expression so the notebook displays the list as the cell output.
fails

### Download the failed files manually.

In [None]:
# NOTE: This cell was generated by Gemini 2.5 Pro, sorry.

import os
import gzip
from Bio.PDB import MMCIFParser, PDBIO, Select
from Bio.PDB.PDBExceptions import PDBConstructionWarning
import warnings

# Assume DATA_RAW_DIR, DATA_DIR and pdbs_to_download are defined
# For example:
# DATA_RAW_DIR = "path/to/your/DATA_RAW_DIR"
# DATA_DIR = "path/to/your/DATA_DIR"
# pdbs_to_download = {
# 'CTR1': {'Active': ['3PPZB'], 'Inactive': ['3P86B']},
# 'GRK1': {'Active': ['4L9IB',   '3T8OA',   '3C4WA',   '3C4WB',   '3C4XA',   '3C4XB',   '3C4ZA',   '3C50A'],
# 'Inactive': ['3C4YA',   '4L9IA',   '4WBOA',   '4WBOB',   '4WBOC',   '4WBOD',   '7MT9G',   '7MTAG']}
# }

# Custom Select class to filter for a specific chain
class ChainSelect(Select):
    """Bio.PDB ``Select`` filter that keeps a single chain and all of its contents.

    Chain IDs are compared case-insensitively: the requested ID and each
    candidate chain's ID are both upper-cased before being compared.
    """

    def __init__(self, chain_id_to_select):
        # Normalise once here so accept_chain only has to upper-case the candidate.
        self.chain_id_to_select = chain_id_to_select.upper()

    def accept_model(self, model):
        # No filtering at the model level.
        return 1

    def accept_chain(self, chain):
        # 1 keeps the chain, 0 drops it.
        is_target = chain.get_id().upper() == self.chain_id_to_select
        return 1 if is_target else 0

    def accept_residue(self, residue):
        # Every residue of an accepted chain is kept.
        return 1

    def accept_atom(self, atom):
        # Every atom of an accepted residue is kept.
        return 1

# Silence Biopython's PDBConstructionWarning from here on. The original author
# treated these as non-critical for this task (typically minor discrepancies
# in the structure files).
warnings.simplefilter("ignore", category=PDBConstructionWarning)

# Extract one chain per entry of `pdbs_to_download` and write it as a PDB file.
#
# Input layout:  {protein: {label: [entry, ...]}}, where each entry is a PDB code
#                followed by a one-character chain ID (e.g. "3PPZB" -> "3PPZ", "B").
# Source files:  DATA_RAW_DIR/<pdb_code>.cif.gz
# Output files:  DATA_DIR/<protein>/<label>/<pdb_code>_chain<chain>.pdb
#
# Entries that cannot be processed are reported and skipped; they never abort the run.

# The parser is loop-invariant, so build it once and reuse it for every file.
parser = MMCIFParser()  # QUIET=True can be added but warnings.simplefilter is more general

for protein, inner_d in pdbs_to_download.items():
    print(f"Processing {protein}...")

    prot_dir = os.path.join(DATA_DIR, protein)
    os.makedirs(prot_dir, exist_ok=True)

    for label, pdb_id_list in inner_d.items():
        print(f"\t{label} has {len(pdb_id_list)} PDBs")

        output_dir = os.path.join(prot_dir, label)
        os.makedirs(output_dir, exist_ok=True)

        for pdb_id_entry in pdb_id_list:  # e.g., "3PPZB"
            print(f"\t\tProcessing {pdb_id_entry}...")

            # An entry needs at least one character of PDB code plus the chain ID;
            # anything shorter would fail on indexing or give an empty PDB code.
            if len(pdb_id_entry) < 2:
                print(f"\t\t\tWARNING: Entry {pdb_id_entry!r} is too short to hold a PDB code and a chain ID. Skipping.")
                continue

            # Split into PDB code (e.g., "3PPZ") and target chain ID (e.g., "B")
            target_chain_id = pdb_id_entry[-1]
            pdb_code = pdb_id_entry[:-1]

            # Gzipped CIF source, e.g. "3PPZ.cif.gz" for "3PPZB"
            gz_cif_file = os.path.join(DATA_RAW_DIR, f"{pdb_code}.cif.gz")

            # Output PDB file, e.g. "3PPZ_chainB.pdb"
            output_pdb_filename = f"{pdb_code}_chain{target_chain_id}.pdb"
            output_pdb_file_path = os.path.join(output_dir, output_pdb_filename)

            if not os.path.exists(gz_cif_file):
                print(f"\t\t\tWARNING: Source file {gz_cif_file} not found. Skipping {pdb_id_entry}.")
                continue

            try:
                # 'rt' mode: CIF is a text format, so decompress straight to text.
                with gzip.open(gz_cif_file, 'rt', encoding='utf-8') as cif_file_handle:
                    # The first argument is an arbitrary structure ID; the PDB code is used.
                    structure = parser.get_structure(pdb_code, cif_file_handle)

                chain_selector = ChainSelect(target_chain_id)

                # Without this check a missing chain would still produce an output
                # file with no atoms and a misleading success message.
                if not any(chain_selector.accept_chain(chain) for chain in structure.get_chains()):
                    print(f"\t\t\tWARNING: Chain {target_chain_id} not found in {pdb_code}. Skipping {pdb_id_entry}.")
                    continue

                # Write only the selected chain of the parsed structure.
                pdb_io = PDBIO()
                pdb_io.set_structure(structure)
                pdb_io.save(output_pdb_file_path, chain_selector)

                print(f"\t\t\tSuccessfully extracted chain {target_chain_id} from {pdb_code} to {output_pdb_file_path}")

            except FileNotFoundError:  # Should be caught by os.path.exists, but as a safeguard
                print(f"\t\t\tERROR: File {gz_cif_file} might have been removed or unzipping failed for {pdb_id_entry}. Skipping.")
            except Exception as e:
                # Deliberately broad: one bad entry is reported and skipped so the
                # rest of the batch still runs. Re-raise here when debugging.
                print(f"\t\t\tERROR processing {pdb_id_entry} (from file {gz_cif_file}): {e}")

print("\nAll PDB processing finished.")