In [2]:
!pip install biopython joblib

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.2/3.2 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [3]:
# ==============================================================================
# DATA PREP: ROBUST LOCAL PDB PROCESSING (Fixes Nested Folders)
# ==============================================================================
import os
import numpy as np
from Bio.PDB import PDBParser
from scipy.spatial.distance import pdist, squareform
from joblib import Parallel, delayed
import warnings

# --- CONFIGURATION ---
# We point to the ROOT of the dataset. The script will find the files inside.
# This path matches the standard mount point for the dataset in your screenshot.
DATASET_ROOT = "/kaggle/input/rcsb-pdb-human-macromolecular-structure-data"
OUTPUT_FILE = "protein_dataset_human_128x128.npz"

IMG_SIZE = 128
MIN_LEN = 40
MAX_LEN = 128

warnings.filterwarnings('ignore')

# --- 1. FILE FINDER (The Fix) ---
def find_all_pdb_files(root_dir):
    pdb_files = []
    print(f"üìÇ Scanning {root_dir} for .pdb files...")
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith(".pdb") or file.endswith(".ent"):
                full_path = os.path.join(root, file)
                pdb_files.append(full_path)
    return pdb_files

# --- 2. PROCESSING FUNCTION ---
def process_local_pdb(file_path, img_size):
    parser = PDBParser(QUIET=True)
    try:
        # Extract ID from filename (e.g., "1a2b.pdb")
        pdb_id = os.path.basename(file_path).split('.')[0]
        
        structure = parser.get_structure(pdb_id, file_path)
        ca_coords = []
        
        # Extract CA atoms
        for model in structure:
            for chain in model:
                for residue in chain:
                    if 'CA' in residue:
                        ca_coords.append(residue['CA'].get_coord())
            break # First model only
        
        ca_coords = np.array(ca_coords)
        seq_len = len(ca_coords)
        
        # Filter size
        if seq_len < MIN_LEN or seq_len > img_size:
            return None 

        # Compute Distogram (Continuous Angstroms)
        dist_matrix = squareform(pdist(ca_coords))
        
        # Pad
        final_map = np.zeros((img_size, img_size), dtype=np.float32)
        final_map[:seq_len, :seq_len] = dist_matrix
        
        # Mask
        mask = np.zeros((img_size, img_size), dtype=np.float32)
        mask[:seq_len, :seq_len] = 1.0
        
        return (final_map, mask, pdb_id)

    except Exception:
        return None

# --- 3. EXECUTION ---
if __name__ == "__main__":
    # A. Find Files
    all_files = find_all_pdb_files(DATASET_ROOT)
    print(f"‚úÖ Found {len(all_files)} PDB files in total.")
    
    if len(all_files) == 0:
        print("‚ùå CRITICAL ERROR: No PDB files found. Check if dataset is added via 'Add Data'.")
    else:
        # B. Process in Parallel
        print(f"üöÄ Processing {len(all_files)} structures...")
        results = Parallel(n_jobs=-1, verbose=1)(
            delayed(process_local_pdb)(f, IMG_SIZE) for f in all_files
        )

        # C. Save
        valid_results = [r for r in results if r is not None]
        print(f"üéâ Successfully processed {len(valid_results)} valid proteins.")

        if len(valid_results) > 0:
            maps = np.array([r[0] for r in valid_results])
            masks = np.array([r[1] for r in valid_results])
            ids = np.array([r[2] for r in valid_results])
            
            np.savez_compressed(OUTPUT_FILE, contact_maps=maps, masks=masks, pdb_ids=ids)
            print(f"üíæ Dataset saved to: {OUTPUT_FILE}")
            print("üëâ Next Step: Create a Kaggle Dataset from this file and use it in Phase 1.")

üìÇ Scanning /kaggle/input/rcsb-pdb-human-macromolecular-structure-data for .pdb files...
‚úÖ Found 0 PDB files in total.
‚ùå CRITICAL ERROR: No PDB files found. Check if dataset is added via 'Add Data'.


In [6]:
# ==============================================================================
# DATA PREP: HYBRID APPROACH (Use CSV for IDs -> Download PDBs)
# ==============================================================================
!pip install biopython joblib

import os
import pandas as pd
import numpy as np
from Bio.PDB import PDBList, PDBParser
from scipy.spatial.distance import pdist, squareform
from joblib import Parallel, delayed
import warnings

# --- CONFIGURATION ---
# Path to the CSV file in the dataset you added
CSV_PATH = "/kaggle/input/datasets/samiraalipour/rcsb-pdb-macromolecular-structure-dataset/RCSB_PDB_Macromolecular_Structure_Dataset.csv"
OUTPUT_FILE = "protein_dataset_human_128x128.npz"
TEMP_DOWNLOAD_DIR = "./pdb_downloads" # Temporary folder for downloads

IMG_SIZE = 128
MIN_LEN = 40
MAX_LEN = 128

os.makedirs(TEMP_DOWNLOAD_DIR, exist_ok=True)
warnings.filterwarnings('ignore')

# --- 1. GET VALID IDs FROM CSV ---
def get_clean_ids_from_csv(csv_path):
    print(f"üìÑ Reading metadata from {csv_path}...")
    try:
        df = pd.read_csv(csv_path)
        # The column is usually 'structureId' or 'pdb_id'
        if 'structureId' in df.columns:
            ids = df['structureId'].unique().tolist()
        elif 'pdb_id' in df.columns:
            ids = df['pdb_id'].unique().tolist()
        else:
            # Fallback: look for any 4-letter column
            print("‚ö†Ô∏è 'structureId' column not found. Searching...")
            ids = df.iloc[:, 0].unique().tolist() # Assume first col is ID
            
        print(f"‚úÖ Found {len(ids)} high-quality human proteins in CSV.")
        return ids
    except Exception as e:
        print(f"‚ùå Error reading CSV: {e}")
        return []

# --- 2. DOWNLOAD & PROCESS WORKER ---
def download_and_process(pdb_id, save_dir, img_size):
    pdbl = PDBList(verbose=False)
    parser = PDBParser(QUIET=True)
    
    try:
        # A. Download
        f_path = pdbl.retrieve_pdb_file(pdb_id, pdir=save_dir, file_format="pdb")
        if not os.path.exists(f_path): return None

        # B. Parse
        structure = parser.get_structure(pdb_id, f_path)
        ca_coords = []
        for model in structure:
            for chain in model:
                for residue in chain:
                    if 'CA' in residue:
                        ca_coords.append(residue['CA'].get_coord())
            break # First model only
        
        ca_coords = np.array(ca_coords)
        seq_len = len(ca_coords)
        
        # C. SMART CROPPING (The Fix)
        # If too small, discard.
        if seq_len < MIN_LEN:
            if os.path.exists(f_path): os.remove(f_path)
            return None
            
        # If too big, cut it into 128-sized chunks!
        # This turns 1 big protein into MULTIPLE training samples.
        samples = []
        
        # Stride = 64 (50% overlap) to get more data
        stride = 64 
        
        num_crops = 0
        if seq_len > img_size:
            for start in range(0, seq_len - img_size + 1, stride):
                crop = ca_coords[start : start + img_size]
                samples.append(crop)
                num_crops += 1
                # Limit to 5 crops per protein to prevent bias
                if num_crops >= 5: break 
        else:
            # If it fits perfectly or is smaller than 128 but > 40
            samples.append(ca_coords)

        # D. Process All Crops
        processed_data = []
        for coords in samples:
            # Distance Matrix
            dist_matrix = squareform(pdist(coords))
            curr_len = len(coords)
            
            # Pad
            final_map = np.zeros((img_size, img_size), dtype=np.float32)
            final_map[:curr_len, :curr_len] = dist_matrix
            
            # Mask
            mask = np.zeros((img_size, img_size), dtype=np.float32)
            mask[:curr_len, :curr_len] = 1.0
            
            processed_data.append((final_map, mask))

        # Clean up file
        if os.path.exists(f_path): os.remove(f_path)
        
        # Return list of samples (not just one)
        return (processed_data, pdb_id)

    except Exception:
        return None
        
# --- 3. EXECUTION ---
if __name__ == "__main__":
    pdb_ids = get_clean_ids_from_csv(CSV_PATH)
    
    if len(pdb_ids) > 0:
        print(f"üöÄ Starting processing of {len(pdb_ids)} proteins with SLIDING WINDOW...")
        
        results = Parallel(n_jobs=-1, verbose=1)(
            delayed(download_and_process)(pid, TEMP_DOWNLOAD_DIR, IMG_SIZE) for pid in pdb_ids
        )

        # Flatten the results (since one protein can now give multiple samples)
        all_maps = []
        all_masks = []
        all_ids = []
        
        for r in results:
            if r is not None:
                samples, pid = r
                for (m, mask) in samples:
                    all_maps.append(m)
                    all_masks.append(mask)
                    all_ids.append(pid)
        
        print(f"üéâ Successfully extracted {len(all_maps)} training samples!")

        if len(all_maps) > 0:
            # Convert to arrays
            # Caution: large lists can be slow to convert. Do it carefully.
            maps_arr = np.array(all_maps, dtype=np.float32)
            masks_arr = np.array(all_masks, dtype=np.float32)
            ids_arr = np.array(all_ids)
            
            np.savez_compressed(OUTPUT_FILE, contact_maps=maps_arr, masks=masks_arr, pdb_ids=ids_arr)
            print(f"üíæ Dataset saved to: {OUTPUT_FILE}")
        else:
            print("‚ùå Still no data. Check internet connection?")

üìÑ Reading metadata from /kaggle/input/datasets/samiraalipour/rcsb-pdb-macromolecular-structure-dataset/RCSB_PDB_Macromolecular_Structure_Dataset.csv...
‚ö†Ô∏è 'structureId' column not found. Searching...
‚úÖ Found 11832 high-quality human proteins in CSV.
üöÄ Starting processing of 11832 proteins with SLIDING WINDOW...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed: 18.3min


Desired structure not found or download failed. '9eu8': HTTP Error 404: Not Found
Desired structure not found or download failed. '9euc': HTTP Error 404: Not Found
Desired structure not found or download failed. '9bfy': HTTP Error 404: Not Found
Desired structure not found or download failed. '7a6w': HTTP Error 404: Not Found
Desired structure not found or download failed. '9f9m': HTTP Error 404: Not Found
Desired structure not found or download failed. '8c12': HTTP Error 404: Not Found
Desired structure not found or download failed. '8q7g': HTTP Error 404: Not Found
Desired structure not found or download failed. '6tjj': HTTP Error 404: Not Found
Desired structure not found or download failed. '6tjk': HTTP Error 404: Not Found
Desired structure not found or download failed. '7qsi': HTTP Error 404: Not Found
Desired structure not found or download failed. '7orq': HTTP Error 404: Not Found
Desired structure not found or download failed. '9flc': HTTP Error 404: Not Found
Desired structur

[Parallel(n_jobs=-1)]: Done 11242 tasks      | elapsed: 21.0min


Desired structure not found or download failed. '9eu9': HTTP Error 404: Not Found
Desired structure not found or download failed. '9eud': HTTP Error 404: Not Found
Desired structure not found or download failed. '9bg1': HTTP Error 404: Not Found
Desired structure not found or download failed. '9ey4': HTTP Error 404: Not Found
Desired structure not found or download failed. '9f9a': HTTP Error 404: Not Found
Desired structure not found or download failed. '9bc4': HTTP Error 404: Not Found
Desired structure not found or download failed. '8bjx': HTTP Error 404: Not Found
Desired structure not found or download failed. '5lvs': HTTP Error 404: Not Found
Desired structure not found or download failed. '8d35': HTTP Error 404: Not Found
Desired structure not found or download failed. '7nh6': HTTP Error 404: Not Found
Desired structure not found or download failed. '7qse': HTTP Error 404: Not Found
Desired structure not found or download failed. '7qnv': HTTP Error 404: Not Found
Desired structur

[Parallel(n_jobs=-1)]: Done 11832 out of 11832 | elapsed: 22.1min finished


üéâ Successfully extracted 43179 training samples!
üíæ Dataset saved to: protein_dataset_human_128x128.npz
