# **ZincSight**: Interpretable prediction of zinc ion locations in proteins

⚠️ **Important Instructions:**

- If ZincSight crashes, please reset the runtime manually:
  Go to `Runtime` → `Disconnect and delete runtime`, then refresh the page and try again.
  <p></p>
- If the issue continues, contact **mechtinger1@mail.tau.ac.il** and include the **protein IDs or structures** you used for reproduction and debugging.
  <p></p>
- For maximum speed, change the hardware accelerator: Go to `Runtime` → `Change runtime type` → select `TPU`.
  <p></p>



In [None]:
#@title üß¨ ZincSight: Structure Input & Configuration {display-mode: "form"}

#@markdown **Enter your protein(s)** in the field below or upload **PDB** or **mmCIF** structures.
#@markdown - Query IDs: **PDB** (ASU or biological assembly), **AlphaFoldDB** (UniProt or Model ID), **ESM Metagenomic Atlas**, or **TED Domains**.
#@markdown - To upload your own structure files (including `.tar.gz`), check the appropriate box below.
#@markdown - **Note:** All input folders will be flattened (all files moved to the root query folder).
#@markdown - **Drive import auto-detect:** folder/archive => structures; .txt/.csv/.tsv => ID list; .pdb/.cif/.ent(.gz) => structure file.

# --- UI CONTROLS ---

#@markdown ### üîë 1. Enter Identifiers
structure_ids = "" #@param {type:"string"}

#@markdown ---
#@markdown ### üìÇ 2. Import Local Files
upload_structures = False #@param {type:"boolean"}
upload_tar_gz = False #@param {type:"boolean"}
upload_id_list_txt = True #@param {type:"boolean"}

#@markdown ---
#@markdown ### ‚òÅÔ∏è 3. Import from Google Drive
import_from_google_drive = True # @param {"type":"boolean"}
drive_path = "" #@param {type:"string"}
#@markdown *Path relative to 'My Drive'. Can be a file, folder, or .tar.gz archive.*
#@markdown *Drive import auto-detect:*
#@markdown - **Folder** ‚Üí **structure files** (copied + flattened into `/content/query_structures`)
#@markdown - **.tar.gz / .tgz** ‚Üí **structures archive** (extracted + flattened)
#@markdown - **.pdb / .cif / .pdb.gz / .cif.gz** ‚Üí **single structure file** (copied)
#@markdown - **.txt/ .csv / .tsv** ‚Üí **ID list** (comma/whitespace-separated)
#@markdown ---
#@markdown ### ‚öôÔ∏è 4. Settings
initialize_fresh_query = True #@param {type:"boolean"}
include_histidine_rotamers = False #@param {type: "boolean"}
create_pymol_sessions = False #@param {type: "boolean"}

#@markdown ---
#@markdown ### üìÅ 5. Output folder name
name_of_output_folder = "" #@param {type:"string"}
#@markdown *If relative, it will be created under MyDrive. If absolute (e.g., /content/results), it will be created there.*

# --- IMPLEMENTATION ---

import re, shutil, tarfile, os
from pathlib import Path
from google.colab import files, drive

# Single flat folder that receives every uploaded, extracted, or Drive-imported file.
QUERY_DIR = Path("/content/query_structures")

def setup_environment():
    """Prepare QUERY_DIR for this run.

    When ``initialize_fresh_query`` is set and the directory already exists,
    the whole directory tree is deleted first. In every case the directory
    exists when this function returns.
    """
    stale_query_present = initialize_fresh_query and QUERY_DIR.exists()
    if stale_query_present:
        print("üßπ Initializing query: clearing previous files.")
        shutil.rmtree(QUERY_DIR)

    QUERY_DIR.mkdir(parents=True, exist_ok=True)

def flatten_directory():
    """Collapse QUERY_DIR into a single level.

    Every file found in a subdirectory is moved directly under QUERY_DIR
    (a file already there with the same name is replaced), after which any
    subdirectory that has become empty is removed.
    """
    # Pass 1: lift nested files up to the root. The listing is materialised
    # first so that moving files does not disturb the traversal.
    for entry in list(QUERY_DIR.rglob("*")):
        if entry.parent == QUERY_DIR or not entry.is_file():
            continue

        target = QUERY_DIR / entry.name
        # Best-effort removal of a same-named file; last one moved wins.
        try:
            if target.exists():
                target.unlink()
        except Exception:
            pass
        shutil.move(str(entry), str(target))

    # Pass 2: drop directories, deepest paths first (reverse-sorted order),
    # so children are tried before their parents.
    for entry in sorted(QUERY_DIR.rglob("*"), reverse=True):
        if not entry.is_dir() or entry == QUERY_DIR:
            continue
        try:
            entry.rmdir()
        except OSError:
            # Directory is not empty (or cannot be removed): leave it.
            pass

def extract_archive(file_path: Path):
    """Extract a gzip-compressed tar archive into QUERY_DIR and flatten it.

    Args:
        file_path: Path of the ``.tar.gz`` / ``.tgz`` archive to unpack.

    Errors are reported with a printed message rather than raised, so one
    bad archive does not abort the rest of the import.
    """
    print(f"üì¶ Extracting: {file_path.name}...")
    try:
        with tarfile.open(file_path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # The archive is user-supplied: the "data" filter rejects
                # members that would land outside QUERY_DIR (absolute paths,
                # "..", links pointing outside) and special files.
                tar.extractall(path=QUERY_DIR, filter="data")
            else:
                # Older interpreters without extraction filters.
                tar.extractall(path=QUERY_DIR)
        flatten_directory()
    except Exception as e:
        # Deliberately broad: corrupt gzip/tar data surfaces as several
        # unrelated exception types, and the import should keep going.
        print(f"‚ùå Extraction error: {e}")

def parse_ids(text: str):
    """Split free-form input into a list of ID tokens.

    Tokens are separated by any run of commas and/or whitespace; empty
    tokens are dropped and the input order is kept.
    """
    pieces = re.split(r"[\s,]+", text.strip())
    return list(filter(None, pieces))

def ensure_output_dir(path_str: str) -> Path:
    """Resolve the output folder name to a directory and make sure it exists.

    An absolute path is used unchanged; a relative one is placed under
    ``/content/drive/MyDrive``. Missing parent directories are created.

    Raises:
        ValueError: if ``path_str`` is empty or only whitespace.
    """
    cleaned = path_str.strip() if path_str else ""
    if not cleaned:
        raise ValueError("name_of_output_folder is empty")

    requested = Path(cleaned)
    if requested.is_absolute():
        out_dir = requested
    else:
        out_dir = Path("/content/drive/MyDrive") / requested

    out_dir.mkdir(parents=True, exist_ok=True)
    return out_dir

def handle_drive_import(rel_path: str, final_ids: list):
    """Import a Drive path into the query, auto-detecting what it contains.

    Detection order:
      - folder => copy every file inside it (recursively) into QUERY_DIR
      - .tar.gz/.tgz => extract (structures) + flatten
      - .txt/.csv/.tsv => parse IDs and append them to ``final_ids``
      - .pdb/.cif/.ent (optionally .gz) => copy as a structure file
      - anything else => copy and warn

    All extension checks are case-insensitive.

    Args:
        rel_path: Path relative to ``/content/drive/MyDrive``; leading
            slashes are ignored.
        final_ids: ID list collected so far; extended in place when the
            path is an ID-list file.

    Returns:
        The same ``final_ids`` list object.
    """
    drive.mount("/content/drive", force_remount=False)
    src = Path("/content/drive/MyDrive") / rel_path.lstrip("/")

    if not src.exists():
        print(f"‚ùå Not found on Drive: {src}")
        return final_ids

    name_lower = src.name.lower()

    # Folder of structures. Checked before the archive test so that a folder
    # whose name happens to end in ".tar.gz" is not handed to the extractor.
    if src.is_dir():
        print(f"üìÇ Importing structure files from Drive folder: {src} (Flattening...)")
        count = 0
        for f in src.rglob("*"):
            if f.is_file():
                # Copied straight into QUERY_DIR, so same-named files from
                # different subfolders overwrite each other.
                shutil.copy2(f, QUERY_DIR)
                count += 1
        print(f"   -> Copied {count} files.")
        flatten_directory()
        return final_ids

    # Archives of structures (matched on the lower-cased name, like the
    # structure-file check below).
    if name_lower.endswith((".tar.gz", ".tgz")):
        extract_archive(src)
        return final_ids

    # Single file
    suffix = src.suffix.lower()

    # ID list files
    if suffix in {".txt", ".csv", ".tsv"}:
        text = src.read_text(encoding="utf-8", errors="replace")
        new_ids = parse_ids(text)
        final_ids.extend(new_ids)
        print(f"üßæ Parsed IDs from Drive file: {src.name} -> +{len(new_ids)} IDs")
        return final_ids

    # Structure files (including .pdb.gz etc.)
    if suffix in {".pdb", ".cif", ".ent"} or name_lower.endswith((".pdb.gz", ".cif.gz", ".ent.gz")):
        shutil.copy2(src, QUERY_DIR)
        print(f"üß¨ Imported structure file from Drive: {src.name}")
        return final_ids

    # Unknown file type: still copied so the user can inspect it.
    shutil.copy2(src, QUERY_DIR)
    print(f"‚ö†Ô∏è Drive file type not recognized as ID list or structure: {src.name} (copied anyway).")
    return final_ids


# --- EXECUTION ---

# Fail fast: without an output folder name the run cannot finish, so stop
# before wiping the query folder or asking the user to upload anything.
if not name_of_output_folder or not name_of_output_folder.strip():
    raise ValueError("name_of_output_folder is empty")

setup_environment()
final_ids = parse_ids(structure_ids)

# Local Uploads: ID list
if upload_id_list_txt:
    print("üì§ Upload .txt ID list:")
    for content in files.upload().values():
        final_ids.extend(parse_ids(content.decode("utf-8", errors="replace")))

# Local Uploads: structures
if upload_structures:
    print("üì§ Upload PDB/mmCIF structures:")
    # files.upload() is run from inside QUERY_DIR so the uploaded files are
    # written there; the working directory is restored even if it fails.
    os.chdir(QUERY_DIR)
    try:
        _ = files.upload()
    finally:
        os.chdir("/content")

# Local Uploads: archive of structures
if upload_tar_gz:
    print("üì§ Upload .tar.gz archive:")
    for name, content in files.upload().items():
        tmp = Path("/content") / name
        tmp.write_bytes(content)
        extract_archive(tmp)
        tmp.unlink(missing_ok=True)

# Drive Import (auto-detect)
# Also mount drive if output folder is relative (MyDrive).
# name_of_output_folder is a plain string, so it has to be wrapped in Path
# before asking whether it is absolute.
output_is_relative = not Path(name_of_output_folder.strip()).is_absolute()
need_drive = (import_from_google_drive and bool(drive_path)) or output_is_relative
if need_drive:
    drive.mount("/content/drive", force_remount=False)

if import_from_google_drive and drive_path:
    final_ids = handle_drive_import(drive_path, final_ids)

# Ensure output folder exists
OUT_DIR = ensure_output_dir(name_of_output_folder)
print(f"üìÅ Output folder ready: {OUT_DIR}")

# Force flatten one last time to be safe
flatten_directory()

# --- RESULTS SUMMARY ---

# Drop repeated IDs; dict keys keep the first-seen order.
unique_ids = list(dict.fromkeys(final_ids))
structure_ids_for_download = ",".join(unique_ids)

# File-name endings (compared lower-cased) used to classify the query folder.
_STRUCTURE_ENDINGS = (".pdb", ".cif", ".ent", ".pdb.gz", ".cif.gz", ".ent.gz")
_ID_LIST_ENDINGS = (".txt", ".csv", ".tsv")


def _query_files_ending_with(endings):
    """List the files directly under QUERY_DIR whose lower-cased name ends with one of ``endings``."""
    return [
        entry for entry in QUERY_DIR.glob("*")
        if entry.is_file() and entry.name.lower().endswith(endings)
    ]


# Structure files, including gzip-compressed ones (.pdb.gz etc.)
found_structures = _query_files_ending_with(_STRUCTURE_ENDINGS)

# ID-list-like files sitting in the query folder (reported for information only)
id_list_files_in_query = _query_files_ending_with(_ID_LIST_ENDINGS)

_rule = "‚îÄ"*50
print("\n" + _rule)
print("üöÄ **ZincSight: Input Ready (FLATTENED)**")
print(f"‚Ä¢ Identifiers to fetch: {len(unique_ids)}")
print(f"‚Ä¢ Structure files detected: {len(found_structures)}")
if id_list_files_in_query:
    print(f"‚Ä¢ Note: ID-list-like files in query folder: {len(id_list_files_in_query)} (not treated as structures)")
print("‚Ä¢ Folder structure: All files moved to root ‚úÖ")
print(f"‚Ä¢ Output folder: {OUT_DIR}")

if found_structures:
    print(f"‚Ä¢ Sample structure file: {found_structures[0].name}")

nothing_to_process = not unique_ids and not found_structures
if nothing_to_process:
    print("\n‚ö†Ô∏è **Warning:** No data found. Check your inputs.")
print(_rule)

In [None]:
#@title Execute ZincSight (Auto-Batching 50k local files + Chunked ID Downloads + Skip-If-Already-Exists + Resume Log) {display-mode: "form"}
from IPython.utils.capture import capture_output
import os
import sys
import shutil
import math
import multiprocessing
import platform
import re
from pathlib import Path
from google.colab import drive
from datetime import datetime

# -------------------------
# 0) USER PARAMS (expected to exist from earlier form cell)
# -------------------------
# include_histidine_rotamers : bool
# create_pymol_sessions      : bool
# name_of_output_folder      : str
# structure_ids_for_download : str

# -------------------------
# 1) SETUP & DEPENDENCIES
# -------------------------
# Marker file: its presence means the one-time setup below has already run,
# so re-executing the cell skips the clone and the pip install.
SETUP_MARKER = "/content/ENV_SETUP.marker"
if not os.path.exists(SETUP_MARKER):
    print("üîß Installing dependencies...")
    # capture_output() keeps the clone / pip output out of the cell.
    with capture_output():
        if not os.path.exists("/content/ZincSight"):
            !git clone https://github.com/MECHTI1/ZincSight.git
        %cd /content/ZincSight
        !pip install -r requirements.txt
        sys.path.append("/content/ZincSight")
    # NOTE(review): the marker is written without checking whether the clone
    # or the pip install succeeded, and their output is hidden above — a
    # failed setup would presumably not be retried on the next run; confirm.
    open(SETUP_MARKER, "w").close()

drive.mount("/content/drive")
# Put the repository first on the import path (this runs on every execution,
# including those where the setup block above was skipped).
sys.path.insert(0, "/content/ZincSight")
from main_execute import execute_zincsight

# -------------------------
# 2) CONFIGURATION
# -------------------------
# Working folders: inputs are batched under ROOT_QUERY, each batch writes to
# a subfolder of OUTPUT_DIR, and finished archives plus the resume log go to
# DRIVE_DEST (the output folder chosen in the form cell, OUT_DIR).
ROOT_QUERY = Path("/content/query_structures")
OUTPUT_DIR = Path("/content/output")
DRIVE_DEST = Path(OUT_DIR)
LOG_FILE = DRIVE_DEST / "zincsight_batch_log.txt"

BATCH_SIZE = 50000          # MAX LOCAL FILES PER BATCH
DOWNLOAD_ID_BATCH = 50000    # MAX IDs PER DOWNLOAD BATCH

for _required_dir in (ROOT_QUERY, OUTPUT_DIR, DRIVE_DEST):
    _required_dir.mkdir(parents=True, exist_ok=True)

# Worker count: one less than the CPU count reported by multiprocessing,
# never below 1.
physical_cores = max(multiprocessing.cpu_count() - 1, 1)

def parse_structure_ids(raw: str):
    """Turn a raw ID string into a clean list of IDs.

    IDs may be separated by commas, spaces, or newlines (any mix). Empty
    tokens are discarded and the original order is preserved. A falsy input
    (empty string, None) yields an empty list.
    """
    if not raw:
        return []

    tokens = re.split(r"[,\s]+", raw.strip())
    return [token for token in tokens if token != ""]

def existing_structure_keys(root: Path):
    """Collect a lookup key for every file found anywhere under ``root``.

    The key is the file name with an optional ``.gz`` and then one common
    structure extension (``.pdb`` / ``.cif`` / ``.mmcif`` / ``.ent``)
    removed. Extensions are matched case-insensitively, so ``1ABC.PDB`` and
    ``1ABC.pdb.gz`` both give ``1ABC``; the case of the remaining name is
    kept as-is. Files without a recognised extension contribute their name
    unchanged.

    Example: AF-A0A...-F1-model_v4.pdb.gz -> AF-A0A...-F1-model_v4

    Args:
        root: Directory to scan recursively.

    Returns:
        A set of key strings.
    """
    structure_exts = (".pdb", ".cif", ".mmcif", ".ent")
    keys = set()
    for p in root.rglob("*"):
        if not p.is_file():
            continue
        name = p.name
        # Lower-cased twin used only for matching; slices are taken from
        # ``name`` so the key keeps its original case.
        lowered = name.lower()

        # strip gzip if present
        if lowered.endswith(".gz"):
            name = name[:-3]
            lowered = lowered[:-3]

        # strip at most one structure extension
        for ext in structure_exts:
            if lowered.endswith(ext):
                name = name[: -len(ext)]
                break

        keys.add(name)
    return keys

# -------------------------
# 3) INTELLIGENT BATCH CREATION
# -------------------------
print("üßπ Organizing local files into batches...")

# Step A: gather every file into one flat, freshly created staging folder
staging_dir = Path("/content/temp_staging")
if staging_dir.exists():
    shutil.rmtree(staging_dir)
staging_dir.mkdir()

# Snapshot the tree before moving anything so the traversal is not disturbed
# by the moves; files from any depth under ROOT_QUERY end up side by side.
file_count = 0
_snapshot = list(ROOT_QUERY.rglob("*"))
for _entry in _snapshot:
    if not _entry.is_file():
        continue
    shutil.move(str(_entry), str(staging_dir / _entry.name))
    file_count += 1

# Step B: split the staged files into numbered folders (batch_001, batch_002...)
# Sorted so that the same set of files always produces the same batches.
all_files = sorted(staging_dir.glob("*"))
total_files = len(all_files)
num_local_batches = 0 if not total_files else math.ceil(total_files / BATCH_SIZE)

print(f"   ‚Ä¢ Found {total_files} local files.")
print(f"   ‚Ä¢ Creating {num_local_batches} local batch(es).")

# One dict per batch: folder name, folder path, and the IDs to download
# ("" for batches made of local files).
batch_map = []

for batch_number in range(1, num_local_batches + 1):
    batch_name = f"batch_{batch_number:03d}"
    batch_path = ROOT_QUERY / batch_name
    batch_path.mkdir(exist_ok=True)

    first = (batch_number - 1) * BATCH_SIZE
    for staged_file in all_files[first:first + BATCH_SIZE]:
        shutil.move(str(staged_file), str(batch_path / staged_file.name))

    batch_map.append({"name": batch_name, "path": batch_path, "ids": ""})

# Step C: turn the ID list into download batches of at most DOWNLOAD_ID_BATCH IDs
download_ids_list = parse_structure_ids(structure_ids_for_download)

# IDs whose name matches a local file (extensions stripped) are not downloaded
# again; the scan covers the batch_### folders created just above.
if download_ids_list:
    existing_keys = existing_structure_keys(ROOT_QUERY)
    filtered = [candidate for candidate in download_ids_list if candidate not in existing_keys]
    removed = len(download_ids_list) - len(filtered)
    if removed > 0:
        print(f"   ‚Ä¢ Skipping {removed} download ID(s) already present as local files.")
    download_ids_list = filtered

if download_ids_list:
    n_download_batches = math.ceil(len(download_ids_list) / DOWNLOAD_ID_BATCH)

    print(f"   ‚Ä¢ Splitting {len(download_ids_list)} download IDs into {n_download_batches} batch(es) "
          f"({DOWNLOAD_ID_BATCH} IDs each; last may be smaller).")

    for chunk_number in range(1, n_download_batches + 1):
        # Numbering continues after the local batches.
        batch_index = num_local_batches + chunk_number

        download_batch_name = f"batch_{batch_index:03d}_downloads_{chunk_number:03d}"
        download_batch_path = ROOT_QUERY / download_batch_name
        download_batch_path.mkdir(exist_ok=True)

        first = (chunk_number - 1) * DOWNLOAD_ID_BATCH
        ids_chunk = download_ids_list[first:first + DOWNLOAD_ID_BATCH]

        batch_map.append({
            "name": download_batch_name,
            "path": download_batch_path,
            "ids": ",".join(ids_chunk),
        })
        print(f"     - Added: {download_batch_name} ({len(ids_chunk)} IDs)")

# The staging folder is only needed while batching
if staging_dir.exists():
    shutil.rmtree(staging_dir)

# -------------------------
# 4) LOG FILE FUNCTIONS
# -------------------------
def get_completed_batches():
    """Return the set of batch names recorded in LOG_FILE.

    Each non-blank line of the log is one batch name (surrounding whitespace
    removed). A missing log file gives an empty set.
    """
    finished = set()
    if LOG_FILE.exists():
        with open(LOG_FILE, "r") as log_handle:
            for raw_line in log_handle:
                entry = raw_line.strip()
                if entry:
                    finished.add(entry)
    return finished

def mark_batch_complete(b_name):
    """Append ``b_name`` as a new line at the end of LOG_FILE."""
    log_line = f"{b_name}\n"
    with open(LOG_FILE, "a") as log_handle:
        log_handle.write(log_line)

# -------------------------
# 5) EXECUTION LOOP
# -------------------------
completed_batches = get_completed_batches()

_banner = "‚ïê"*40
print("\n" + _banner)
print("üöÄ Starting Execution")
print(f"   ‚Ä¢ Destination: {DRIVE_DEST}")
print(f"   ‚Ä¢ Log File: {LOG_FILE.name}")
print(f"   ‚Ä¢ Previously Completed: {len(completed_batches)}")
print(_banner)

# NOTE(review): batches are identified by position-based names (batch_001, ...)
# and a batch's input folder is deleted both after it is processed and when it
# is skipped. Re-running only this cell after some batches finished appears to
# re-number the remaining files, so a logged name could cause unprocessed
# files to be skipped and removed — confirm the intended resume workflow.
for batch in batch_map:
    b_name, b_path, b_ids = batch["name"], batch["path"], batch["ids"]

    # Already recorded in the log: nothing to run, just reclaim the disk space.
    if b_name in completed_batches:
        print(f"\n‚è≠Ô∏è Skipping {b_name} (already completed)")
        if b_path.exists():
            shutil.rmtree(str(b_path))
        continue

    current_output = OUTPUT_DIR / b_name
    current_output.mkdir(parents=True, exist_ok=True)

    print(f"\n‚ñ∂Ô∏è Processing: {b_name}")
    if not b_ids:
        local_n = sum(1 for _ in b_path.glob("*")) if b_path.exists() else 0
        print(f"   (Local files: {local_n})")
    else:
        print("   (Run ZincSight for this chunk...)")

    # ZincSight's own console output is captured instead of shown in the cell.
    with capture_output() as captured_output:
        execute_zincsight(
            include_histidine_rotamers,
            b_ids,               # "" for local; comma-separated IDs for download chunks
            str(b_path),         # batch folder (local or download working folder)
            str(current_output), # output for this batch
            physical_cores,
            create_pymol_sessions
        )

    # Pack this batch's output folder into a .tar.gz and copy it to the destination
    archive_base = str(current_output / b_name)
    archive_name = shutil.make_archive(archive_base, "gztar", str(current_output))
    final_dest = DRIVE_DEST / f"ZincSight_{b_name}.tar.gz"
    shutil.copy2(archive_name, final_dest)

    print(f"‚úÖ Completed: {b_name}")
    print(f"   Saved to: {final_dest.name}")

    # Record success only after the archive has been copied
    mark_batch_complete(b_name)

    # Free local disk space: batch inputs first, then the batch outputs
    for _finished_dir in (b_path, current_output):
        if _finished_dir.exists():
            shutil.rmtree(str(_finished_dir))

print("\n" + _banner)
print("üèÅ ALL JOBS DONE")