<a href="https://colab.research.google.com/github/ItsYaMushShroomSC/Booger-Invasion/blob/master/PTM_Mamba.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In content/drive/MyDrive/ptm-mamba-notbebook, that's the codebase to run

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os
os.environ["TORCH_HOME"] = "/content/drive/MyDrive/.cache/torch"
os.makedirs(os.path.join(os.environ["TORCH_HOME"], "hub/checkpoints"), exist_ok=True)
print("Drive mounted and TORCH_HOME =", os.environ["TORCH_HOME"])


Mounted at /content/drive
Drive mounted and TORCH_HOME = /content/drive/MyDrive/.cache/torch


In [None]:
import torch
print("torch:", torch.__version__)
print("torch.cuda.is_available():", torch.cuda.is_available())
print("cuda version reported by torch:", torch.version.cuda)


torch: 2.5.1+cu121
torch.cuda.is_available(): True
cuda version reported by torch: 12.1


In [None]:
!nvidia-smi

Wed Nov 12 16:15:05 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   62C    P8             13W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
# Guided installer for Colab (mount -> align torch -> native packages -> deps -> repo -> rust)
import os, json, subprocess, shlex, sys, time
from pathlib import Path

DRIVE_ROOT = "/content/drive/MyDrive"
STATE_PATH = os.path.join(DRIVE_ROOT, "ptm_install_state.json")
TORCH_HOME = os.path.join(DRIVE_ROOT, ".cache", "torch")

# Utility helpers
def run(cmd, check=True, shell=True, env=None):
    print("\n$ " + cmd)
    rc = subprocess.run(cmd, shell=shell, check=False, env=env)
    if check and rc.returncode != 0:
        raise RuntimeError(f"Command failed (exit {rc.returncode}): {cmd}")
    return rc.returncode

def save_state(state):
    try:
        with open(STATE_PATH, "w") as f:
            json.dump(state, f)
    except Exception as e:
        print("Warning: failed to write state:", e)

def load_state():
    if os.path.exists(STATE_PATH):
        try:
            with open(STATE_PATH) as f:
                return json.load(f)
        except:
            return {}
    return {}

# 1) Mount Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
os.makedirs(TORCH_HOME + "/hub/checkpoints", exist_ok=True)
os.environ["TORCH_HOME"] = TORCH_HOME
print("TORCH_HOME ->", os.environ["TORCH_HOME"])

state = load_state()
state.setdefault("steps_done", {})

# Helper to ask yes/no
def ask_yes(prompt, default="n"):
    resp = input(prompt + (" [y/N]: " if default.lower()=="n" else " [Y/n]: "))
    if not resp:
        return default.lower() == "y"
    return resp.strip().lower() in ("y","yes")

# 2) Check torch/cuda and optionally align to cu121
print("\n=== PyTorch / CUDA status ===")
try:
    import torch
    print("torch:", torch.__version__, "| torch.version.cuda:", torch.version.cuda, "| cuda available:", torch.cuda.is_available())
except Exception as e:
    print("PyTorch import failed:", e)
    print("We will attempt to (re)install PyTorch for colab cu121 if you agree.")

torch_ok = False
try:
    import torch
    torch_ok = (torch.version.cuda is not None and "121" in (torch.version.cuda or "") and torch.cuda.is_available())
except Exception:
    torch_ok = False

if not torch_ok:
    print("\nTorch is not aligned to cu121 or CUDA not available.")
    if ask_yes("Do you want to install PyTorch cu121 wheels now? (recommended for Colab)"):
        print("Installing PyTorch cu121 wheels. This may take a few minutes.")
        # Update pip/build tools
        run("pip install -U pip setuptools wheel ninja packaging jedi")
        # Uninstall existing torch to avoid conflicts
        run("pip uninstall -y torch torchvision torchaudio || true")
        # Install cu121 wheels
        run("pip install --upgrade --index-url https://download.pytorch.org/whl/cu121 torch torchvision torchaudio")
        print("\n=== DONE installing torch. You MUST restart the runtime now. ===")
        print("After restarting, re-run this same cell. The installer will continue from the next step.")
        save_state(state)
        raise SystemExit("Restart your Colab runtime and re-run this cell.")
    else:
        print("Skipping torch alignment. You may encounter build/compilation during subsequent installs.")
else:
    print("Torch looks good for cu121 and CUDA is available. Proceeding.")

# Refresh torch info after potential change
try:
    import importlib
    importlib.reload(torch)
    print("Current torch:", torch.__version__, "| torch.version.cuda:", torch.version.cuda, "| cuda available:", torch.cuda.is_available())
except Exception:
    pass

# 3) Install native-extension packages (causal-conv1d, mamba-ssm)
def ensure_importable(module_name):
    try:
        __import__(module_name)
        print(f"{module_name} is importable; skipping install.")
        return True
    except Exception as e:
        print(f"{module_name} not importable: {e}")
        return False

native_pkgs = [
    ("causal_conv1d", "causal-conv1d>=1.3.0", "causal_conv1d"),
    ("mamba_ssm", "mamba-ssm>=2.1.0", "mamba_ssm")
]

for modname, pkg_spec, checkname in native_pkgs:
    if state["steps_done"].get(modname):
        print(f"Skipping {pkg_spec} (already recorded).")
        continue
    if ensure_importable(checkname):
        state["steps_done"][modname] = True
        save_state(state)
        continue
    print(f"\nAbout to install native package: {pkg_spec}")
    if not ask_yes(f"Install {pkg_spec} now?"):
        print(f"Skipping {pkg_spec} by user request. You can install it later with `pip install \"{pkg_spec}\"`.")
        continue
    try:
        run(f"pip install \"{pkg_spec}\"")
    except Exception as e:
        print("Installation failed. Try re-aligning torch to cu121 and re-running. Error:", e)
        raise
    # Try import
    if ensure_importable(checkname):
        state["steps_done"][modname] = True
        save_state(state)
    else:
        print(f"Warning: {checkname} still not importable after install. You may need to restart the runtime and re-run the cell.")
        # recommend restart
        if ask_yes("Restart runtime now and re-run this cell?"):
            save_state(state)
            raise SystemExit("Please restart the runtime and re-run this cell.")

# 4) Install other python deps
other_deps = [
    "hydra-core", "omegaconf", "fair-esm", "transformers", "datasets",
    "accelerate", "evaluate", "pytest", "biopython", "deepspeed", "wandb"
]
if not state["steps_done"].get("other_deps"):
    print("\nAbout to install other Python deps:", ", ".join(other_deps))
    if ask_yes("Install other deps now? (This may take a few minutes)"):
        try:
            run("pip install " + " ".join(other_deps))
            state["steps_done"]["other_deps"] = True
            save_state(state)
        except Exception as e:
            print("Failed installing other deps:", e)
            raise
    else:
        print("Skipping other deps for now.")
else:
    print("Other deps already installed (per state file).")

# 5) PyTorch Geometric (optional)
if not state["steps_done"].get("pyg"):
    print("\nPyTorch Geometric (pyg) installation is optional and must match your torch+cuda.")
    if ask_yes("Attempt to install torch_geometric and common wheels (recommended only if torch is cu121)?"):
        # try auto-install; user may need to adjust if their torch version differs
        try:
            # try an automatic install; if fails, user will see pip output and can ask for help
            run("pip install torch_geometric -q || true")  # allow failure gracefully
            # attempt the dependency wheels too (this may or may not be necessary)
            # We won't force the wheel link so failures are visible.
            run("pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.0+cu121.html -q || true")
            print("Attempted pyg installs; check output above for success/failure.")
            state["steps_done"]["pyg"] = True
            save_state(state)
        except Exception as e:
            print("PyG install attempt failed or partial; please install manually with the wheel that matches your torch version.")
    else:
        print("Skipping PyG installation for now.")
else:
    print("PyG step already attempted (per state).")

# 6) Install repo in editable mode
# detect repo root
candidates = [
    "/content/ptm-mamba",
    "/content/drive/MyDrive/ptm-mamba-code",
    "/content/drive/MyDrive/ptm-mamba"
]
repo_root = None
for c in candidates:
    if os.path.exists(os.path.join(c, "setup.py")) or os.path.exists(os.path.join(c, "pyproject.toml")):
        repo_root = c
        break

if repo_root is None:
    print("\nCould not automatically find your repo root. Please enter the full path to the repo root (where setup.py or pyproject.toml is):")
    repo_root = input("Repo root path: ").strip()
    if not os.path.exists(repo_root):
        raise FileNotFoundError(f"Repo root not found: {repo_root}")

print("\nRepo root detected:", repo_root)
if not state["steps_done"].get("repo_installed"):
    if ask_yes("Install the repo in editable mode now (pip install -e .)?"):
        run(f"cd {shlex.quote(repo_root)} && pip install -e .")
        state["steps_done"]["repo_installed"] = True
        save_state(state)
        print("Repo installed; you may want to restart the runtime to ensure native extensions load cleanly.")
        if ask_yes("Restart runtime now? (recommended)"):
            print("Please restart the runtime and re-run this cell to continue any remaining steps.")
            raise SystemExit("Restart runtime and re-run the cell.")
    else:
        print("Skipping repo install for now.")
else:
    print("Repo already installed per state file.")

# 7) Rust + tokenizer (optional)
if not state["steps_done"].get("rust_trie"):
    print("\nRust-backed tokenizer (rust_trie) installation is optional. Requires rust toolchain.")
    if ask_yes("Install rust toolchain and rust_trie extension now?"):
        # install rust non-interactively
        try:
            run("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y")
            # source cargo env for this session
            cargo_env = os.path.expanduser("~/.cargo/env")
            if os.path.exists(cargo_env):
                run(f"source {shlex.quote(cargo_env)} && echo 'cargo env loaded'")
            else:
                print("Could not find ~/.cargo/env; you may need to restart and source it manually.")
            # install rust_trie from repo if present
            rt_path = os.path.join(repo_root, "protein_lm", "tokenizer", "rust_trie")
            if os.path.exists(rt_path):
                run(f"pip install -e {shlex.quote(rt_path)}")
                state["steps_done"]["rust_trie"] = True
                save_state(state)
            else:
                print("rust_trie not found at:", rt_path)
                print("Install manually after ensuring Rust is available.")
        except Exception as e:
            print("Rust or rust_trie installation failed:", e)
            raise
    else:
        print("Skipping rust_trie installation.")
else:
    print("Rust_trie already installed per state file.")

# Final summary
print("\n=== Installer finished current run ===")
print("Steps recorded in state file:", json.dumps(state.get("steps_done", {}), indent=2))
print(f"If the script asked you to restart the runtime, please do so now and re-run this cell. Otherwise, you should be ready to run the smoke test.")
print("To run a smoke test, remount Drive (if needed) and run your small training command (e.g., `python -m protein_lm.modeling.scripts.train +train=base dataset.subsample_size=20 ...`).")


Mounted at /content/drive
TORCH_HOME -> /content/drive/MyDrive/.cache/torch

=== PyTorch / CUDA status ===
PyTorch import failed: cannot import name 'profiler_allow_cudagraph_cupti_lazy_reinit_cuda12' from 'torch._utils_internal' (/usr/local/lib/python3.12/dist-packages/torch/_utils_internal.py)
We will attempt to (re)install PyTorch for colab cu121 if you agree.

Torch is not aligned to cu121 or CUDA not available.
Do you want to install PyTorch cu121 wheels now? (recommended for Colab) [y/N]: n
Skipping torch alignment. You may encounter build/compilation during subsequent installs.
causal_conv1d not importable: No module named 'causal_conv1d'

About to install native package: causal-conv1d>=1.3.0
Install causal-conv1d>=1.3.0 now? [y/N]: n
Skipping causal-conv1d>=1.3.0 by user request. You can install it later with `pip install "causal-conv1d>=1.3.0"`.
mamba_ssm not importable: No module named 'mamba_ssm'

About to install native package: mamba-ssm>=2.1.0
Install mamba-ssm>=2.1.0 now

RuntimeError: Command failed (exit 127): source /root/.cargo/env && echo 'cargo env loaded'

In [None]:
!ls -lah /content/drive/MyDrive/ptm-mamba-code
!ls -lah /content/drive/MyDrive/ptm-mamba-code/protein_lm/dataset | head -n 50

total 25K
-rwx------ 1 root root 1.1K Nov 12 16:17 Dockerfile
drwx------ 8 root root 4.0K Nov 12 16:17 .git
-rw------- 1 root root 3.5K Nov 12 16:17 .gitignore
-rwx------ 1 root root  551 Nov 12 16:17 LICENSE.md
drwx------ 8 root root 4.0K Nov 12 16:28 protein_lm
drwx------ 2 root root 4.0K Nov 12 16:17 ptm_data_preprocessing
-rwx------ 1 root root 5.8K Nov 12 16:17 README.md
-rwx------ 1 root root  110 Nov 12 16:17 setup.py
total 3.8M
drwx------ 2 root root 4.0K Nov 12 16:30 .ipynb_checkpoints
-rw------- 1 root root 3.8M Nov 12 16:30 ptm_labels.csv


In [None]:
# update installers
!pip install -q -U pip setuptools wheel ninja packaging jedi

# uninstall any mismatched torch installs first
!pip uninstall -y torch torchvision torchaudio

# install PyTorch builds for CUDA 12.1 (colab default)
!pip install -q --upgrade --index-url https://download.pytorch.org/whl/cu121 \
  torch torchvision torchaudio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.8.5 requires torch<2.10,>=1.10, which is not installed.[0m[31m
[0mFound existing installation: torchvision 0.23.0+cu126
Uninstalling torchvision-0.23.0+cu126:
  Successfully uninstalled torchvision-0.23.0+cu126
[0mFound existing installation: torchaudio 2.8.0+cu126
Uninstalling torchaudio-2.8.0+cu126:
  Successfully uninstalled t

In [None]:
# Install native-extension wheels that usually cause trouble
!pip install -q "causal-conv1d>=1.3.0"
!pip install -q "mamba-ssm>=2.1.0"

# Install other python deps
!pip install -q hydra-core omegaconf fair-esm

# Finally install your repo in editable mode
!pip install -q -e .


[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for causal-conv1d (pyproject.toml) ... [?25l[?25hcanceled
[31mERROR: Operation cancelled by user[0m[31m
[0m  Installing build dependencies ... [?25l[?25hcanceled[31mERROR: Operation cancelled by user[0m[31m
[0m[31mERROR: Operation cancelled by user[0m[31m
[0m

KeyboardInterrupt: 

In [None]:
# Single-cell safe copy from Drive -> /content with dry-run + confirmation
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, shlex, subprocess, sys
src = "/content/drive/MyDrive/ptm-mamba-code/"   # <-- source (trailing slash => copy contents)
dest = "/content/ptm-mamba"                      # <-- destination directory (will be created)

# sanity checks
print("Source exists:", os.path.exists(src))
if not os.path.exists(src):
    print("ERROR: Source path not found:", src)
    raise SystemExit(1)

# Dry-run: show what would be copied (safe)
print("\n=== DRY RUN: what would be copied ===")
cmd_dry = f"rsync -av --dry-run --exclude='.git' {shlex.quote(src)} {shlex.quote(dest)}"
print("$", cmd_dry)
subprocess.run(cmd_dry, shell=True, check=True)

# Ask for confirmation to proceed
proceed = input("\nProceed with actual copy? Type 'y' to continue: ").strip().lower()
if proceed != 'y':
    print("Aborted by user.")
    raise SystemExit(0)

# Make sure destination exists, then do the real copy
os.makedirs(dest, exist_ok=True)
cmd_real = f"rsync -av --exclude='.git' {shlex.quote(src)} {shlex.quote(dest)}"
print("\n=== RUNNING: copying now ===")
print("$", cmd_real)
subprocess.run(cmd_real, shell=True, check=True)

# Change notebook working directory into the destination repo (so %cd persists)
get_ipython().run_line_magic('cd', dest)

# List files and show size
print("\n=== Destination contents (top lines) ===")
get_ipython().run_cell_magic('bash', '', 'ls -lah | head -n 50')
print("\n=== Destination size ===")
get_ipython().run_cell_magic('bash', '', f"du -sh {shlex.quote(dest)}")


Mounted at /content/drive
Source exists: True

=== DRY RUN: what would be copied ===
$ rsync -av --dry-run --exclude='.git' /content/drive/MyDrive/ptm-mamba-code/ /content/ptm-mamba

Proceed with actual copy? Type 'y' to continue: y

=== RUNNING: copying now ===
$ rsync -av --exclude='.git' /content/drive/MyDrive/ptm-mamba-code/ /content/ptm-mamba
/content/ptm-mamba

=== Destination contents (top lines) ===
total 40K
drwx------ 4 root root 4.0K Nov 12 16:25 .
drwxr-xr-x 1 root root 4.0K Nov 12 16:41 ..
-rwx------ 1 root root 1.1K Nov 12 16:17 Dockerfile
-rw------- 1 root root 3.5K Nov 12 16:17 .gitignore
-rwx------ 1 root root  551 Nov 12 16:17 LICENSE.md
drwx------ 8 root root 4.0K Nov 12 16:28 protein_lm
drwx------ 2 root root 4.0K Nov 12 16:17 ptm_data_preprocessing
-rwx------ 1 root root 5.8K Nov 12 16:17 README.md
-rwx------ 1 root root  110 Nov 12 16:17 setup.py

=== Destination size ===
5.6M	/content/ptm-mamba


In [None]:
# Safe sync: VM -> Drive with dry-run + optional --delete
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, shlex, subprocess, sys

# Defaults (adjust if your paths differ)
default_vm = os.getcwd()                                # current notebook working dir (VM copy)
default_drive = "/content/drive/MyDrive/ptm-mamba-code" # your Drive repo path (change if different)

print(f"Default VM source directory: {default_vm}")
print(f"Default Drive destination directory: {default_drive}")
print("\nIf you want to sync a different folder, type the full path when prompted.\n")

# Ask user for paths (enter to accept default)
src = input(f"Source (VM) directory to sync (contents) [ENTER for {default_vm}]: ").strip() or default_vm
dest = input(f"Destination on Drive to update [ENTER for {default_drive}]: ").strip() or default_drive

# Normalize paths (ensure trailing slash for copying contents)
src = src.rstrip('/') + '/'
dest = dest.rstrip('/') + '/'

# Ensure destination exists (create on Drive if needed)
if not os.path.exists(dest):
    print(f"Destination {dest} does not exist. Creating it on Drive...")
    try:
        os.makedirs(dest, exist_ok=True)
    except Exception as e:
        print("Failed to create destination. Aborting.\nError:", e)
        raise SystemExit(1)

# Ask whether to mirror (delete files on Drive not present in VM)
mirror_input = input("Do you want to MIRROR the destination (delete files in Drive not in VM)? Type 'y' to enable: ").strip().lower()
do_delete = (mirror_input == 'y')
delete_flag = '--delete' if do_delete else ''

# Dry-run
print("\n=== DRY-RUN: showing what would be copied/changed on Drive ===")
dry_cmd = f"rsync -av --dry-run --exclude='.git' {delete_flag} {shlex.quote(src)} {shlex.quote(dest)}"
print("$", dry_cmd, "\n")
subprocess.run(dry_cmd, shell=True, check=True)

proceed = input("\nDry-run shown above. Proceed with the ACTUAL sync to Drive? Type 'y' to continue: ").strip().lower()
if proceed != 'y':
    print("Aborted by user. No changes made.")
    raise SystemExit(0)

# Actual rsync
real_cmd = f"rsync -av {delete_flag} --exclude='.git' {shlex.quote(src)} {shlex.quote(dest)}"
print("\n=== RUNNING: syncing now ===")
print("$", real_cmd, "\n")
subprocess.run(real_cmd, shell=True, check=True)

# Summary listing
print("\n=== Sync complete. Destination preview ===")
subprocess.run(f"ls -lah {shlex.quote(dest)} | head -n 50", shell=True, check=True)
print("\nYou can check the full results in Google Drive at:", dest)


Mounted at /content/drive
Default VM source directory: /content/ptm-mamba
Default Drive destination directory: /content/drive/MyDrive/ptm-mamba-code

If you want to sync a different folder, type the full path when prompted.

Source (VM) directory to sync (contents) [ENTER for /content/ptm-mamba]: 
Destination on Drive to update [ENTER for /content/drive/MyDrive/ptm-mamba-code]: 
Do you want to MIRROR the destination (delete files in Drive not in VM)? Type 'y' to enable: y

=== DRY-RUN: showing what would be copied/changed on Drive ===
$ rsync -av --dry-run --exclude='.git' --delete /content/ptm-mamba/ /content/drive/MyDrive/ptm-mamba-code/ 


Dry-run shown above. Proceed with the ACTUAL sync to Drive? Type 'y' to continue: y

=== RUNNING: syncing now ===
$ rsync -av --delete --exclude='.git' /content/ptm-mamba/ /content/drive/MyDrive/ptm-mamba-code/ 


=== Sync complete. Destination preview ===

You can check the full results in Google Drive at: /content/drive/MyDrive/ptm-mamba-code/
