# StegaShield: Colab Main Notebook
A fast, reproducible end-to-end pipeline with speed fixes for Colab.

## 1) Mount Google Drive

In [None]:

# Mount Google Drive into the Colab filesystem; the project paths used by the
# rest of the notebook live under this mount point.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,  # request a remount even if a mount is already present
)
print("Mounted Drive.")


Mounted at /content/drive
Mounted Drive.


## 2) Configure paths
Set the project root on Drive and the local runtime directories that data is copied to for faster I/O.

In [None]:

# === EDIT THIS IF NEEDED ===
# Project root on Google Drive; every Drive-side path below is derived from it.
DRIVE_ROOT = "/content/drive/MyDrive/project_codes/models_new"

# Root of the local (runtime-disk) workspace.
LOCAL_ROOT = "/content"                                   # local workspace

# Derived Drive paths
DRIVE_DATASET = DRIVE_ROOT + "/dataset/originals"         # pool of natural images
DRIVE_JPEGS = DRIVE_ROOT + "/JpegImages"                  # generated splits (by generate_dataset.py)

# Derived local paths
LOCAL_DATASET = LOCAL_ROOT + "/dataset/originals"         # local copy for speed
LOCAL_JPEGS = LOCAL_ROOT + "/JpegImages"                  # local generated splits

# Echo the resolved paths, one per line, with labels padded to a common width.
print(
    "\n".join(
        f"{name:<14}: {path}"
        for name, path in (
            ("DRIVE_ROOT", DRIVE_ROOT),
            ("DRIVE_DATASET", DRIVE_DATASET),
            ("DRIVE_JPEGS", DRIVE_JPEGS),
            ("LOCAL_DATASET", LOCAL_DATASET),
            ("LOCAL_JPEGS", LOCAL_JPEGS),
        )
    )
)


DRIVE_ROOT    : /content/drive/MyDrive/project_codes/models_new
DRIVE_DATASET : /content/drive/MyDrive/project_codes/models_new/dataset/originals
DRIVE_JPEGS   : /content/drive/MyDrive/project_codes/models_new/JpegImages
LOCAL_DATASET : /content/dataset/originals
LOCAL_JPEGS   : /content/JpegImages


## 3) Install dependencies

In [None]:

!pip -q install timm reedsolo pywavelets scikit-image joblib tqdm matplotlib
print("✅ Dependencies installed.")


✅ Dependencies installed.


## 4) Navigate to project folder on Drive

In [None]:

# Make the project folder on Drive the working directory: later cells refer to
# project files (e.g. watermark_core.py, generate_dataset.py) by relative path.
# DRIVE_ROOT is the Python variable set in the path-configuration cell.
%cd "{DRIVE_ROOT}"
# List the folder contents as a quick visual check that the project is there.
!ls -lah


/content/drive/MyDrive/project_codes/models_new
total 127K
-rw------- 1 root root  11K Nov  8 21:00 attacker.py
-rw------- 1 root root  16K Nov  8 21:00 cnn_train.py
drwx------ 2 root root 4.0K Nov  7 20:35 dataset
-rw------- 1 root root 7.3K Nov  8 21:00 embedder.py
-rw------- 1 root root  13K Nov  9 07:12 generate_dataset.py
-rw------- 1 root root 2.3K Nov  9 07:12 hybrid_train.py
-rw------- 1 root root 9.0K Nov  8 21:00 label_checker.py
-rw------- 1 root root  16K Nov  9 08:07 main_stegashield_colab.ipynb
drwx------ 2 root root 4.0K Nov  7 20:18 older
drwx------ 2 root root 4.0K Nov  9 08:00 train_residual_6k
-rw------- 1 root root 1.2K Nov  8 21:00 utils.py
-rw------- 1 root root 3.6K Nov  8 21:00 verifier.py
-rw------- 1 root root  35K Nov  9 07:54 watermark_core.py


## 5) Sanity checks for dataset pool on Drive

In [None]:

import os, subprocess, shlex

print("Listing top-level of DRIVE_DATASET:")
!ls -lah "$DRIVE_DATASET" | head -n 40 || echo "⚠️ Could not list DRIVE_DATASET"

print("\nCounting images (jpg/jpeg/png) recursively:")
!find "$DRIVE_DATASET" -type f \
  \( -iname "*.jpg" -o -iname "*.jpeg" -o -iname "*.png" \) | wc -l


Listing top-level of DRIVE_DATASET:
total 3.5G
-rw------- 1 root root 143K Nov  2 17:45 2007_000027.jpg
-rw------- 1 root root  54K Nov  2 17:45 2007_000032.jpg
-rw------- 1 root root  70K Nov  2 17:45 2007_000033.jpg
-rw------- 1 root root  64K Nov  2 17:45 2007_000039.jpg
-rw------- 1 root root  81K Nov  2 17:45 2007_000042.jpg
-rw------- 1 root root  72K Nov  2 17:45 2007_000061.jpg
-rw------- 1 root root 124K Nov  2 17:45 2007_000063.jpg
-rw------- 1 root root  99K Nov  2 17:45 2007_000068.jpg
-rw------- 1 root root  79K Nov  2 17:45 2007_000121.jpg
-rw------- 1 root root  78K Nov  2 17:45 2007_000123.jpg
-rw------- 1 root root 104K Nov  2 17:45 2007_000129.jpg
-rw------- 1 root root  77K Nov  2 17:45 2007_000170.jpg
-rw------- 1 root root 131K Nov  2 17:45 2007_000175.jpg
-rw------- 1 root root  88K Nov  2 17:45 2007_000187.jpg
-rw------- 1 root root  81K Nov  2 17:45 2007_000241.jpg
-rw------- 1 root root  23K Nov  2 17:45 2007_000243.jpg
-rw------- 1 root root  68K Nov  2 17:45 

## 6) Copy dataset to local runtime for speed

In [None]:

# Copy only once per session; rsync shows progress
!rm -rf "$LOCAL_DATASET"
!mkdir -p "$LOCAL_DATASET"
!rsync -ah --info=progress2 "$DRIVE_DATASET/" "$LOCAL_DATASET/"
print("✅ Copied dataset to local.")


          3.75G 100%    7.36MB/s    0:08:05 (xfr#33260, to-chk=0/33261)
✅ Copied dataset to local.


## 7) Apply speed patches (lower image size & JPEG probability)

In [None]:

# Speed patches: both sed commands edit watermark_core.py in place (sed -i).
# The file is addressed by relative path, so it is the copy in the current
# working directory that gets modified. A substitution whose pattern is not
# found leaves the file unchanged, which makes re-running this cell harmless.

# Reduce SimpleImageFolder(image_size=256) -> 224
# (no `g` flag: only the first match on each line is replaced)
!sed -i 's/image_size=256/image_size=224/' watermark_core.py

# Lower non-differentiable JPEG probability p_jpeg=0.5 -> 0.2
# (matches the exact `def __init__(self, p_jpeg=0.5):` signature text)
!sed -i 's/def __init__(self, p_jpeg=0.5):/def __init__(self, p_jpeg=0.2):/' watermark_core.py

# Printed unconditionally; it does not confirm that either pattern matched.
print("✅ Applied speed patches in watermark_core.py")


✅ Applied speed patches in watermark_core.py


## 8) (Optional) Cap steps per epoch for quick runs
*The patch makes the training loop read `MAX_STEPS` from the environment, falling back to 400 when the variable is not set.*

In [None]:
import os, re

# Text fragments used by the patch. The two anchors are the exact lines the
# guard is attached to; the marker is the text whose presence means the patch
# has already been applied.
# NOTE(review): the inserted code calls os.environ, so watermark_core.py must
# import os itself — confirm.
_GUARD_MARKER = 'MAX_STEPS = int(os.environ.get("MAX_STEPS"'
_PBAR_ANCHOR = 'pbar = tqdm(dl, desc=f"Epoch {epoch}/{epochs}")'
_LOOP_ANCHOR = 'for batch_idx, imgs in enumerate(pbar):'

with open("watermark_core.py", "r", encoding="utf-8") as f:
    src = f.read()

if _GUARD_MARKER in src:
    print("ℹ️ MAX_STEPS guard already present; skipping.")
elif _PBAR_ANCHOR not in src or _LOOP_ANCHOR not in src:
    # Both halves must be applied together: defining MAX_STEPS without the
    # loop check does nothing, and the loop check without the definition
    # raises NameError during training. Leave the file untouched instead.
    print("⚠️ Could not insert MAX_STEPS guard: expected training-loop lines "
          "not found in watermark_core.py; file left unchanged.")
else:
    # Define MAX_STEPS right after the progress bar is created ...
    src = src.replace(
        _PBAR_ANCHOR,
        _PBAR_ANCHOR + '\n        MAX_STEPS = int(os.environ.get("MAX_STEPS", "400"))'
    )
    # ... and break out of the batch loop once the cap is reached.
    src = src.replace(
        _LOOP_ANCHOR,
        _LOOP_ANCHOR + '\n            if batch_idx >= MAX_STEPS:\n                break'
    )
    with open("watermark_core.py", "w", encoding="utf-8") as f:
        f.write(src)
    print("✅ Inserted MAX_STEPS guard.")


ℹ️ MAX_STEPS guard already present; skipping.


## 9) Keep AMP on legacy API to avoid device_type kwarg issues

In [None]:

# Ensure we use torch.cuda.amp (widely supported on Colab) to avoid TypeError
# All three commands edit watermark_core.py in place; a pattern that is not
# found leaves the file unchanged, so re-running the cell is harmless.

# 1) Switch the import from torch.amp to torch.cuda.amp.
!sed -i 's/from torch.amp import autocast, GradScaler/from torch.cuda.amp import autocast, GradScaler/' watermark_core.py
# 2) Drop the device_type keyword from every GradScaler(...) construction.
!sed -i 's/GradScaler(device_type="cuda")/GradScaler()/g' watermark_core.py
# 3) Drop the device_type keyword from every autocast(...) call, keeping `enabled`.
!sed -i 's/autocast(device_type="cuda", enabled=use_amp)/autocast(enabled=use_amp)/g' watermark_core.py
# Printed unconditionally; it does not confirm that any pattern matched.
print("✅ AMP configured for Colab compatibility (expect a deprecation warning; it is harmless).")




## 10) Create a smaller residual training subset (6k images)

In [None]:

!rm -rf /content/train_residual_6k
!mkdir -p /content/train_residual_6k
!find "$LOCAL_DATASET" -type f \
  \( -iname "*.jpg" -o -iname "*.jpeg" -o -iname "*.png" \) \  | shuf -n 6000 \  | xargs -I{{}} cp "{{}}" /content/train_residual_6k/

print("Count in /content/train_residual_6k:")
!find /content/train_residual_6k -type f \
  \( -iname "*.jpg" -o -iname "*.jpeg" -o -iname "*.png" \) | wc -l


find: paths must precede expression: ` '
shuf: ' ': No such file or directory
Count in /content/train_residual_6k:
0


## 11) Train the residual encoder/decoder (quick run)

In [None]:

import os
os.environ["MAX_STEPS"] = "400"   # ~cap steps/epoch
!python hybrid_train.py   --image_dir /content/train_residual_6k   --epochs 3   --batch_size 32   --num_workers 0


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100% 528M/528M [00:02<00:00, 239MB/s]
Starting U-Net training...
Source images: /content/train_residual_6k
Epochs: 3, Batch Size: 32, LR: 0.0001
Payload bits: 112
Model will be saved to: best_residual_hybrid.pt
Traceback (most recent call last):
  File "/content/drive/MyDrive/project_codes/models_new/hybrid_train.py", line 49, in <module>
    main(args)
  File "/content/drive/MyDrive/project_codes/models_new/hybrid_train.py", line 23, in main
    core.train_residual_encoder(
  File "/content/drive/MyDrive/project_codes/models_new/watermark_core.py", line 636, in train_residual_encoder
    dl = DataLoader(
         ^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 388, in __init__
    sampler = RandomSampler(dataset, generator=generator)  # type: ignore[arg-type]
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

## 12) Configure dataset generation to 18k (balanced 3-way splits)

In [None]:

# Edit generate_dataset.py counts to:
# train: 3k each; val: 1.5k each; test: 1.5k each  -> total 18k
import ast, io, re

# Replacement for the *value* of the 'per_split' key. Only the value is swapped;
# the key, its indentation and the trailing comma in the file stay as they are.
_NEW_PER_SPLIT = """{
        'train': {'watermarked': 3000, 'tampered': 3000, 'unwatermarked': 3000},
        'val':   {'watermarked': 1500, 'tampered': 1500, 'unwatermarked': 1500},
        'test':  {'watermarked': 1500, 'tampered': 1500, 'unwatermarked': 1500}
    }"""


def _replace_per_split(source, new_value=_NEW_PER_SPLIT):
    """Return `source` with the value of the first 'per_split' dict key replaced.

    The value is located through the syntax tree rather than a regex: a
    non-greedy pattern such as ``\\{[\\s\\S]*?\\},`` stops at the first ``},``
    (the end of the nested 'train' dict) and leaves the rest of the old block
    behind, which breaks the file.

    Raises:
        RuntimeError: if `source` is not valid Python, if no dict literal with
            a 'per_split' key exists, or if the patched text would not parse.
    """
    try:
        tree = ast.parse(source)
    except SyntaxError as exc:
        raise RuntimeError(
            "generate_dataset.py is not valid Python "
            f"(line {exc.lineno}: {exc.msg}); restore a clean copy of the file "
            "before re-running this cell."
        ) from exc

    target = None
    for node in ast.walk(tree):
        if not isinstance(node, ast.Dict):
            continue
        for key, value in zip(node.keys, node.values):
            if isinstance(key, ast.Constant) and key.value == 'per_split':
                target = value
                break
        if target is not None:
            break
    if target is None:
        raise RuntimeError("No dict literal with a 'per_split' key found in generate_dataset.py.")

    # ast column offsets count UTF-8 bytes, so slice the boundary lines as bytes.
    lines = source.split("\n")
    first_line = lines[target.lineno - 1].encode("utf-8")
    last_line = lines[target.end_lineno - 1].encode("utf-8")
    head = lines[:target.lineno - 1] + [first_line[:target.col_offset].decode("utf-8")]
    tail = [last_line[target.end_col_offset:].decode("utf-8")] + lines[target.end_lineno:]
    patched = "\n".join(head) + new_value + "\n".join(tail)

    # Never hand back text that would not even parse.
    try:
        ast.parse(patched)
    except SyntaxError as exc:
        raise RuntimeError(
            f"Patched generate_dataset.py would not parse (line {exc.lineno}: {exc.msg}); "
            "file left unchanged."
        ) from exc
    return patched


with open("generate_dataset.py", "r", encoding="utf-8") as f:
    src = f.read()

# Raises before anything is written if the file cannot be patched safely.
src = _replace_per_split(src)

with open("generate_dataset.py", "w", encoding="utf-8") as f:
    f.write(src)

print("✅ Updated CONFIG['per_split'] to balanced 18k.")


✅ Updated CONFIG['per_split'] to balanced 18k.


## 13) Generate dataset (locally)

In [None]:

# Ensure originals_dir points to local dataset if you want local generation speed
# Otherwise leave as-is to use DRIVE_DATASET.
# Here we run with defaults; generate_dataset.py reads from CONFIG.
# NOTE(review): nothing in this cell switches the script to the local dataset;
# which originals folder is used depends on CONFIG inside generate_dataset.py —
# confirm before relying on local-disk speed.
!python generate_dataset.py --jobs 2
# Preview the output folder; the path is relative to the current working directory.
!ls -lah JpegImages | head -n 40


  File "/content/drive/MyDrive/project_codes/models_new/generate_dataset.py", line 46
    'attack_presets': [
IndentationError: unexpected indent
ls: cannot access 'JpegImages': No such file or directory


## 14) (Optional) Mirror JpegImages to local for training speed

In [None]:

# If JpegImages were created under DRIVE_ROOT, sync them locally for speed.
if os.path.exists("JpegImages"):
    # JpegImages exists in current directory; copy to LOCAL_JPEGS
    !rm -rf "$LOCAL_JPEGS"
    !mkdir -p "$LOCAL_JPEGS"
    !rsync -ah --info=progress2 "JpegImages/" "$LOCAL_JPEGS/"
else:
    # Fall back to Drive path
    !rm -rf "$LOCAL_JPEGS"
    !mkdir -p "$LOCAL_JPEGS"
    !rsync -ah --info=progress2 "$DRIVE_JPEGS/" "$LOCAL_JPEGS/"
print("✅ Local JpegImages mirror ready:", os.path.exists(LOCAL_JPEGS))


rsync: [sender] change_dir "/content/drive/MyDrive/project_codes/models_new/JpegImages" failed: No such file or directory (2)
              0 100%    0.00kB/s    0:00:00 (xfr#0, to-chk=0/0)
rsync error: some files/attrs were not transferred (see previous errors) (code 23) at main.c(1338) [sender=3.2.7]
✅ Local JpegImages mirror ready: True


## 15) Run label checker (auto-fix embedding issues)

In [None]:

!python label_checker.py
!ls -lah JpegImages/problematic | head -n 40 || echo "No problematic files folder."


Traceback (most recent call last):
  File "/content/drive/MyDrive/project_codes/models_new/label_checker.py", line 5, in <module>
    from verifier import extract_and_verify
  File "/content/drive/MyDrive/project_codes/models_new/verifier.py", line 14, in <module>
    from embedder import REGISTRY, bytes_to_tensor
  File "/content/drive/MyDrive/project_codes/models_new/embedder.py", line 129, in <module>
    from verifier import external_extract
ImportError: cannot import name 'external_extract' from partially initialized module 'verifier' (most likely due to a circular import) (/content/drive/MyDrive/project_codes/models_new/verifier.py)
ls: cannot access 'JpegImages/problematic': No such file or directory


## 16) Train the CNN classifier (Xception + aux features)

In [None]:

!python cnn_train.py   --metadata JpegImages/metadata.csv   --epochs 5   --batch_size 32   --num_workers 2


Traceback (most recent call last):
  File "/content/drive/MyDrive/project_codes/models_new/cnn_train.py", line 369, in <module>
    train(
  File "/content/drive/MyDrive/project_codes/models_new/cnn_train.py", line 170, in train
    full_ds_train = StegaDataset(metadata_csv, transform=train_transform, use_aux=use_aux, is_training=True)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/drive/MyDrive/project_codes/models_new/cnn_train.py", line 27, in __init__
    self.df = pd.read_csv(metadata_csv).fillna(0) # Fill NaNs with 0
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
          

## 17) Quick evaluation (example extract & verify)

In [None]:

# NOTE(review): the recorded output of this cell shows `verifier` failing to
# import because of a circular import with `embedder`; that has to be fixed in
# the project modules, not here.
from verifier import extract_and_verify
from pathlib import Path

IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")


def guess_original(tampered_stem, originals_dir=Path("dataset/originals")):
    """Guess which original image a generated file was derived from.

    Original file names themselves contain underscores (e.g. ``2007_000027``),
    so taking only the text before the first underscore matches an arbitrary
    image that shares the same leading token. Instead, the longest
    underscore-delimited prefix of `tampered_stem` that equals the stem of an
    image in `originals_dir` wins. If no prefix matches, fall back to the
    previous heuristic (first entry whose name starts with the first token).

    Returns a Path, or None when the folder is missing or nothing matches.
    """
    if not originals_dir.is_dir():
        return None
    entries = sorted(originals_dir.iterdir())  # sorted -> deterministic choice
    by_stem = {}
    for entry in entries:
        if entry.suffix.lower() in IMAGE_SUFFIXES:
            by_stem.setdefault(entry.stem, entry)

    parts = tampered_stem.split('_')
    for end in range(len(parts), 0, -1):
        match = by_stem.get("_".join(parts[:end]))
        if match is not None:
            return match
    # Legacy fallback; assumes generated names start with the original's name.
    return next((entry for entry in entries if entry.name.startswith(parts[0])), None)


# Try to pick one sample from test/tampered if available
cand = None
test_tamp = Path("JpegImages/test/tampered")
if test_tamp.exists():
    # Sorted so the same sample is picked on every run.
    for p in sorted(test_tamp.glob("*")):
        if p.suffix.lower() in IMAGE_SUFFIXES:
            cand = str(p)
            break

if cand:
    # Heuristic way to infer original path (your pipeline stores it in metadata too)
    stem = Path(cand).stem
    # This is an example — adjust if your naming differs
    original_guess = guess_original(stem)
    print("Sample Tampered:", cand)
    print("Original Guess :", str(original_guess) if original_guess else "not found")
    if original_guess:
        res = extract_and_verify(cand, str(original_guess), params={'payload_bytes': b'StegaShield_v1', 'digest_bits': 128})
        print(res)
else:
    print("No sample found in JpegImages/test/tampered to demo extract_and_verify.")


ImportError: cannot import name 'external_extract' from partially initialized module 'verifier' (most likely due to a circular import) (/content/drive/MyDrive/project_codes/models_new/verifier.py)

## 18) (Optional) Save key artifacts back to Drive

In [None]:

import os
import shutil

# Save trained models/checkpoints back to Drive for persistence
checkpoint_dir = f"{DRIVE_ROOT}/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Still best-effort (a missing or uncopyable file does not stop the cell), but
# each outcome is reported instead of printing success unconditionally.
saved_artifacts, skipped_artifacts = [], []
for artifact in ("best_residual_hybrid.pt", "stegashield_cnn_final.pth"):
    try:
        shutil.copy(artifact, checkpoint_dir)
    except OSError as exc:
        skipped_artifacts.append(f"{artifact} ({exc.strerror or exc})")
    else:
        saved_artifacts.append(artifact)

if saved_artifacts:
    print("✅ Saved checkpoints to", checkpoint_dir, "->", ", ".join(saved_artifacts))
if skipped_artifacts:
    print("⚠️ Not saved:", "; ".join(skipped_artifacts))


## 19) Session info

In [None]:

!nvidia-smi
import torch, platform
print("Torch:", torch.__version__)
print("Python:", platform.python_version())
