# 🧬 Geometry-Complete Equivariant Diffusion Model
## De Novo Drug Design Training Notebook

**Requirements:** GPU Runtime (T4), ~55GB disk space

## Cell 1: Check GPU & Disk Space

In [None]:
import torch
import shutil

# Report the PyTorch build, then the first CUDA device if the runtime has one.
print(f"PyTorch: {torch.__version__}")
if not torch.cuda.is_available():
    print("‚ùå No GPU! Runtime > Change runtime type > GPU")
else:
    print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")
    print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Space on the root filesystem, shown in decimal gigabytes (1e9 bytes).
disk = shutil.disk_usage('/')
total, used, free = disk
print(f"\nüíæ Disk: {free / 1e9:.1f} GB free of {total / 1e9:.1f} GB")

# Warn when less than 55 GB is free.
free_gb = free / 1e9
if free_gb < 55:
    print("‚ö†Ô∏è Less than 55GB free - may not fit full dataset")

## Cell 2: Install Dependencies

In [None]:
# Install the third-party packages used by this notebook (-q keeps pip quiet).
!pip install -q torch-geometric rdkit scipy numpy pyyaml tqdm wandb
# Import RDKit immediately: if the import raises, the cell stops here and the
# success message below is never printed. Only RDKit is checked this way.
from rdkit import Chem
print("‚úÖ All dependencies installed")

## Cell 3: Clone Repository

In [None]:
!git clone https://github.com/Nethrananda21/geom_diffusion.git
%cd geom_diffusion
!git pull origin master

## Cell 4: Download & Extract Dataset (Stream Mode - Saves 50GB!)

⚠️ This downloads and extracts simultaneously, never storing the full 50GB archive.

**Skip this cell to use synthetic data for quick testing.**

In [None]:
import os

# Create the data directory (relative to the current working directory);
# exist_ok=True makes this a no-op when it already exists.
os.makedirs('data', exist_ok=True)

print("üì• Downloading and extracting CrossDocked2020 (streaming mode)...")
print("   This takes 30-60 minutes. Do NOT close the browser.")

# Stream download + extract (no intermediate .tgz file saved): wget writes the
# archive to stdout (-O -) and tar reads it from stdin (-f -), unpacking into ./data/.
# NOTE(review): wget's stderr is discarded (2>/dev/null), so wget's own error
# messages are not shown if the download fails.
!wget -O - --no-check-certificate http://bits.csb.pitt.edu/files/crossdock2020/CrossDocked2020_v1.3.tgz 2>/dev/null | tar -xzf - -C ./data/

# Verify extraction by checking for the expected top-level folder.
if os.path.exists('data/CrossDocked2020'):
    print("\n‚úÖ Dataset extracted successfully to data/CrossDocked2020/")
    # Show the total on-disk size of the extracted dataset.
    !du -sh data/CrossDocked2020
else:
    # The archive may have unpacked under a different folder name; list data/
    # so the actual name can be read from the output.
    !ls -la data/
    print("\n‚ö†Ô∏è Check folder name above - may need to rename")

## Cell 5: Preprocess Dataset (Creates .pkl files)

In [None]:
# Find the extracted folder (might have different name)
import os
from pathlib import Path

data_dir = None
for folder in ['CrossDocked2020', 'crossdocked2020', 'CrossDocked2020_v1.3']:
    if Path(f'data/{folder}').exists():
        data_dir = f'data/{folder}'
        break

if data_dir:
    print(f"üìÇ Found dataset at: {data_dir}")
    print("‚è≥ Preprocessing (this takes 10-20 minutes)...")
    !python preprocess_crossdocked.py \
        --data_dir {data_dir} \
        --output_dir ./data/crossdocked \
        --config configs/debug_t4.yaml
    print("\n‚úÖ Preprocessing complete")
    !ls -la data/crossdocked/
else:
    print("‚ùå Dataset folder not found. Check Cell 4 output.")
    !ls -la data/

## Cell 6: Delete Old Cache (CRITICAL!)

In [None]:
import shutil
from pathlib import Path

# Remove the cache directory (and everything in it) when one is present.
cache_dir = Path('data/cache')
if not cache_dir.exists():
    print("‚ÑπÔ∏è No cache to delete")
else:
    shutil.rmtree(cache_dir)
    print("üóëÔ∏è Deleted old cache")

# Both preprocessed splits must be on disk before real-data training is reported.
train_pkl = Path('data/crossdocked/train_data.pkl')
val_pkl = Path('data/crossdocked/val_data.pkl')

have_real_data = all(split.exists() for split in (train_pkl, val_pkl))
if not have_real_data:
    print("\n‚ö†Ô∏è Real data NOT found - will use SYNTHETIC")
else:
    # File sizes are shown in decimal megabytes (1e6 bytes).
    print(f"\n‚úÖ Ready to train on REAL data:")
    print(f"   - {train_pkl} ({train_pkl.stat().st_size / 1e6:.1f} MB)")
    print(f"   - {val_pkl} ({val_pkl.stat().st_size / 1e6:.1f} MB)")

## Cell 7: Configure Training

In [None]:
import yaml

# Load the config, apply the Colab overrides, and write it back to the same file.
config_path = 'configs/debug_t4.yaml'
with open(config_path, 'r') as src:
    config = yaml.safe_load(src)

# Section -> keys to override: a short 10-epoch run and 2 data-loader workers.
colab_overrides = {
    'training': {'max_epochs': 10},
    'hardware': {'num_workers': 2},
}
for section, values in colab_overrides.items():
    config[section].update(values)

with open(config_path, 'w') as dst:
    yaml.dump(config, dst)

print(f"‚úÖ Config: batch={config['training']['batch_size']}, epochs={config['training']['max_epochs']}")

## Cell 8: Start Training 🚀

In [None]:
# Run the training script with configs/debug_t4.yaml, passing ./checkpoints as
# the checkpoint directory.
# Watch the logs: they should say "Loading preprocessed data", NOT "SYNTHETIC".
!python train.py --config configs/debug_t4.yaml --checkpoint_dir ./checkpoints

## Cell 9: Resume Training (If Interrupted)

In [None]:
# Uncomment the line below to resume an interrupted run from ./checkpoints/best_model.pt
# !python train.py --config configs/debug_t4.yaml --resume ./checkpoints/best_model.pt

## Cell 10: Download Trained Model

In [None]:
from google.colab import files
from pathlib import Path

# Hand the best checkpoint to the browser for download, if the file exists.
ckpt = Path('checkpoints/best_model.pt')
if not ckpt.exists():
    print("‚ùå No checkpoint yet")
else:
    files.download(str(ckpt))
    print("‚úÖ Model downloaded")