# 🧬 Geometry-Complete Equivariant Diffusion Model
## De Novo Drug Design Training Notebook

**Requirements:** GPU Runtime (T4), ~50GB disk for real data

## Cell 1: Check GPU

In [None]:
import torch

# Report the framework version and whether a CUDA device is visible to it.
print(f"PyTorch: {torch.__version__}")
cuda_available = torch.cuda.is_available()
print(f"CUDA: {cuda_available}")

if cuda_available:
    # Device index 0 is the first visible GPU. total_memory is divided by
    # 1e9 so the figure is shown in (decimal) gigabytes.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    # Warning glyph repaired: it was previously emitted as mis-decoded bytes.
    print("⚠️ No GPU! Runtime > Change runtime type > GPU")

## Cell 2: Install Dependencies

In [None]:
!pip install -q torch-geometric rdkit scipy numpy pyyaml tqdm wandb
from rdkit import Chem
print("‚úÖ All dependencies installed")

## Cell 3: Clone Repository

In [None]:
!git clone https://github.com/Nethrananda21/geom_diffusion.git
%cd geom_diffusion
!git pull origin master

## Cell 4: Download CrossDocked2020 Dataset (~50GB)
⚠️ Skip this cell to use synthetic data for quick testing

In [None]:
# Download CrossDocked2020 (takes ~30 min)
!wget --no-check-certificate -q --show-progress http://bits.csb.pitt.edu/files/crossdock2020/CrossDocked2020_v1.3.tgz

# Extract
!mkdir -p data
!tar -xzf CrossDocked2020_v1.3.tgz -C ./data/
print("‚úÖ Dataset extracted")

# Clean up archive to save space
!rm CrossDocked2020_v1.3.tgz

## Cell 5: Preprocess Dataset

In [None]:
# Preprocess: creates train_data.pkl and val_data.pkl
!python preprocess_crossdocked.py \
    --data_dir ./data/CrossDocked2020 \
    --output_dir ./data/crossdocked \
    --config configs/debug_t4.yaml

print("\n‚úÖ Preprocessing complete")
!ls -la data/crossdocked/

## Cell 6: ⚠️ DELETE OLD CACHE (IMPORTANT!)

In [None]:
# CRITICAL: Delete old synthetic cache to use real data
import shutil
from pathlib import Path

# Remove the whole cache directory tree if one is present.
cache_dir = Path('data/cache')
if cache_dir.exists():
    shutil.rmtree(cache_dir)
    print("🗑️ Deleted old cache")
else:
    print("ℹ️ No cache to delete")

# Verify real data exists: both pickles must be present to count as ready.
train_pkl = Path('data/crossdocked/train_data.pkl')
val_pkl = Path('data/crossdocked/val_data.pkl')

if train_pkl.exists() and val_pkl.exists():
    print("✅ Real data ready:")
    # st_size is in bytes; dividing by 1e6 reports decimal megabytes.
    for pkl in (train_pkl, val_pkl):
        print(f"   - {pkl} ({pkl.stat().st_size / 1e6:.1f} MB)")
else:
    print("⚠️ Real data NOT found - will use synthetic")

## Cell 7: Configure Training

In [None]:
import yaml

CONFIG_PATH = 'configs/debug_t4.yaml'

# Load the training configuration that the later cells pass to train.py.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Adjust for Colab
config['training']['max_epochs'] = 10  # Quick test
config['hardware']['num_workers'] = 2  # Colab limit

# The file is rewritten in place (YAML comments are not preserved by PyYAML).
# safe_dump is the counterpart of safe_load, and sort_keys=False keeps the
# file's original key order instead of alphabetising it, which keeps the
# diff against the tracked config minimal.
with open(CONFIG_PATH, 'w', encoding='utf-8') as f:
    yaml.safe_dump(config, f, sort_keys=False)

print(f"✅ Config: batch_size={config['training']['batch_size']}, epochs={config['training']['max_epochs']}")

## Cell 8: Start Training 🚀

In [None]:
# Train the model with the config file edited in Cell 7.
# --checkpoint_dir points at ./checkpoints, which is where Cell 10 later
# looks for best_model.pt.
# Watch the logs - should say "Loading preprocessed data" NOT "SYNTHETIC"
!python train.py --config configs/debug_t4.yaml --checkpoint_dir ./checkpoints

## Cell 9: Resume Training (If Interrupted)

In [None]:
# Optional: uncomment the command below to continue an interrupted run from
# a saved checkpoint instead of starting over.
# NOTE(review): unlike the Cell 8 command, this one passes no
# --checkpoint_dir; confirm train.py's default still saves to ./checkpoints,
# which is where Cell 10 looks for best_model.pt.
# !python train.py --config configs/debug_t4.yaml --resume ./checkpoints/best_model.pt

## Cell 10: Download Trained Model

In [None]:
from google.colab import files
from pathlib import Path

# Checkpoint path under the ./checkpoints directory passed to train.py.
ckpt = Path('checkpoints/best_model.pt')

# is_file() (rather than exists()) avoids handing a directory of that name
# to the download helper.
if ckpt.is_file():
    # files.download is given the path as a string.
    files.download(str(ckpt))
    print("✅ Model downloaded")
else:
    print("❌ No checkpoint yet")