# 🧬 Geometry-Complete Equivariant Diffusion Model
## De Novo Drug Design Training Notebook (Google Drive Version)

**This version stores data on Google Drive to save Colab disk space.**

## Cell 1: Mount Google Drive

In [None]:
# Mount Google Drive and make sure the notebook's data folder exists on it.
import os
from google.colab import drive

drive.mount('/content/drive')

# Everything this notebook downloads or produces is kept under this Drive folder
DRIVE_DATA = '/content/drive/MyDrive/geom_diffusion_data'
os.makedirs(DRIVE_DATA, exist_ok=True)
print(f'‚úÖ Drive mounted. Data will be stored at: {DRIVE_DATA}')

## Cell 2: Check GPU & Install Dependencies

In [None]:
import torch
print(f'PyTorch: {torch.__version__}')
if torch.cuda.is_available():
    print(f'‚úÖ GPU: {torch.cuda.get_device_name(0)}')
else:
    print('‚ùå No GPU!')

!pip install -q torch-geometric rdkit scipy numpy pyyaml tqdm wandb
print('‚úÖ Dependencies installed')

## Cell 3: Clone Repository

In [None]:
!git clone https://github.com/Nethrananda21/geom_diffusion.git
%cd geom_diffusion
!git pull origin master

## Cell 4: Download Dataset to Google Drive

⚠️ Downloads ~50GB to your Google Drive (one-time). Skip if already downloaded.

In [None]:
import os

DRIVE_DATA = '/content/drive/MyDrive/geom_diffusion_data'
RAW_DATA = f'{DRIVE_DATA}/CrossDocked2020'

if os.path.exists(RAW_DATA):
    print(f'‚úÖ Dataset already exists at {RAW_DATA}')
    !du -sh {RAW_DATA}
else:
    print('üì• Downloading CrossDocked2020 to Google Drive...')
    print('   This takes 30-60 minutes. Do NOT close browser!')
    !curl -L --progress-bar http://bits.csb.pitt.edu/files/crossdock2020/CrossDocked2020_v1.3.tgz | tar -xzf - -C {DRIVE_DATA}/
    print('\n‚úÖ Download complete!')

## Cell 5: Create Symlink (Link Drive data to local)

In [None]:
import os

DRIVE_DATA = '/content/drive/MyDrive/geom_diffusion_data'

# Create symlink from Drive to local data folder
!rm -rf data
!ln -s {DRIVE_DATA} data
print('‚úÖ Created symlink: data -> Google Drive')
!ls -la data/

## Cell 6: Preprocess Dataset

In [None]:
import os
from pathlib import Path

# Check if already preprocessed
train_pkl = Path('data/crossdocked/train_data.pkl')
if train_pkl.exists():
    print(f'‚úÖ Already preprocessed: {train_pkl}')
else:
    # Find raw data folder
    for folder in ['CrossDocked2020', 'crossdocked2020']:
        if Path(f'data/{folder}').exists():
            print(f'üìÇ Found: data/{folder}')
            print('‚è≥ Preprocessing (10-20 min)...')
            !python preprocess_crossdocked.py \
                --data_dir data/{folder} \
                --output_dir data/crossdocked \
                --config configs/debug_t4.yaml
            print('\n‚úÖ Done!')
            break
    else:
        print('‚ùå Raw data not found. Run Cell 4 first.')

## Cell 7: Delete Cache & Verify

In [None]:
# Remove any stale cache directory, then report whether the preprocessed
# train/val pickles are available.
import shutil
from pathlib import Path

cache_dir = Path('data/cache')
train_pkl = Path('data/crossdocked/train_data.pkl')
val_pkl = Path('data/crossdocked/val_data.pkl')

if cache_dir.exists():
    shutil.rmtree(cache_dir)
    print('üóëÔ∏è Deleted old cache')

if not (train_pkl.exists() and val_pkl.exists()):
    print('‚ö†Ô∏è Data not found - will use synthetic')
else:
    print(f'‚úÖ Ready to train:')
    # Sizes are reported in megabytes (bytes / 1e6), one decimal place
    for label, pkl in (('Train', train_pkl), ('Val', val_pkl)):
        size_mb = pkl.stat().st_size / 1e6
        print(f'   {label}: {size_mb:.1f} MB')

## Cell 8: Start Training 🚀

In [None]:
# Checkpoints also save to Drive!
!mkdir -p data/checkpoints
!python train.py --config configs/debug_t4.yaml --checkpoint_dir data/checkpoints

## Cell 9: Resume Training

In [None]:
# Uncomment the command below to resume training from a saved checkpoint.
# It passes data/checkpoints/best_model.pt to --resume; presumably train.py
# writes that file during Cell 8 - confirm it exists before running.
# !python train.py --config configs/debug_t4.yaml --resume data/checkpoints/best_model.pt

## Cell 10: Your checkpoints are on Google Drive!

In [None]:
print('üìÅ Checkpoints saved to Google Drive:')
print('   /MyDrive/geom_diffusion_data/checkpoints/')
!ls -la data/checkpoints/