# Molecular Property Prediction - Quick Start

This notebook demonstrates how to:
1. Load and explore the BBBP dataset
2. Build a neural network model and (optionally) train it
3. Evaluate the model (via `scripts/evaluate.py`; see "Next Steps")
4. Prepare new molecules for prediction

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

# Add parent directory to path so the `src` package below can be imported
sys.path.insert(0, os.path.abspath('..'))

from src.data import MoleculeDataLoader, MoleculePreprocessor, MoleculeDataset, create_dataloaders
from src.models import create_model
from src.training import Trainer
from src.evaluation import Evaluator, Visualizer
from src.utils import load_config

## 1. Load Configuration

In [None]:
# Read the project settings; the path is relative to the notebook's working directory.
CONFIG_PATH = '../config/config.yaml'
config = load_config(CONFIG_PATH)
print("Configuration loaded successfully")

## 2. Download and Explore Dataset

In [None]:
# Build the loader from the project config and fetch the BBBP table.
data_loader = MoleculeDataLoader(config)
df = data_loader.load_bbbp_dataset()

# One print call; each piece lands on its own line, blank lines preserved.
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)

# Keep the preview as the cell's last expression so it renders as a rich table.
df.head()

In [None]:
# Class balance of the `p_np` label column.
# NOTE(review): the positive/negative wording assumes p_np is a 0/1 label - confirm.
bbb_labels = df['p_np']
n_molecules = len(df)
n_positive = bbb_labels.sum()
n_negative = (1 - bbb_labels).sum()
positive_ratio = bbb_labels.mean()

print("Dataset Statistics:")
print(f"Total molecules: {n_molecules}")
print(f"Penetrates BBB (positive): {n_positive}")
print(f"Does not penetrate BBB (negative): {n_negative}")
print(f"Positive ratio: {positive_ratio:.2%}")

## 3. Preprocess Data

In [None]:
# Partition the full table into train / validation / test frames.
train_df, val_df, test_df = data_loader.split_data(df)

preprocessor = MoleculePreprocessor(config)

print("Extracting molecular features...")
# Order matters: the training split goes first and is the only one processed
# with fit_scaler=True; validation and test are processed with fit_scaler=False.
(X_train, y_train, _), (X_val, y_val, _), (X_test, y_test, _) = [
    preprocessor.process_dataframe(split_df, fit_scaler=fit)
    for split_df, fit in ((train_df, True), (val_df, False), (test_df, False))
]

print(f"\nFeature shape: {X_train.shape}")
print(
    f"Training samples: {len(X_train)}",
    f"Validation samples: {len(X_val)}",
    f"Test samples: {len(X_test)}",
    sep="\n",
)

## 4. Create Data Loaders and Model

In [None]:
# Create PyTorch datasets, one per split, from the feature/label arrays.
train_dataset = MoleculeDataset(X_train, y_train)
val_dataset = MoleculeDataset(X_val, y_val)
test_dataset = MoleculeDataset(X_test, y_test)

# Build the train/validation/test loaders from the datasets and the config.
# `create_dataloaders` is imported with the other `src.data` names in the
# imports cell, so every dependency of the notebook is visible in one place.
train_loader, val_loader, test_loader = create_dataloaders(
    train_dataset, val_dataset, test_dataset, config
)

# Create the model, sized to the preprocessor's feature dimension.
input_size = preprocessor.get_feature_dim()
model = create_model(config, input_size)

print(model)
print(f"\nTotal parameters: {model.count_parameters()['total']:,}")

## 5. Train Model (Optional - can be slow in notebook)

For a better training experience, use the command-line script:
```bash
python scripts/train.py
```

In [None]:
# In-notebook training is off by default because it can be slow.
# Set the flag to True to train here instead of running scripts/train.py.
TRAIN_IN_NOTEBOOK = False

if TRAIN_IN_NOTEBOOK:
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        config=config,
        device=config['training']['device'],
    )

    history = trainer.train()
else:
    print("Skipping in-notebook training (set TRAIN_IN_NOTEBOOK = True to enable).")

## 6. Make Predictions

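Example of how to prepare new molecules for prediction: each SMILES string is validated and converted to features. The prediction call itself is left as a placeholder until a model has been trained.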

In [None]:
# Example SMILES strings
example_smiles = [
    "CC(C)Cc1ccc(cc1)C(C)C(O)=O",  # Ibuprofen
    "CC(=O)Oc1ccccc1C(=O)O",        # Aspirin
    "CN1C=NC2=C1C(=O)N(C(=O)N2C)C" # Caffeine
]

print("Making predictions for example molecules:")
for smiles in example_smiles:
    # Guard 1: reject strings the preprocessor does not accept as valid SMILES.
    if not preprocessor.validate_smiles(smiles):
        print(f"Invalid SMILES: {smiles}")
        continue

    # Guard 2: a valid SMILES can still yield no features; report it instead
    # of silently dropping the molecule from the output.
    features = preprocessor.extract_features(smiles)
    if features is None:
        print(f"\nCould not extract features for: {smiles}")
        continue

    mol_info = preprocessor.get_molecule_info(smiles)
    print(f"\nMolecule: {mol_info['molecular_formula']}")
    print(f"SMILES: {smiles}")
    # Prediction would go here after training

## Next Steps

1. **Train the model**: Use `python scripts/train.py`
2. **Evaluate**: Use `python scripts/evaluate.py`
3. **Make predictions**: Use `python scripts/predict.py --smiles "YOUR_SMILES"`
4. **Explore results**: Check the `results/` directory for visualizations