# Generate CE50 Predictions for Human Dataset
## Part of CE50 Integration into PKSmart

This notebook generates CE50 predictions for all human compounds in the Human_PK_data.csv file.

**Outputs:**
- CE50 (collision energy in eV)
- pCE50 = -log10(CE50)
- Confidence score (0-6 scale; applicability-domain scoring is not yet implemented, see Section 5)

**Author:** Generated with Claude Code
**Date:** 2026-01-07

In [None]:
import pandas as pd
import numpy as np
import sys
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import AllChem
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket filter also hides any warnings emitted while the
# pickled models are loaded further down — consider narrowing it.
warnings.filterwarnings('ignore')

# Make the CE50_prediction directory importable. The path is relative, so it
# resolves against the current working directory printed below.
sys.path.append('./CE50_prediction')

print("Imports successful!")
current_dir = os.getcwd()
print(f"Working directory: {current_dir}")

## 1. Load Human PK Data

In [None]:
# Read the human PK table from disk and report its size and schema.
df_human = pd.read_csv('data/Human_PK_data.csv')

n_rows_total = len(df_human)
column_names = df_human.columns.tolist()

print(f"Total rows in Human_PK_data.csv: {n_rows_total}")
print(f"\nColumns: {column_names}")
print("\nFirst few rows:")
# Last expression of the cell: the notebook renders it as a preview table.
df_human.head()

In [None]:
# Keep every row that has at least one of the human PK endpoints measured.
pk_endpoints = [
    'human_VDss_L_kg',
    'human_CL_mL_min_kg',
    'human_fup',
    'human_mrt',
    'human_thalf',
]
has_any_endpoint = df_human[pk_endpoints].notna().any(axis=1)
human_data = df_human[has_any_endpoint].copy()

print(f"Number of compounds with human PK data: {len(human_data)}")
print("\nHuman PK endpoints coverage:")
for endpoint in pk_endpoints:
    n_measured = human_data[endpoint].notna().sum()
    print(f"  {endpoint}: {n_measured} compounds")

# Sanity-check the SMILES column that the fingerprints are built from.
smiles_column = human_data['smiles_r']
print(f"\nUnique SMILES: {smiles_column.nunique()}")
print(f"Missing SMILES: {smiles_column.isna().sum()}")

## 2. Load Pre-trained CE50 Models

In [None]:
# Locate the saved CE50 model files and pick the newest timestamp.
model_dir = 'CE50_prediction/models/'
model_files = [f for f in os.listdir(model_dir) if f.endswith('.pkl')]

# Model files are named "<model_type>_<timestamp>.pkl" (the loading cell builds
# its paths exactly this way). Strip the known model-type prefix instead of
# splitting on the last '_', so a timestamp that itself contains underscores
# (e.g. "20260107_153000") is kept intact and still matches a file on disk.
known_model_prefixes = ('rf_binary_', 'rf_count_', 'xgb_binary_', 'xgb_count_')

found_timestamps = set()
for fname in model_files:
    if 'metadata' in fname:
        continue
    for prefix in known_model_prefixes:
        if fname.startswith(prefix):
            found_timestamps.add(fname[len(prefix):-len('.pkl')])
            break

# Sorted so the printed list is deterministic (set order is arbitrary).
timestamps = sorted(found_timestamps)
print(f"Available model timestamps: {timestamps}")

if not timestamps:
    raise ValueError(f"No CE50 models found in {model_dir}")

# NOTE(review): "most recent" relies on the timestamps sorting chronologically
# as strings (true for zero-padded formats) — confirm against the training code.
timestamp = timestamps[-1]
print(f"\nUsing models with timestamp: {timestamp}")

In [None]:
# Load the ensemble members saved under the selected timestamp.
# NOTE(review): joblib.load unpickles arbitrary Python objects — only point
# model_dir at files from a trusted source.
models = {}
model_types = ['rf_binary', 'rf_count', 'xgb_binary', 'xgb_count']

for model_type in model_types:
    model_path = f"{model_dir}{model_type}_{timestamp}.pkl"
    if os.path.exists(model_path):
        models[model_type] = joblib.load(model_path)
        print(f"✓ Loaded {model_type}")
    else:
        # A missing member is tolerated: the ensemble runs on whatever loaded.
        print(f"✗ Missing {model_type}")

# Fail here with a clear message instead of crashing later, when the ensemble
# statistics are computed on an empty prediction set.
if not models:
    raise FileNotFoundError(
        f"None of the expected model files {model_types} were found in "
        f"{model_dir} for timestamp {timestamp}"
    )

# The applicability domain is optional; downstream code checks for None.
ad_path = f"{model_dir}applicability_domain_{timestamp}.pkl"
if os.path.exists(ad_path):
    applicability_domain = joblib.load(ad_path)
    print("✓ Loaded applicability domain")
else:
    applicability_domain = None
    print("⚠ Applicability domain not found, confidence scores will be approximate")

print(f"\nTotal models loaded: {len(models)}")

## 3. Generate Dual Fingerprints for Human Compounds

In [None]:
def generate_binary_fingerprint(smiles, radius=2, n_bits=2048):
    """Return the binary Morgan fingerprint of *smiles* as a NumPy array.

    The SMILES string is parsed with RDKit and fingerprinted with
    ``AllChem.GetMorganFingerprintAsBitVect`` using the given ``radius`` and
    ``nBits=n_bits``. Returns ``None`` when RDKit cannot parse the SMILES.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(parsed, radius, nBits=n_bits)
        return np.array(bit_vector)
    return None

def generate_count_fingerprint(smiles, radius=2, n_bits=2048):
    """Return the count-based Morgan fingerprint of *smiles* as an int32 array.

    The SMILES string is parsed with RDKit and fingerprinted with
    ``AllChem.GetHashedMorganFingerprint``; its non-zero elements are copied
    into a dense zero-initialised array of length ``n_bits``. Returns ``None``
    when RDKit cannot parse the SMILES.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None

    hashed_fp = AllChem.GetHashedMorganFingerprint(parsed, radius, nBits=n_bits)
    counts = np.zeros((n_bits,), dtype=np.int32)
    # Scatter the sparse {position: count} mapping into the dense vector.
    for position, occurrences in hashed_fp.GetNonzeroElements().items():
        counts[position] = occurrences
    return counts

print("Fingerprint generation functions defined.")

In [None]:
# Build both fingerprint matrices for every human compound with a usable SMILES.
print("Generating fingerprints...")

binary_fps = []
count_fps = []
valid_indices = []   # enumerate positions of rows that were fingerprinted
invalid_smiles = []  # (position, smiles, reason) for rows that were skipped

for idx, smiles in enumerate(human_data['smiles_r'].values):
    if pd.isna(smiles):
        invalid_smiles.append((idx, smiles, "Missing SMILES"))
        continue

    binary_fp = generate_binary_fingerprint(smiles)
    count_fp = generate_count_fingerprint(smiles)

    # Both helpers return None when the SMILES cannot be parsed.
    if binary_fp is None or count_fp is None:
        invalid_smiles.append((idx, smiles, "Invalid SMILES"))
        continue

    binary_fps.append(binary_fp)
    count_fps.append(count_fp)
    valid_indices.append(idx)

# Stack the per-compound vectors into 2-D feature matrices (one row each).
X_binary = np.array(binary_fps)
X_count = np.array(count_fps)

print(f"\n✓ Generated fingerprints for {len(valid_indices)} compounds")
print(f"✗ Failed for {len(invalid_smiles)} compounds")
print("\nFingerprint shapes:")
print(f"  Binary: {X_binary.shape}")
print(f"  Count: {X_count.shape}")

if invalid_smiles:
    print("\nInvalid SMILES examples:")
    for _, bad_smiles, reason in invalid_smiles[:5]:
        print(f"  {reason}: {bad_smiles}")

# Stop here with a clear message: with no rows the feature matrices are empty
# and the model predict() calls that follow cannot run.
if not valid_indices:
    raise ValueError("No compound has a valid SMILES; cannot generate CE50 predictions")

## 4. Make CE50 Predictions with Ensemble

In [None]:
# Run every loaded ensemble member on the fingerprint type it was trained on:
# "*_binary" models get the binary matrix, "*_count" models the count matrix.
model_inputs = {
    'rf_binary': ('RF Binary', X_binary),
    'rf_count': ('RF Count', X_count),
    'xgb_binary': ('XGB Binary', X_binary),
    'xgb_count': ('XGB Count', X_count),
}

predictions = {}
for model_key, (label, features) in model_inputs.items():
    # Members that failed to load are skipped; the ensemble uses the rest.
    if model_key not in models:
        continue
    predictions[model_key] = models[model_key].predict(features)
    print(f"✓ {label} predictions: {len(predictions[model_key])}")

print(f"\nTotal models used: {len(predictions)}")

# Without at least one prediction vector there is nothing to average.
if not predictions:
    raise RuntimeError("No CE50 model is loaded; cannot make predictions")

In [None]:
# Combine the per-model predictions into ensemble statistics.
if not predictions:
    # An empty dict would otherwise surface as an opaque axis error below.
    raise ValueError("No model predictions available; cannot compute ensemble statistics")

# One column per model -> shape (n_compounds, n_models)
all_preds = np.array(list(predictions.values())).T

# Mean prediction across the ensemble
pce50_ensemble_mean = all_preds.mean(axis=1)

# Standard deviation across models (measure of ensemble disagreement)
pce50_ensemble_std = all_preds.std(axis=1)

# The final prediction is the plain ensemble mean. Per-compound model
# selection via the applicability domain is a possible later enhancement.
pce50_predicted = pce50_ensemble_mean

print("Ensemble predictions generated:")
print(f"  Mean pCE50: {pce50_predicted.mean():.3f} ± {pce50_predicted.std():.3f}")
print(f"  Range: [{pce50_predicted.min():.3f}, {pce50_predicted.max():.3f}]")
print(f"  Ensemble disagreement (std): {pce50_ensemble_std.mean():.3f}")

## 5. Calculate Confidence Scores

In [None]:
# Confidence scores on a 0-6 scale (6 = highest), derived from ensemble
# agreement: the lower the spread (std) of the per-model pCE50 predictions,
# the higher the score.
#
# Applicability-domain (AD) scoring needs the full ApplicabilityDomain class
# and is not implemented yet. Until it is, the ensemble-agreement score is used
# whether or not an AD object was loaded, instead of a constant placeholder
# that would put every compound in the same category.
if applicability_domain is not None:
    print("Applicability domain loaded, but AD-based scoring is not implemented yet (TODO); "
          "using ensemble agreement for confidence scores")

# Min-max normalise the std to [0, 1]; the epsilon avoids 0/0 when all stds
# are equal.
# NOTE: the score is relative to this dataset (the compound with the largest
# disagreement always scores 0). With a single model every std is 0, so every
# compound scores 6.
std_min = pce50_ensemble_std.min()
std_max = pce50_ensemble_std.max()
std_normalized = (pce50_ensemble_std - std_min) / (std_max - std_min + 1e-10)

# Invert so that low disagreement maps to high confidence.
confidence_scores = 6 * (1 - std_normalized)

print("\nConfidence scores (0-6 scale, based on ensemble agreement):")
print(f"  Mean: {confidence_scores.mean():.2f}")
print(f"  Range: [{confidence_scores.min():.2f}, {confidence_scores.max():.2f}]")

# Categorize confidence: >= 5 High, >= 3 Medium, otherwise Low
confidence_categories = np.where(confidence_scores >= 5, 'High',
                        np.where(confidence_scores >= 3, 'Medium', 'Low'))

print("\nConfidence distribution:")
for cat in ['High', 'Medium', 'Low']:
    count = (confidence_categories == cat).sum()
    pct = 100 * count / len(confidence_categories)
    print(f"  {cat}: {count} ({pct:.1f}%)")

## 6. Convert pCE50 to CE50

In [None]:
# Invert the pCE50 definition to recover CE50 in eV:
#   pCE50 = -log10(CE50)  =>  CE50 = 10^(-pCE50)
ce50_predicted = np.power(10.0, -pce50_predicted)

ce50_mean = ce50_predicted.mean()
ce50_std = ce50_predicted.std()
ce50_min = ce50_predicted.min()
ce50_max = ce50_predicted.max()
ce50_median = np.median(ce50_predicted)

print("CE50 predictions (eV):")
print(f"  Mean: {ce50_mean:.2f} eV")
print(f"  Std: {ce50_std:.2f} eV")
print(f"  Range: [{ce50_min:.2f}, {ce50_max:.2f}] eV")
print(f"  Median: {ce50_median:.2f} eV")

# Share of predictions inside the 10-50 eV window that this notebook treats as
# the typical range for small molecules.
within_window = (ce50_predicted >= 10) & (ce50_predicted <= 50)
in_expected_range = within_window.sum()
n_predictions = len(ce50_predicted)
pct_in_range = 100*in_expected_range/n_predictions
print(f"\n  Predictions in expected range (10-50 eV): {in_expected_range} / {n_predictions} ({pct_in_range:.1f}%)")

## 7. Create Results DataFrame

In [None]:
# Keep only the rows that were fingerprinted; valid_indices holds positional
# indices, hence .iloc.
human_data_valid = human_data.iloc[valid_indices].copy()

# Ensemble outputs first, then one "<model>_pce50" column per ensemble member
# for transparency.
new_columns = {
    'ce50': ce50_predicted,
    'pce50': pce50_predicted,
    'confidence': confidence_scores,
    'confidence_category': confidence_categories,
    'ensemble_std': pce50_ensemble_std,
}
new_columns.update(
    {f'{member}_pce50': member_preds for member, member_preds in predictions.items()}
)
for column_name, column_values in new_columns.items():
    human_data_valid[column_name] = column_values

print(f"Results DataFrame created with {len(human_data_valid)} compounds")
print("\nColumns added: ce50, pce50, confidence, confidence_category, ensemble_std, + individual model predictions")
print("\nFirst few predictions:")

# Show the compound name first when the dataset provides one.
display_cols = ['smiles_r', 'ce50', 'pce50', 'confidence', 'confidence_category']
if 'NAME' in human_data_valid.columns:
    display_cols = ['NAME'] + display_cols
human_data_valid[display_cols].head(10)

## 8. Save Results

In [None]:
# Write the full results table (all original columns plus the predictions).
output_file = 'data/human_ce50_predictions.csv'
human_data_valid.to_csv(output_file, index=False)
print(f"✓ Saved results to: {output_file}")
print(f"  Total rows: {len(human_data_valid)}")
print(f"  Total columns: {len(human_data_valid.columns)}")

# Also write a slim version: SMILES, the human PK endpoints and the ensemble
# CE50 outputs only.
simple_output_file = 'data/human_ce50_predictions_simple.csv'
essential_cols = ['smiles_r', 'human_VDss_L_kg', 'human_CL_mL_min_kg',
                  'human_fup', 'human_mrt', 'human_thalf',
                  'ce50', 'pce50', 'confidence', 'confidence_category', 'ensemble_std']

# Only include the NAME column if it exists (placed right after the SMILES).
if 'NAME' in human_data_valid.columns:
    essential_cols.insert(1, 'NAME')

human_ce50_simple = human_data_valid[essential_cols].copy()
human_ce50_simple.to_csv(simple_output_file, index=False)
print(f"\n✓ Saved simplified version to: {simple_output_file}")

## 9. Visualizations

In [None]:
# Build a 2x3 grid of diagnostic panels for the CE50 predictions.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('CE50 Predictions for Human Dataset', fontsize=16, fontweight='bold')
(ax_ce50, ax_pce50, ax_conf), (ax_std, ax_models, ax_agree) = axes

# Keyword sets shared by several panels.
hist_kw = dict(bins=30, edgecolor='black', alpha=0.7)
vline_kw = dict(linestyle='--', linewidth=2)

# Panel 1: CE50 histogram with mean and median markers
ce50_mean_value = ce50_predicted.mean()
ce50_median_value = np.median(ce50_predicted)
ax_ce50.hist(ce50_predicted, color='skyblue', **hist_kw)
ax_ce50.axvline(ce50_mean_value, color='red', label=f'Mean: {ce50_mean_value:.2f} eV', **vline_kw)
ax_ce50.axvline(ce50_median_value, color='orange', label=f'Median: {ce50_median_value:.2f} eV', **vline_kw)
ax_ce50.set_xlabel('CE50 (eV)', fontweight='bold')
ax_ce50.set_ylabel('Frequency', fontweight='bold')
ax_ce50.set_title('CE50 Distribution')
ax_ce50.legend()
ax_ce50.grid(alpha=0.3)

# Panel 2: pCE50 histogram with mean marker
pce50_mean_value = pce50_predicted.mean()
ax_pce50.hist(pce50_predicted, color='lightgreen', **hist_kw)
ax_pce50.axvline(pce50_mean_value, color='red', label=f'Mean: {pce50_mean_value:.3f}', **vline_kw)
ax_pce50.set_xlabel('pCE50 (-log10[CE50])', fontweight='bold')
ax_pce50.set_ylabel('Frequency', fontweight='bold')
ax_pce50.set_title('pCE50 Distribution')
ax_pce50.legend()
ax_pce50.grid(alpha=0.3)

# Panel 3: number of compounds per confidence category, annotated with
# count and percentage above each bar
confidence_counts = pd.Series(confidence_categories).value_counts()
category_palette = {'High': 'green', 'Medium': 'orange', 'Low': 'red'}
bar_colors = [category_palette[category] for category in confidence_counts.index]
bars = ax_conf.bar(confidence_counts.index, confidence_counts.values,
                   color=bar_colors, edgecolor='black', alpha=0.7)
ax_conf.set_xlabel('Confidence Category', fontweight='bold')
ax_conf.set_ylabel('Number of Compounds', fontweight='bold')
ax_conf.set_title('Confidence Distribution')
n_categorised = len(confidence_categories)
for rect in bars:
    bar_height = rect.get_height()
    bar_centre = rect.get_x() + rect.get_width()/2.
    ax_conf.text(bar_centre, bar_height,
                 f'{int(bar_height)}\n({100*bar_height/n_categorised:.1f}%)',
                 ha='center', va='bottom', fontweight='bold')
ax_conf.grid(alpha=0.3, axis='y')

# Panel 4: histogram of the per-compound ensemble std (disagreement)
std_mean_value = pce50_ensemble_std.mean()
ax_std.hist(pce50_ensemble_std, color='coral', **hist_kw)
ax_std.axvline(std_mean_value, color='red', label=f'Mean: {std_mean_value:.3f}', **vline_kw)
ax_std.set_xlabel('Ensemble Std (pCE50 units)', fontweight='bold')
ax_std.set_ylabel('Frequency', fontweight='bold')
ax_std.set_title('Ensemble Model Agreement')
ax_std.legend()
ax_std.grid(alpha=0.3)

# Panel 5: one violin per ensemble member
member_names = list(predictions.keys())
member_positions = range(len(predictions))
model_data = [predictions[name] for name in member_names]
parts = ax_models.violinplot(model_data, positions=member_positions,
                             showmeans=True, showmedians=True)
ax_models.set_xticks(member_positions)
ax_models.set_xticklabels(member_names, rotation=45, ha='right')
ax_models.set_ylabel('pCE50 (-log10[CE50])', fontweight='bold')
ax_models.set_title('Model Predictions Comparison')
ax_models.grid(alpha=0.3, axis='y')

# Panel 6: confidence score against ensemble std, coloured by confidence
scatter = ax_agree.scatter(pce50_ensemble_std, confidence_scores,
                           c=confidence_scores, cmap='RdYlGn', s=50, alpha=0.6, edgecolor='black')
ax_agree.set_xlabel('Ensemble Std (pCE50)', fontweight='bold')
ax_agree.set_ylabel('Confidence Score (0-6)', fontweight='bold')
ax_agree.set_title('Confidence vs Ensemble Agreement')
plt.colorbar(scatter, ax=ax_agree, label='Confidence')
ax_agree.grid(alpha=0.3)

# Finalise the layout, write the figure to the working directory, then show it.
plt.tight_layout()
diagnostics_file = 'human_ce50_predictions_diagnostics.png'
plt.savefig(diagnostics_file, dpi=300, bbox_inches='tight')
print(f"\n✓ Saved visualization: {diagnostics_file}")
plt.show()

## 10. Summary Statistics

In [None]:
# Final console report: dataset sizes, prediction statistics, confidence
# breakdown, ensemble agreement and the files written by this notebook.
print("="*80)
print("CE50 PREDICTION SUMMARY FOR HUMAN DATASET")
print("="*80)

print("\n📊 DATASET STATISTICS:")
print(f"  Total compounds in Human_PK_data.csv: {len(df_human)}")
print(f"  Compounds with human PK data: {len(human_data)}")
print(f"  Successful CE50 predictions: {len(human_data_valid)}")
print(f"  Failed predictions: {len(invalid_smiles)}")

print("\n🎯 CE50 PREDICTIONS:")
print(f"  Mean CE50: {ce50_predicted.mean():.2f} ± {ce50_predicted.std():.2f} eV")
print(f"  Median CE50: {np.median(ce50_predicted):.2f} eV")
print(f"  Range: [{ce50_predicted.min():.2f}, {ce50_predicted.max():.2f}] eV")
print(f"  25th percentile: {np.percentile(ce50_predicted, 25):.2f} eV")
print(f"  75th percentile: {np.percentile(ce50_predicted, 75):.2f} eV")

print("\n📈 pCE50 PREDICTIONS:")
print(f"  Mean pCE50: {pce50_predicted.mean():.3f} ± {pce50_predicted.std():.3f}")
print(f"  Range: [{pce50_predicted.min():.3f}, {pce50_predicted.max():.3f}]")

print("\n🎓 CONFIDENCE ASSESSMENT:")
for cat in ['High', 'Medium', 'Low']:
    in_category = confidence_categories == cat
    count = in_category.sum()
    pct = 100 * count / len(confidence_categories)
    # An empty category has no mean; report "n/a" instead of printing nan.
    avg_conf = f"{confidence_scores[in_category].mean():.2f}" if count else "n/a"
    print(f"  {cat:8s}: {count:3d} compounds ({pct:5.1f}%) - Avg score: {avg_conf}")

print("\n🤖 ENSEMBLE STATISTICS:")
print(f"  Models used: {len(predictions)}")
print(f"  Model types: {list(predictions.keys())}")
print(f"  Ensemble disagreement (std): {pce50_ensemble_std.mean():.3f} ± {pce50_ensemble_std.std():.3f}")
print(f"  Max disagreement: {pce50_ensemble_std.max():.3f}")
print(f"  Min disagreement: {pce50_ensemble_std.min():.3f}")

print("\n💾 OUTPUT FILES GENERATED:")
print("  1. data/human_ce50_predictions.csv (full dataset)")
print("  2. data/human_ce50_predictions_simple.csv (essential columns only)")
print("  3. human_ce50_predictions_diagnostics.png (visualizations)")

print("\n✅ CE50 prediction for human dataset COMPLETE!")
print("\nNext step: Create 03_Predict_human_data_with_CE50.ipynb to retrain human models with these CE50 features")
print("="*80)