# 🧬 VariantProject v2.0 - Molecular Exploration Tool
## Complete Setup for Google Colab / Jupyter Notebook

This notebook contains the complete code with proper installation commands for all dependencies.

## 📦 Step 1: Install Dependencies

**Important:** RDKit requires special installation in Colab. Run this cell first!

In [None]:
# Install RDKit in Google Colab
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("🔧 Installing in Google Colab environment...")
    !pip install -q condacolab
    import condacolab
    condacolab.install()
    !conda install -c conda-forge rdkit -y
else:
    print("🔧 Installing in Jupyter environment...")
    # For regular Jupyter, try pip first
    !pip install rdkit-pypi

# Install other dependencies
!pip install -q py3Dmol pandas numpy scikit-learn matplotlib seaborn
!pip install -q selfies tqdm ipywidgets

print("✅ Installation complete!")

## 🧪 Step 2: Verify Installation

In [None]:
# Verify that every dependency can be imported
import warnings
warnings.filterwarnings('ignore')

# RDKit is mandatory: report a failed import together with a hint
try:
    from rdkit import Chem
    from rdkit.Chem import Descriptors
    print("✅ RDKit imported successfully")

    # Smoke test: parse aspirin and print its molecular weight
    mol = Chem.MolFromSmiles('CC(=O)Oc1ccccc1C(=O)O')
    if mol:
        print(f"   Aspirin MW: {Descriptors.MolWt(mol):.2f}")
except ImportError as e:
    print(f"❌ RDKit import failed: {e}")
    print("   Try restarting the runtime after installation")

# Optional packages: only a missing module is tolerated. Catching ImportError
# (rather than a bare `except:`) lets any other failure surface normally.
try:
    import py3Dmol
    print("✅ py3Dmol imported")
except ImportError:
    print("⚠️ py3Dmol not available (optional for 3D visualization)")

try:
    import selfies
    print("✅ SELFIES imported")
except ImportError:
    print("⚠️ SELFIES not available (optional)")

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

print("\n🎉 Core dependencies ready!")

## 📝 Step 3: Load the Complete Code

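This cell loads the imports, optional-dependency flags, and configuration class for VariantProject v2.0. The remaining project classes are not included in this notebook.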

In [None]:
# =========================================
# VariantProject v2.0 - Complete Code
# =========================================

from rdkit import Chem, RDLogger
from rdkit.Chem import Descriptors, Lipinski, Crippen, Draw, AllChem, inchi
from rdkit.Chem import QED
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem import rdMolDescriptors, rdFingerprintGenerator
from rdkit.Chem import FilterCatalog
from rdkit.Chem.FilterCatalog import FilterCatalogParams

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from IPython.display import display, clear_output, HTML
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Optional imports (may not be available).
# Each section sets a *_AVAILABLE flag instead of failing. Only ImportError is
# caught, so unrelated errors raised while importing are not silently hidden.
try:
    import py3Dmol
    PY3DMOL_AVAILABLE = True
except ImportError:
    PY3DMOL_AVAILABLE = False
    print("Note: py3Dmol not available, 3D visualization disabled")

try:
    import selfies as sf
    SELFIES_AVAILABLE = True
except ImportError:
    SELFIES_AVAILABLE = False
    print("Note: SELFIES not available, using SMILES only")

try:
    from tqdm import tqdm
    TQDM_AVAILABLE = True
except ImportError:
    TQDM_AVAILABLE = False

    def tqdm(iterable, desc="", **kwargs):
        """Minimal stand-in for ``tqdm.tqdm`` used when tqdm is not installed.

        Prints ``desc`` once and returns ``iterable`` unchanged. Any extra
        keyword arguments (e.g. ``total=``) are accepted and ignored, so call
        sites written for the real tqdm keep working.
        """
        print(f"{desc}...")
        return iterable

# Keep cell output quiet: switch off RDKit's 'rdApp.*' log channels and
# ignore Python warnings for the rest of the session.
RDLogger.DisableLog('rdApp.*')
warnings.filterwarnings('ignore')

# NOTE: Only the configuration class is included in this notebook;
# the remaining VariantProject classes are omitted for brevity.

class MolecularExplorerConfig:
    """Tunable settings for VariantProject, held as class-level constants."""

    # --- Variant generation ---
    N_VARIANTS = 30
    MAX_ATTEMPTS_MULTIPLIER = 20
    SIMILARITY_THRESHOLD = 0.5
    # Presumably atomic numbers (C, N, O, F, P, S, Cl, Br, I) — confirm with the consumer
    ALLOWED_ATOMS = [6, 7, 8, 9, 15, 16, 17, 35, 53]

    # --- Fingerprints ---
    FP_RADIUS = 2
    FP_NBITS = 2048

    # --- Scoring weights ---
    WEIGHT_SOLUBILITY = 0.3
    WEIGHT_TOXICITY = 0.3
    WEIGHT_FEASIBILITY = 0.4

    # --- 3D optimisation and viewer ---
    MMFF_MAX_ITERS = 500
    VIEWER_WIDTH = 700
    VIEWER_HEIGHT = 500

    # --- Filters and de-duplication ---
    ENABLE_PAINS_FILTER = True
    ENABLE_BRENK_FILTER = True
    USE_INCHI_DEDUPLICATION = True

    # --- Models ---
    USE_REAL_MODELS = False  # Kept False because the ESOL dataset is not available here


# Shared configuration instance for the rest of the notebook
config = MolecularExplorerConfig()

print("✅ VariantProject v2.0 loaded successfully!")
print("   Ready for molecular exploration.")

## 🚀 Step 4: Quick Test with Example Molecules

In [None]:
# Simple test function
def test_molecular_exploration():
    """Run a descriptor smoke test over three reference drugs.

    For aspirin, caffeine and ibuprofen this prints MW, LogP, TPSA, the
    H-bond donor/acceptor counts and the number of Lipinski violations, shows
    the collected values as a table, and draws two plots (MW vs LogP scatter
    and a bar chart of violations).

    Returns:
        pandas.DataFrame: one row per parsed molecule with the columns
        Name, SMILES, MW, LogP, TPSA and RO5_Violations.
    """
    print("🧬 Testing Molecular Exploration")
    print("="*50)

    # Reference compounds keyed by display name
    test_molecules = {
        'Aspirin': 'CC(=O)Oc1ccccc1C(=O)O',
        'Caffeine': 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',
        'Ibuprofen': 'CC(C)Cc1ccc(cc1)C(C)C(=O)O'
    }

    rows = []
    for name, smiles in test_molecules.items():
        print(f"\n📊 Analyzing {name}: {smiles}")

        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            # An unparsable SMILES contributes no row
            continue

        mw = Descriptors.MolWt(mol)
        logp = Descriptors.MolLogP(mol)
        tpsa = Descriptors.TPSA(mol)
        hbd = Descriptors.NumHDonors(mol)
        hba = Descriptors.NumHAcceptors(mol)

        # Lipinski's Rule of 5: count how many of the four limits are exceeded
        limits_exceeded = (mw > 500, logp > 5, hbd > 5, hba > 10)
        ro5_violations = sum(limits_exceeded)

        report_lines = (
            f"   MW: {mw:.2f} Da",
            f"   LogP: {logp:.2f}",
            f"   TPSA: {tpsa:.2f} Ų",
            f"   HBD/HBA: {hbd}/{hba}",
            f"   Lipinski violations: {ro5_violations}",
        )
        for line in report_lines:
            print(line)

        rows.append({
            'Name': name,
            'SMILES': smiles,
            'MW': mw,
            'LogP': logp,
            'TPSA': tpsa,
            'RO5_Violations': ro5_violations
        })

    df = pd.DataFrame(rows)

    print("\n📋 Summary Table:")
    display(df)

    fig, (ax_props, ax_ro5) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: MW vs LogP, one labelled point per molecule
    ax_props.scatter(df['MW'], df['LogP'], s=100, alpha=0.6)
    for label, x, y in zip(df['Name'], df['MW'], df['LogP']):
        ax_props.annotate(label, (x, y))
    ax_props.set_xlabel('Molecular Weight (Da)')
    ax_props.set_ylabel('LogP')
    ax_props.set_title('Molecular Properties')
    ax_props.grid(True, alpha=0.3)

    # Right panel: Lipinski violations per molecule (y-axis fixed to 0..4)
    ax_ro5.bar(df['Name'], df['RO5_Violations'])
    ax_ro5.set_ylabel('Lipinski Violations')
    ax_ro5.set_title('Drug-likeness Assessment')
    ax_ro5.set_ylim(0, 4)

    plt.tight_layout()
    plt.show()

    print("\n✅ Test complete!")
    return df

# Run the test
test_results = test_molecular_exploration()

## 💊 Step 5: Generate Molecular Variants

This simplified version creates molecular variants for drug discovery.

In [None]:
def generate_simple_variants(smiles, n_variants=5):
    """Generate simple molecular variants by textual edits of a SMILES string.

    Each candidate is made by replacing the first occurrence of a token in
    ``smiles`` (halogen exchange, O/S and N/O swaps, aromatic c->n, aromatic
    methylation). A candidate is kept only if RDKit can parse it and it is not
    a duplicate of the parent or of an earlier variant, judged by canonical
    SMILES.

    Every edit is attempted until ``n_variants`` valid variants are collected.
    ``n_variants`` caps the number of results, not the number of edits tried.

    Args:
        smiles: SMILES string of the parent molecule.
        n_variants: Maximum number of variants to return.

    Returns:
        list[str]: Up to ``n_variants`` variant SMILES strings, in edit order.
        Empty if ``smiles`` is invalid (a message is printed) or if
        ``n_variants`` is not positive.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        print("Invalid SMILES")
        return []

    if n_variants <= 0:
        return []

    # (old token, replacement) pairs, applied to the first occurrence only
    modifications = [
        ('F', 'Cl'),    # Halogen exchange
        ('Cl', 'Br'),
        ('O', 'S'),     # Bioisosteric replacements
        ('N', 'O'),
        ('c', 'n'),     # Aromatic modifications
        ('c', 'c(C)'),  # Add a methyl group to an aromatic carbon
    ]

    # Canonical SMILES already accounted for; seeded with the parent molecule
    seen = {Chem.MolToSmiles(mol)}
    variants = []

    for old, new in modifications:
        if old not in smiles:
            continue

        new_smiles = smiles.replace(old, new, 1)
        new_mol = Chem.MolFromSmiles(new_smiles)
        if not new_mol:
            # The textual edit produced an invalid structure
            continue

        canonical = Chem.MolToSmiles(new_mol)
        if canonical in seen:
            continue
        seen.add(canonical)

        variants.append(new_smiles)
        if len(variants) >= n_variants:
            break

    return variants

# Demo: derive variants of aspirin and print the molecular weight of each
aspirin = 'CC(=O)Oc1ccccc1C(=O)O'
print(f"Original: {aspirin}")
print("\nVariants:")

variants = generate_simple_variants(aspirin, 5)
for i, v in enumerate(variants, 1):
    print(f"{i}. {v}")
    mol = Chem.MolFromSmiles(v)
    if not mol:
        # Nothing more to report for a variant that does not parse
        continue
    mw = Descriptors.MolWt(mol)
    print(f"   MW: {mw:.2f} Da")

## 📊 Step 6: Visualize Molecules

Draw 2D structures of molecules.

In [None]:
from rdkit.Chem import Draw
from IPython.display import Image

def visualize_molecules(smiles_list, labels=None, mols_per_row=3):
    """Render several molecules as a single grid image.

    Args:
        smiles_list: Sequence of SMILES strings. Entries RDKit cannot parse
            are skipped.
        labels: Optional sequence of legends, aligned by index with
            ``smiles_list``. When it is omitted, empty, or shorter than
            ``smiles_list``, the missing legends fall back to
            ``"MW: <molecular weight>"``.
        mols_per_row: Number of molecules per grid row.

    Returns:
        The image produced by ``Draw.MolsToGridImage``, or ``None`` when no
        SMILES could be parsed.
    """
    mols = []
    final_labels = []

    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if not mol:
            continue

        mols.append(mol)
        # Index by the original position so labels stay aligned when earlier
        # SMILES were skipped. Guard the index so a short label list does not
        # raise IndexError.
        if labels and i < len(labels):
            final_labels.append(str(labels[i]))
        else:
            final_labels.append(f"MW: {Descriptors.MolWt(mol):.1f}")

    if not mols:
        return None

    return Draw.MolsToGridImage(
        mols,
        molsPerRow=mols_per_row,
        subImgSize=(300, 300),
        legends=final_labels
    )

# Draw the three reference drugs in one grid, labelled by name
labels = ['Aspirin', 'Caffeine', 'Ibuprofen']

test_smiles = [
    'CC(=O)Oc1ccccc1C(=O)O',  # Aspirin
    'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',  # Caffeine
    'CC(C)Cc1ccc(cc1)C(C)C(=O)O'  # Ibuprofen
]

img = visualize_molecules(test_smiles, labels)
if not img:
    print("Could not generate visualization")
else:
    display(img)

## 🎯 Step 7: Interactive Molecular Explorer

Enter your own SMILES strings to explore!

In [None]:
def interactive_explorer():
    """Prompt for SMILES strings in a loop and report on each molecule.

    For every valid SMILES entered, prints seven RDKit descriptors and a
    Lipinski Rule-of-Five verdict, then optionally generates up to three
    variants with ``generate_simple_variants`` and shows them in a grid via
    ``visualize_molecules``.

    The loop ends when the user types 'quit' (case-insensitive), or when a
    prompt is interrupted or input is exhausted (``KeyboardInterrupt`` /
    ``EOFError``). Blank input is ignored. Invalid SMILES are reported and the
    prompt is shown again.

    Returns:
        None
    """
    print("🧬 INTERACTIVE MOLECULAR EXPLORER")
    print("="*50)
    print("\nExample SMILES strings:")
    print("  Aspirin: CC(=O)Oc1ccccc1C(=O)O")
    print("  Caffeine: CN1C=NC2=C1C(=O)N(C(=O)N2C)C")
    print("  Penicillin: CC1(C)SC2C(NC(=O)Cc3ccccc3)C(=O)N2C1C(=O)O")
    print("\nEnter 'quit' to exit\n")

    while True:
        try:
            smiles = input("Enter SMILES string: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input, or the user interrupted the prompt: exit cleanly
            print("\nGoodbye!")
            break

        if smiles.lower() == 'quit':
            print("Goodbye!")
            break

        if not smiles:
            continue

        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            print("❌ Invalid SMILES. Please try again.\n")
            continue

        print(f"\n✅ Valid molecule!")

        # Calculate properties
        props = {
            'Molecular Weight': Descriptors.MolWt(mol),
            'LogP': Descriptors.MolLogP(mol),
            'TPSA': Descriptors.TPSA(mol),
            'H-Bond Donors': Descriptors.NumHDonors(mol),
            'H-Bond Acceptors': Descriptors.NumHAcceptors(mol),
            'Rotatable Bonds': Descriptors.NumRotatableBonds(mol),
            'Aromatic Rings': Descriptors.NumAromaticRings(mol)
        }

        print("\n📊 Molecular Properties:")
        for prop, value in props.items():
            # Floats get two decimals; integer counts are printed as-is
            if isinstance(value, float):
                print(f"   {prop}: {value:.2f}")
            else:
                print(f"   {prop}: {value}")

        # Drug-likeness: count how many of the four Lipinski limits are exceeded
        ro5_violations = sum([
            props['Molecular Weight'] > 500,
            props['LogP'] > 5,
            props['H-Bond Donors'] > 5,
            props['H-Bond Acceptors'] > 10
        ])

        print(f"\n💊 Drug-likeness:")
        print(f"   Lipinski violations: {ro5_violations}/4")
        if ro5_violations == 0:
            print("   ✅ Passes Lipinski's Rule of Five")
        elif ro5_violations == 1:
            print("   ⚠️ One violation (may still be drug-like)")
        else:
            print("   ❌ Multiple violations (poor drug-likeness)")

        # Generate variants on request. The answer is stripped so that stray
        # whitespace such as "y " is still accepted.
        try:
            gen_variants = input("\nGenerate variants? (y/n): ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            print("\nGoodbye!")
            break

        if gen_variants in ('y', 'yes'):
            variants = generate_simple_variants(smiles, 3)
            if variants:
                print("\n🔬 Generated Variants:")
                for i, v in enumerate(variants, 1):
                    print(f"   {i}. {v}")

                # Visualize the parent followed by its variants
                all_mols = [smiles] + variants
                labels = ['Original'] + [f'Variant {i}' for i in range(1, len(variants)+1)]
                img = visualize_molecules(all_mols, labels, mols_per_row=2)
                if img:
                    display(img)
            else:
                print("   No variants generated")

        print("\n" + "="*50 + "\n")

# Uncomment to run interactive explorer
# interactive_explorer()

## 📈 Step 8: Batch Analysis

Analyze multiple molecules at once.

In [None]:
def batch_analysis(smiles_list, names=None):
    """Analyze multiple molecules and create a comprehensive report.

    Computes RDKit descriptors, the QED score and the number of Lipinski
    violations for every parsable SMILES, prints a summary table and summary
    statistics, and draws a 2x2 figure (MW histogram, LogP histogram, QED bar
    chart, MW/LogP scatter coloured by QED).

    Args:
        smiles_list: Sequence of SMILES strings. Entries RDKit cannot parse
            are skipped and reported in a warning line.
        names: Optional sequence of display names, aligned by index with
            ``smiles_list``. Missing entries fall back to ``Molecule_<n>``.

    Returns:
        pandas.DataFrame: One row per analyzed molecule, sorted by QED
        (descending), with the columns Name, SMILES, MW, LogP, TPSA, HBD, HBA,
        RotBonds, AromaticRings, QED and RO5_Violations. When no molecule can
        be analyzed, an empty frame with those columns is returned and no
        report or figure is produced.
    """
    columns = [
        'Name', 'SMILES', 'MW', 'LogP', 'TPSA', 'HBD', 'HBA',
        'RotBonds', 'AromaticRings', 'QED', 'RO5_Violations',
    ]

    results = []
    skipped = []

    for i, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            skipped.append(smiles)
            continue

        # Guard the index so a short `names` list does not raise IndexError
        name = names[i] if names and i < len(names) else f'Molecule_{i+1}'

        # Calculate comprehensive properties
        props = {
            'Name': name,
            'SMILES': smiles,
            'MW': Descriptors.MolWt(mol),
            'LogP': Descriptors.MolLogP(mol),
            'TPSA': Descriptors.TPSA(mol),
            'HBD': Descriptors.NumHDonors(mol),
            'HBA': Descriptors.NumHAcceptors(mol),
            'RotBonds': Descriptors.NumRotatableBonds(mol),
            'AromaticRings': Descriptors.NumAromaticRings(mol),
            'QED': QED.qed(mol)  # Drug-likeness score
        }

        # Lipinski violations: how many of the four limits are exceeded
        props['RO5_Violations'] = sum([
            props['MW'] > 500,
            props['LogP'] > 5,
            props['HBD'] > 5,
            props['HBA'] > 10
        ])

        results.append(props)

    if skipped:
        print(f"⚠️ Skipped {len(skipped)} invalid SMILES: {skipped}")

    # Without rows there is nothing to sort or plot. Return an empty frame with
    # the usual columns instead of failing on the missing 'QED' column.
    if not results:
        print("No valid molecules to analyze.")
        return pd.DataFrame(columns=columns)

    # Create DataFrame, best QED first
    df = pd.DataFrame(results, columns=columns)
    df = df.sort_values('QED', ascending=False)

    print("📊 MOLECULAR ANALYSIS REPORT")
    print("="*60)
    print(f"\nAnalyzed {len(df)} molecules\n")

    # Display summary
    display(df[['Name', 'MW', 'LogP', 'QED', 'RO5_Violations']].round(2))

    # Create visualizations
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # 1. MW distribution
    axes[0, 0].hist(df['MW'], bins=20, edgecolor='black', alpha=0.7)
    axes[0, 0].axvline(500, color='r', linestyle='--', label='Lipinski limit')
    axes[0, 0].set_xlabel('Molecular Weight (Da)')
    axes[0, 0].set_ylabel('Count')
    axes[0, 0].set_title('Molecular Weight Distribution')
    axes[0, 0].legend()

    # 2. LogP distribution
    axes[0, 1].hist(df['LogP'], bins=20, edgecolor='black', alpha=0.7)
    axes[0, 1].axvline(5, color='r', linestyle='--', label='Lipinski limit')
    axes[0, 1].set_xlabel('LogP')
    axes[0, 1].set_ylabel('Count')
    axes[0, 1].set_title('Lipophilicity Distribution')
    axes[0, 1].legend()

    # 3. QED scores
    axes[1, 0].bar(df['Name'], df['QED'], color='skyblue', edgecolor='black')
    axes[1, 0].set_xlabel('Molecule')
    axes[1, 0].set_ylabel('QED Score')
    axes[1, 0].set_title('Drug-likeness (QED) Scores')
    # Restyle the existing tick labels rather than overwriting them with
    # set_xticklabels, which depends on the current tick positions.
    plt.setp(axes[1, 0].get_xticklabels(), rotation=45, ha='right')
    axes[1, 0].axhline(0.5, color='orange', linestyle='--', alpha=0.5)

    # 4. Property correlation
    scatter = axes[1, 1].scatter(df['MW'], df['LogP'],
                                 c=df['QED'], s=100,
                                 cmap='viridis', alpha=0.6,
                                 edgecolors='black')
    axes[1, 1].set_xlabel('Molecular Weight (Da)')
    axes[1, 1].set_ylabel('LogP')
    axes[1, 1].set_title('Property Space')
    plt.colorbar(scatter, ax=axes[1, 1], label='QED')

    # Add molecule names next to their points
    for label, x, y in zip(df['Name'], df['MW'], df['LogP']):
        axes[1, 1].annotate(label, (x, y), fontsize=8, alpha=0.7)

    plt.tight_layout()
    plt.show()

    # Summary statistics
    print("\n📈 Summary Statistics:")
    print(f"   Average MW: {df['MW'].mean():.2f} ± {df['MW'].std():.2f}")
    print(f"   Average LogP: {df['LogP'].mean():.2f} ± {df['LogP'].std():.2f}")
    print(f"   Average QED: {df['QED'].mean():.3f}")
    print(f"   Molecules passing RO5: {(df['RO5_Violations'] == 0).sum()}/{len(df)}")

    return df

# Example batch analysis: five well-known drugs, names aligned by position
drug_names = ['Aspirin', 'Caffeine', 'Ibuprofen', 'Albuterol', 'Nicotine']

drug_molecules = [
    'CC(=O)Oc1ccccc1C(=O)O',  # Aspirin
    'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',  # Caffeine
    'CC(C)Cc1ccc(cc1)C(C)C(=O)O',  # Ibuprofen
    'CC(C)(C)NCC(O)c1ccc(O)c(O)c1',  # Albuterol
    'CN1CCC[C@H]1c2cccnc2',  # Nicotine
]

analysis_results = batch_analysis(drug_molecules, drug_names)

## 💾 Step 9: Save Results

Export your analysis results.

In [None]:
# Export the batch-analysis table to a timestamped CSV file
if 'analysis_results' not in locals():
    print("No results to save. Run the analysis first!")
else:
    # The timestamp (one-second resolution) gives successive runs distinct file names
    filename = "molecular_analysis_" + datetime.now().strftime('%Y%m%d_%H%M%S') + ".csv"
    analysis_results.to_csv(filename, index=False)
    print(f"✅ Results saved to: {filename}")
    print(f"\n📋 File contents:")
    print(analysis_results.to_string())

## 🎉 Congratulations!

You've successfully set up and tested the VariantProject v2.0 molecular exploration tool!

### Next Steps:
1. **Load your own molecules** - Replace the example SMILES with your compounds
2. **Adjust parameters** - Modify the configuration for your specific needs
3. **Generate variants** - Use the full code to create molecular variants
4. **Export results** - Save your findings for further analysis

### Troubleshooting:
- If RDKit fails to import after installation, restart the runtime and re-run the cells
- For Google Colab, use the conda installation method
- For local Jupyter, consider using Anaconda with conda-forge channel

### Resources:
- [RDKit Documentation](https://www.rdkit.org/docs/)
- [SMILES Tutorial](https://www.daylight.com/dayhtml/doc/theory/theory.smiles.html)
- [Drug-likeness Rules](https://en.wikipedia.org/wiki/Lipinski%27s_rule_of_five)

Happy molecular exploration! 🧬🔬