# 🧪 Comprehensive LMSD SDF Database search
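This notebook builds a search and analysis pipeline over the LIPID MAPS Structure Database (LMSD): it loads the SDF export, computes molecular descriptors, and provides name, category, property, substructure, and similarity searches.
Input file: `LMSD.sdf/structures.sdf`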

🔍 Setting Up Your Search Notebook


In [1]:
# LIPID MAPS Database Search Notebook
# ====================================

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw, PandasTools, AllChem
from rdkit import DataStructs
from rdkit.Chem import rdFingerprintGenerator
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')

# Print the notebook banner; this runs only after the imports above succeed.
print("🔬 LIPID MAPS Database Search System")
print("="*50)

🔬 LIPID MAPS Database Search System


## 📥 Loading the LIPID MAPS SDF Database



In [None]:
class LipidMapsSearcher:
    """Load a LIPID MAPS SDF file into a pandas DataFrame and annotate it.

    Attributes:
        sdf_path: Path passed to RDKit's ``PandasTools.LoadSDF``.
        df: The loaded DataFrame, or ``None`` until a load succeeds.
        lipid_categories: ``{value: count}`` for the chosen classification
            column (empty when none was chosen).
        primary_category: Name of the chosen classification column, or
            ``None`` when no candidate column exists.
    """

    # Column-name fragments that mark a column as a possible classification.
    _CATEGORY_TERMS = ('category', 'class', 'type', 'lm_id', 'abbreviation')

    def __init__(self, sdf_path):
        self.sdf_path = sdf_path
        self.df = None
        self.lipid_categories = {}
        # Always defined so callers can test it without resorting to hasattr().
        self.primary_category = None

    def load_lipid_database(self, sample_size=None):
        """Load the SDF file, optionally down-sample it, then preprocess it.

        Args:
            sample_size: When truthy and smaller than the number of loaded
                rows, keep a random sample of that many rows
                (``random_state=42``, so the sample is reproducible).

        Returns:
            ``True`` when loading and preprocessing succeeded, ``False`` when
            any step raised (the error is printed, not re-raised).
        """
        print(f"📂 Loading LIPID MAPS database: {self.sdf_path}")

        try:
            # Load SDF using RDKit's PandasTools
            self.df = PandasTools.LoadSDF(self.sdf_path,
                                         molColName='Molecule',
                                         smilesName='SMILES',
                                         includeFingerprints=False)

            print(f"✅ Successfully loaded {len(self.df)} lipid structures")

            # Sample if requested
            if sample_size and sample_size < len(self.df):
                self.df = self.df.sample(sample_size, random_state=42)
                print(f"📊 Using sample of {sample_size} lipids")

            # Basic preprocessing
            self._preprocess_lipid_data()
            self._analyze_lipid_categories()

            return True

        except Exception as e:
            # Boundary handler: the boolean return value is the contract here.
            print(f"❌ Error loading SDF file: {e}")
            return False

    def _preprocess_lipid_data(self):
        """Add descriptor columns, ensure a SMILES column, and tidy column names."""
        print("🔄 Preprocessing lipid data...")

        descriptor_funcs = {
            'Molecular_Weight': Descriptors.MolWt,
            'LogP': Descriptors.MolLogP,
            'TPSA': Descriptors.TPSA,
            'HBD': Descriptors.NumHDonors,
            'HBA': Descriptors.NumHAcceptors,
        }
        molecules = self.df['Molecule']
        for column, func in descriptor_funcs.items():
            # Bind func as a default so each lambda keeps its own descriptor.
            self.df[column] = molecules.apply(
                lambda mol, func=func: func(mol) if mol else None
            )

        # Generate SMILES if not present
        if 'SMILES' not in self.df.columns:
            self.df['SMILES'] = molecules.apply(
                lambda mol: Chem.MolToSmiles(mol) if mol else None
            )

        # Clean column names (SDF properties might have spaces); str() keeps
        # this safe for column labels that are not strings.
        self.df.columns = [str(col).replace(' ', '_') for col in self.df.columns]

        print("✅ Data preprocessing complete")

    def _analyze_lipid_categories(self):
        """Pick a classification column and record its value counts.

        The first candidate column with between 2 and 49 distinct values is
        preferred; otherwise the first candidate column is used. Results are
        stored in ``primary_category`` and ``lipid_categories``.
        """
        print("🔍 Analyzing lipid categories...")

        # Start from a clean slate so a re-run never keeps stale results.
        self.primary_category = None
        self.lipid_categories = {}

        # Check for common LIPID MAPS classification columns
        category_columns = [
            col for col in self.df.columns
            if any(term in str(col).lower() for term in self._CATEGORY_TERMS)
        ]

        print(f"📋 Found {len(category_columns)} potential classification columns: {category_columns}")

        # Analyze the most informative classification column
        for col in category_columns:
            unique_vals = self.df[col].nunique()
            print(f"   {col}: {unique_vals} unique values")

            if 1 < unique_vals < 50:  # Reasonable number of categories
                self.primary_category = col
                self.lipid_categories = self.df[col].value_counts().to_dict()
                print(f"🎯 Using '{col}' as primary classification ({len(self.lipid_categories)} categories)")
                break

        # If no good category found, use the first one
        if self.primary_category is None and category_columns:
            self.primary_category = category_columns[0]
            self.lipid_categories = self.df[self.primary_category].value_counts().to_dict()
            print(f"⚠️  Defaulting to '{self.primary_category}' as primary classification")

# Initialize and load the database
lipid_searcher = LipidMapsSearcher("LMSD.sdf/structures.sdf")
# sample_size=10000 keeps a random 10k-row sample (DataFrame.sample with
# random_state=42), not the first 10k records of the file.
success = lipid_searcher.load_lipid_database(sample_size=10000)  # Random 10k sample for quick testing

# Report the resulting columns and row count only when the load succeeded.
if success:
    print(f"📊 Database columns: {list(lipid_searcher.df.columns)}")
    print(f"🧪 Sample lipids loaded: {len(lipid_searcher.df)}")

📂 Loading LIPID MAPS database: LMSD.sdf/structures.sdf




✅ Successfully loaded 49267 lipid structures
📊 Using sample of 10000 lipids
🔄 Preprocessing lipid data...


## 🔍 Basic Database Exploration


In [None]:
def explore_lipid_database(lipid_searcher):
    """Print an overview of the DataFrame held by ``lipid_searcher``.

    Sections, in order: row/column counts, per-column dtype and non-null
    coverage, summary statistics for the computed descriptor columns that
    are present, the first 15 column names, and a three-row preview.
    A failure inside a section is printed instead of being raised.
    """
    frame = lipid_searcher.df
    total_rows = len(frame)
    column_count = len(frame.columns)

    print("📈 LIPID DATABASE EXPLORATION")
    print("="*50)

    # Headline numbers
    print(f"Total lipids: {total_rows:,}")
    print(f"Columns available: {column_count}")

    # One line per column: dtype plus how many values are present
    print("\n📋 AVAILABLE PROPERTIES:")
    for col in frame.columns:
        try:
            series = frame[col]
            non_null = series.notna().sum()
            dtype = str(series.dtype)  # string form formats safely with a width spec
            percentage = non_null / total_rows * 100
            print(f"   {col:<25} {dtype:<15} {non_null:>6} non-null ({percentage:.1f}%)")
        except Exception as e:
            print(f"   {col:<25} [Error: {str(e)[:20]}]")

    # Summary statistics for whichever descriptor columns exist
    available_numeric = [
        col
        for col in ('Molecular_Weight', 'LogP', 'TPSA', 'HBD', 'HBA')
        if col in frame.columns
    ]
    if available_numeric:
        print(f"\n📊 MOLECULAR PROPERTY STATISTICS:")
        try:
            display(frame[available_numeric].describe().round(2))
        except Exception as e:
            print(f"❌ Error calculating statistics: {e}")

    # Literal column names, useful when a lookup by name fails
    print(f"\n🔍 ACTUAL COLUMN NAMES (first 15):")
    for position, col in enumerate(frame.columns[:15], start=1):
        print(f"   {position:2d}. {col}")

    remaining = column_count - 15
    if remaining > 0:
        print(f"   ... and {remaining} more columns")

    # Preview: up to five identifying columns, else the first five columns
    print(f"\n👀 SAMPLE DATA PREVIEW:")
    try:
        keywords = ('name', 'id', 'category', 'class', 'mw', 'weight')
        preview_cols = []
        for col in frame.columns:
            if len(preview_cols) >= 5:
                break
            lowered = col.lower()
            if any(keyword in lowered for keyword in keywords):
                preview_cols.append(col)

        if not preview_cols:
            display(frame.iloc[:, :5].head(3))
        else:
            display(frame[preview_cols].head(3))
    except Exception as e:
        print(f"❌ Error displaying sample: {e}")

# Run the exploration, but only when the preceding load reported success
# (lipid_searcher.df is only populated by a successful load).
if success:
    explore_lipid_database(lipid_searcher)

## 🔧 Enhanced Database Loading with Better Error Handling


In [None]:
class LipidMapsSearcher:
    """Load a LIPID MAPS SDF file into a pandas DataFrame, with a fallback loader.

    Attributes:
        sdf_path: Path of the SDF file to read.
        df: The loaded DataFrame, or ``None`` until a load succeeds.
        lipid_categories: ``{value: count}`` for the chosen classification
            column (empty when none was chosen).
        primary_category: Name of the chosen classification column, or
            ``None`` when no candidate column could be used.
    """

    # Column-name fragments that mark a column as a possible classification.
    _CATEGORY_TERMS = ('category', 'class', 'type', 'lm_id', 'abbreviation', 'family')

    def __init__(self, sdf_path):
        self.sdf_path = sdf_path
        self.df = None
        self.lipid_categories = {}
        # Always defined so callers can test it without resorting to hasattr().
        self.primary_category = None

    def load_lipid_database(self, sample_size=None):
        """Load the SDF file with PandasTools, falling back to SDMolSupplier.

        Args:
            sample_size: When truthy and smaller than the number of loaded
                rows, keep a random sample of that many rows
                (``random_state=42``).

        Returns:
            ``True`` when either loader succeeded, ``False`` when both failed.
            Errors are printed, not re-raised.
        """
        print(f"📂 Loading LIPID MAPS database: {self.sdf_path}")

        try:
            # Load SDF using RDKit's PandasTools
            self.df = PandasTools.LoadSDF(self.sdf_path,
                                         molColName='Molecule',
                                         smilesName='SMILES',
                                         includeFingerprints=False,
                                         strictParsing=False)  # More lenient parsing

            print(f"✅ Successfully loaded {len(self.df)} lipid structures")

            # Sample if requested
            if sample_size and sample_size < len(self.df):
                self.df = self.df.sample(sample_size, random_state=42)
                print(f"📊 Using sample of {sample_size} lipids")

            # Basic preprocessing
            self._preprocess_lipid_data()
            self._analyze_lipid_categories()

            return True

        except Exception as e:
            print(f"❌ Error loading SDF file: {e}")
            # Try alternative loading method
            return self._load_sdf_alternative(sample_size)

    def _load_sdf_alternative(self, sample_size=None):
        """Read the SDF record by record with ``SDMolSupplier``.

        Unlike the primary loader, ``sample_size`` here stops after the first
        ``sample_size`` parseable records instead of drawing a random sample.

        Returns:
            ``True`` on success, ``False`` when reading or preprocessing raised.
        """
        print("🔄 Trying alternative SDF loading method...")
        try:
            from rdkit.Chem import SDMolSupplier

            suppl = SDMolSupplier(self.sdf_path, strictParsing=False)
            properties_list = []

            for mol in suppl:
                if mol is None:
                    # Records RDKit could not parse are skipped.
                    continue

                mol_props = {'Molecule': mol}
                # Copy every SDF data field; an unreadable field becomes None.
                for prop in mol.GetPropNames():
                    try:
                        mol_props[prop] = mol.GetProp(prop)
                    except Exception:
                        mol_props[prop] = None

                properties_list.append(mol_props)

                if sample_size and len(properties_list) >= sample_size:
                    break

            self.df = pd.DataFrame(properties_list)
            print(f"✅ Alternative method loaded {len(self.df)} lipids")

            # Basic preprocessing
            self._preprocess_lipid_data()
            self._analyze_lipid_categories()

            return True

        except Exception as e:
            print(f"❌ Alternative loading also failed: {e}")
            return False

    def _preprocess_lipid_data(self):
        """Add descriptor columns, ensure a SMILES column, and tidy column names."""
        print("🔄 Preprocessing lipid data...")

        def safe_descriptor(func, mol):
            # A descriptor that fails on one molecule yields None for that row
            # instead of aborting the whole preprocessing step.
            try:
                return func(mol) if mol else None
            except Exception:
                return None

        descriptor_funcs = {
            'Molecular_Weight': Descriptors.MolWt,
            'LogP': Descriptors.MolLogP,
            'TPSA': Descriptors.TPSA,
            'HBD': Descriptors.NumHDonors,
            'HBA': Descriptors.NumHAcceptors,
        }
        molecules = self.df['Molecule']
        for column, func in descriptor_funcs.items():
            # Bind func as a default so each lambda keeps its own descriptor.
            self.df[column] = molecules.apply(
                lambda mol, func=func: safe_descriptor(func, mol)
            )

        # Generate SMILES if not present
        if 'SMILES' not in self.df.columns:
            self.df['SMILES'] = molecules.apply(
                lambda mol: Chem.MolToSmiles(mol) if mol else None
            )

        # Clean column names (SDF properties might have spaces)
        self.df.columns = [str(col).replace(' ', '_') for col in self.df.columns]

        print("✅ Data preprocessing complete")

    def _analyze_lipid_categories(self):
        """Pick a classification column and record its value counts.

        The first candidate column with between 2 and 49 distinct values is
        preferred; otherwise the first candidate whose value counts can be
        computed is used. Results are stored in ``primary_category`` and
        ``lipid_categories``.
        """
        print("🔍 Analyzing lipid categories...")

        # Start from a clean slate so a re-run never keeps stale results.
        self.primary_category = None
        self.lipid_categories = {}

        # Check for common LIPID MAPS classification columns
        category_columns = [
            col for col in self.df.columns
            if any(term in str(col).lower() for term in self._CATEGORY_TERMS)
        ]

        print(f"📋 Found {len(category_columns)} potential classification columns: {category_columns}")

        # Analyze the most informative classification column
        for col in category_columns:
            try:
                unique_vals = self.df[col].nunique()
                print(f"   {col}: {unique_vals} unique values")

                if 1 < unique_vals < 50:  # Reasonable number of categories
                    counts = self.df[col].value_counts().to_dict()
                    self.primary_category = col
                    self.lipid_categories = counts
                    print(f"🎯 Using '{col}' as primary classification ({len(self.lipid_categories)} categories)")
                    break
            except Exception as e:
                print(f"   {col}: Error analyzing - {e}")

        # If no good category found, use the first one that works
        if self.primary_category is None:
            for col in category_columns:
                try:
                    counts = self.df[col].value_counts().to_dict()
                except Exception as e:
                    print(f"   {col}: Error analyzing - {e}")
                    continue
                # Assign only after the counts were computed, so a failing
                # column never leaves a half-updated state behind.
                self.primary_category = col
                self.lipid_categories = counts
                print(f"⚠️  Defaulting to '{col}' as primary classification")
                break

# Re-initialize and load with improved error handling.
# This rebinds lipid_searcher and success, replacing the values set earlier.
lipid_searcher = LipidMapsSearcher("LMSD.sdf/structures.sdf")
success = lipid_searcher.load_lipid_database(sample_size=10000)

# Explore the freshly loaded data only when the load reported success.
if success:
    explore_lipid_database(lipid_searcher)

##  🔍 Quick Debugging Function


In [None]:
def debug_database_structure(lipid_searcher):
    """Print diagnostics about the DataFrame held by ``lipid_searcher``.

    Shows the dtype and one sample value for the first ten columns, the
    distinct-value counts of up to ten columns whose names contain a
    lipid-related fragment, and, when a ``Molecule`` column exists, how many
    molecules are present plus details of the first one.
    """
    frame = lipid_searcher.df

    print("🐛 DATABASE DEBUG INFORMATION")
    print("="*50)

    # Dtype and a truncated example value per column
    print("📊 DATA TYPES:")
    for col in frame.columns[:10]:  # First 10 columns
        dtype = frame[col].dtype
        try:
            present = frame[col].dropna()
            # Keep at most the first 50 characters of the example value
            sample_value = str(present.iloc[0])[:50] if len(present) > 0 else "N/A"
        except:
            sample_value = "Error"

        print(f"   {col:<25} {str(dtype):<15} Sample: {sample_value}")

    # Columns whose lowercase name contains any lipid-related fragment
    print(f"\n🔍 LIPID MAPS SPECIFIC COLUMNS:")
    lipid_terms = ('lipid', 'lm', 'lmsd', 'category', 'class', 'fa', 'gl', 'gp', 'sp', 'st')
    lipid_columns = [
        col for col in frame.columns
        if any(term in str(col).lower() for term in lipid_terms)
    ]

    for col in lipid_columns[:10]:  # Show first 10 lipid-related columns
        series = frame[col]
        unique_count = series.nunique() if series.notna().any() else 0
        print(f"   {col:<25} {unique_count:>3} unique values")

    # Molecule column details (skipped entirely when the column is absent)
    if 'Molecule' not in frame.columns:
        return

    print(f"\n🧪 MOLECULE COLUMN INFO:")
    mol_series = frame['Molecule']
    valid_molecules = mol_series.notna().sum()
    print(f"   Valid molecules: {valid_molecules}/{len(frame)} ({valid_molecules/len(frame)*100:.1f}%)")

    if valid_molecules > 0:
        sample_mol = mol_series.dropna().iloc[0]
        print(f"   Sample molecule atoms: {sample_mol.GetNumAtoms()}")
        print(f"   Sample molecule SMILES: {Chem.MolToSmiles(sample_mol)[:50]}...")

# Run debugging, but only when the most recent load reported success.
if success:
    debug_database_structure(lipid_searcher)

## 🎯 Focused LIPID MAPS Analysis
Since this is specifically LIPID MAPS data, let's look for LIPID MAPS-specific patterns:



In [None]:
def analyze_lipid_maps_specifics(lipid_searcher):
    """Print LIPID MAPS-specific patterns found in the searcher's DataFrame.

    Reports columns that look like LM_ID columns (with three sample values),
    how often each lipid category abbreviation occurs as a case-sensitive
    substring in each column, and the ten columns with the most distinct
    values.

    Returns:
        dict: ``{abbreviation: total occurrences across all columns}`` for
        every abbreviation that was found at least once.
    """
    df = lipid_searcher.df

    print("🎯 LIPID MAPS SPECIFIC ANALYSIS")
    print("="*50)

    # Look for LM_ID pattern (LIPID MAPS identifier)
    lm_id_columns = [col for col in df.columns if 'lm' in str(col).lower() and 'id' in str(col).lower()]
    if lm_id_columns:
        print(f"📋 LM_ID COLUMNS FOUND: {lm_id_columns}")
        for col in lm_id_columns:
            sample_ids = df[col].dropna().head(3).tolist()
            print(f"   {col}: {sample_ids}")

    # Look for lipid category abbreviations (FA, GL, GP, SP, ST, PR, SL, PK)
    lipid_abbreviations = ['FA', 'GL', 'GP', 'SP', 'ST', 'PR', 'SL', 'PK']
    found_abbreviations = {}

    for col in df.columns:
        try:
            # Stringify once per column instead of once per abbreviation.
            text = df[col].astype(str)
            for abbr in lipid_abbreviations:
                # The abbreviations are plain letters, so a literal substring
                # test gives the same matches as a regex would.
                count = int(text.str.contains(abbr, regex=False).sum())
                if count > 0:
                    found_abbreviations[abbr] = found_abbreviations.get(abbr, 0) + count
                    print(f"   Found '{abbr}' in column '{col}': {count} occurrences")
        except Exception:
            # Best effort: a column that cannot be scanned as text is skipped.
            continue

    # Display most common columns by unique values
    print(f"\n📊 COLUMNS WITH MOST UNIQUE VALUES:")
    column_uniques = []
    for col in df.columns:
        try:
            column_uniques.append((col, df[col].nunique()))
        except Exception:
            # Columns whose values cannot be counted are left out.
            continue

    # Sort by unique count and show top 10
    column_uniques.sort(key=lambda pair: pair[1], reverse=True)
    for col, count in column_uniques[:10]:
        print(f"   {col:<25} {count:>6} unique values")

    return found_abbreviations

# Run LIPID MAPS specific analysis, but only when the most recent load
# reported success.
if success:
    analyze_lipid_maps_specifics(lipid_searcher)

## This comprehensive notebook provides:

📥 Database Loading: Handles LIPID MAPS SDF format with automatic property calculation




## 🎯 Enhanced Search Engine with All Methods


In [None]:
# ENHANCED LIPID MAPS SEARCH ENGINE
# =================================

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw, AllChem
from rdkit import DataStructs
from rdkit.Chem import rdFingerprintGenerator
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')

class EnhancedLipidSearchEngine:
    """Search a lipid DataFrame by name, category, property, substructure or similarity.

    The DataFrame may carry any index (for example the non-contiguous index
    left behind by ``DataFrame.sample``); every search selects rows by
    position or by masks aligned to that index.

    Attributes:
        df: The DataFrame being searched.
        name_columns: Columns whose name suggests an identifier or a name.
        category_columns: Columns whose name suggests a classification.
        property_columns: Computed descriptor columns plus any column whose
            name contains ``weight`` or ``mw``.
    """

    def __init__(self, df):
        self.df = df
        self._setup_search_columns()

    def _setup_search_columns(self):
        """Identify searchable columns in the database from their names."""
        name_terms = ['name', 'id', 'title', 'abbreviation', 'common', 'systematic']
        category_terms = ['category', 'class', 'type', 'family', 'lm_id']
        descriptor_names = ['Molecular_Weight', 'LogP', 'TPSA', 'HBD', 'HBA']

        self.name_columns = [col for col in self.df.columns
                             if any(term in str(col).lower() for term in name_terms)]

        self.category_columns = [col for col in self.df.columns
                                 if any(term in str(col).lower() for term in category_terms)]

        self.property_columns = [col for col in self.df.columns
                                 if col in descriptor_names
                                 or 'weight' in str(col).lower()
                                 or 'mw' in str(col).lower()]

        print("🔍 Searchable columns identified:")
        print(f"   Name columns: {self.name_columns}")
        print(f"   Category columns: {self.category_columns}")
        print(f"   Property columns: {self.property_columns}")

    @staticmethod
    def _is_molecule(obj):
        """Return True when ``obj`` can be substructure-matched (not None/NaN)."""
        return hasattr(obj, 'HasSubstructMatch')

    def _numeric_column(self, prop):
        """Return column ``prop`` as numbers; unparseable text becomes NaN."""
        column = self.df[prop]
        if pd.api.types.is_numeric_dtype(column):
            return column
        # SDF data fields are loaded as text, so convert before comparing.
        return pd.to_numeric(column, errors='coerce')

    # 🔍 SEARCH BY NAME
    def search_by_name(self, search_term, case_sensitive=False):
        """Search lipids by name, ID, or abbreviation.

        ``search_term`` is treated as a regular expression when it compiles as
        one; otherwise (e.g. ``"PC(16"``) it is matched as literal text.
        Missing values never match.

        Returns:
            The matching rows of ``self.df``, in their original row order,
            each row at most once.
        """
        import re

        print(f"🔍 Searching for '{search_term}' in names...")

        term = str(search_term)
        try:
            re.compile(term)
            use_regex = True
        except re.error:
            # Lipid shorthand often contains unbalanced brackets; fall back to
            # a literal match instead of silently finding nothing.
            use_regex = False

        hit_mask = pd.Series(False, index=self.df.index)
        for col in self.name_columns:
            try:
                column = self.df[col]
                col_hits = column.astype(str).str.contains(
                    term, case=case_sensitive, regex=use_regex, na=False
                ).astype(bool)
                # astype(str) turns missing values into text such as "nan";
                # exclude those rows explicitly.
                hit_mask = hit_mask | (col_hits & column.notna())
            except Exception as e:
                print(f"⚠️  Skipping column '{col}': {e}")

        matches = self.df[hit_mask]
        print(f"✅ Found {len(matches)} matches")
        return matches

    # 🏷️ SEARCH BY CATEGORY
    def search_by_category(self, category_value, category_column=None, exact_match=True):
        """Search lipids by category/classification.

        Args:
            category_value: Value to look for.
            category_column: Column to search; defaults to the first detected
                category column.
            exact_match: Compare with ``==`` when true, otherwise do a
                case-insensitive substring/regex match on the text form.

        Returns:
            Matching rows, or an empty DataFrame when no column is available
            or the search raised.
        """
        if category_column is None and self.category_columns:
            category_column = self.category_columns[0]
        if not category_column:
            print("❌ No category columns available")
            return pd.DataFrame()

        print(f"🔍 Searching in '{category_column}' for '{category_value}'")

        try:
            if exact_match:
                results = self.df[self.df[category_column] == category_value]
            else:
                results = self.df[self.df[category_column].astype(str).str.contains(
                    str(category_value), case=False, na=False)]

            print(f"✅ Found {len(results)} lipids")
            return results
        except Exception as e:
            print(f"❌ Error in category search: {e}")
            return pd.DataFrame()

    # 📊 SEARCH BY PROPERTIES
    def search_by_properties(self, property_filters):
        """Search lipids by multiple property ranges.

        Args:
            property_filters: Iterable of dicts with key ``'property'`` and
                either ``'exact'`` or ``'min'`` and/or ``'max'`` (inclusive
                bounds; a missing bound leaves that side open).

        Returns:
            Rows of ``self.df`` that satisfy every usable filter. Filters
            naming an unknown column, or giving no bound at all, are skipped
            with a warning.
        """
        print("🔍 Searching by property ranges...")

        # Build the mask on the DataFrame's own index so it stays aligned
        # even when the rows were sampled or filtered beforehand.
        mask = pd.Series(True, index=self.df.index)

        for prop_filter in property_filters:
            prop = prop_filter['property']
            min_val = prop_filter.get('min')
            max_val = prop_filter.get('max')
            exact = prop_filter.get('exact')

            if prop not in self.df.columns:
                print(f"⚠️  Property '{prop}' not found")
                continue

            if exact is not None:
                column = self.df[prop]
                if isinstance(exact, (int, float)) and not isinstance(exact, bool):
                    column = self._numeric_column(prop)
                prop_mask = column == exact
                print(f"   {prop} = {exact}: {prop_mask.sum()} matches")
            elif min_val is None and max_val is None:
                print(f"⚠️  No range or exact value given for '{prop}'")
                continue
            else:
                numeric = self._numeric_column(prop)
                prop_mask = pd.Series(True, index=self.df.index)
                if min_val is not None:
                    prop_mask = prop_mask & (numeric >= min_val)
                if max_val is not None:
                    prop_mask = prop_mask & (numeric <= max_val)
                print(f"   {prop}: {min_val} - {max_val}: {prop_mask.sum()} matches")

            mask = mask & prop_mask

        results = self.df[mask]
        print(f"✅ Found {len(results)} lipids matching all criteria")
        return results

    # 🧪 SEARCH BY STRUCTURE (SMARTS)
    def search_by_structure(self, smarts_pattern, max_results=100):
        """Search lipids by SMARTS pattern (structural search).

        Scans the ``Molecule`` column in row order and stops after
        ``max_results`` hits.

        Returns:
            The matching rows, or an empty DataFrame when the pattern is
            invalid or the search raised.
        """
        print(f"🔍 Structural search with SMARTS: {smarts_pattern}")

        try:
            pattern = Chem.MolFromSmarts(smarts_pattern)
            if pattern is None:
                print("❌ Invalid SMARTS pattern")
                return pd.DataFrame()

            positions = []
            if 'Molecule' in self.df.columns:
                for pos, mol in enumerate(self.df['Molecule']):
                    if len(positions) >= max_results:
                        break
                    if self._is_molecule(mol) and mol.HasSubstructMatch(pattern):
                        positions.append(pos)

            # Positional selection keeps column dtypes and is unaffected by
            # repeated or non-contiguous index labels.
            results = self.df.iloc[positions]
            print(f"✅ Found {len(results)} lipids matching SMARTS pattern")
            return results

        except Exception as e:
            print(f"❌ Error in SMARTS search: {e}")
            return pd.DataFrame()

    # 📈 SEARCH BY SIMILARITY
    def search_by_similarity(self, query_smiles, threshold=0.7, max_results=20):
        """Search for similar lipids using fingerprint similarity.

        Compares Morgan fingerprints (radius 2, 2048 bits) by Tanimoto
        similarity and keeps rows scoring at least ``threshold``.

        Returns:
            Up to ``max_results`` rows sorted by a new ``Similarity`` column
            (highest first), or an empty DataFrame when nothing qualifies,
            the SMILES is invalid, or the search raised.
        """
        print(f"🔍 Similarity search for: {query_smiles}")

        try:
            query_mol = Chem.MolFromSmiles(query_smiles)
            if query_mol is None:
                print("❌ Invalid SMILES string")
                return pd.DataFrame()

            # Generate fingerprints
            fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
            query_fp = fpgen.GetFingerprint(query_mol)

            hits = []  # (row position, similarity)
            if 'Molecule' in self.df.columns:
                for pos, mol in enumerate(self.df['Molecule']):
                    if not self._is_molecule(mol):
                        continue
                    try:
                        similarity = DataStructs.TanimotoSimilarity(
                            query_fp, fpgen.GetFingerprint(mol))
                    except Exception:
                        # Best effort: skip molecules that cannot be fingerprinted.
                        continue
                    if similarity >= threshold:
                        hits.append((pos, similarity))

            if not hits:
                print(f"❌ No lipids found with similarity >= {threshold}")
                return pd.DataFrame()

            # Create results dataframe
            positions = [pos for pos, _ in hits]
            results = self.df.iloc[positions].copy()
            results['Similarity'] = [similarity for _, similarity in hits]
            results = results.sort_values('Similarity', ascending=False).head(max_results)

            print(f"✅ Found {len(results)} similar lipids (threshold: {threshold})")
            return results

        except Exception as e:
            print(f"❌ Error in similarity search: {e}")
            return pd.DataFrame()

    # 🔬 GET DATABASE STATISTICS
    def get_database_stats(self):
        """Get comprehensive database statistics.

        Returns:
            dict with the row count, the number of detected name, category
            and property columns, and ``<prop>_min`` / ``<prop>_max`` /
            ``<prop>_mean`` for each property column (text-valued columns are
            converted to numbers first; unparseable entries are ignored).
        """
        stats = {
            'total_lipids': len(self.df),
            'categories_available': len(self.category_columns),
            'names_available': len(self.name_columns),
            'properties_available': len(self.property_columns)
        }

        # Property ranges
        for prop in self.property_columns:
            if prop not in self.df.columns:
                continue
            values = self._numeric_column(prop)
            stats[f'{prop}_min'] = values.min()
            stats[f'{prop}_max'] = values.max()
            stats[f'{prop}_mean'] = values.mean()

        return stats

## 🎛️ Enhanced Interactive Interface


In [None]:
def _prompt_number(prompt, cast, default=None):
    """Ask until the reply parses with ``cast``; an empty reply returns ``default`` when one is given."""
    while True:
        raw = input(prompt).strip()
        if not raw and default is not None:
            return default
        try:
            return cast(raw)
        except ValueError:
            print("❌ Please enter a valid number")


def enhanced_interactive_search(search_engine):
    """Enhanced interactive command-line interface for lipid searching.

    Prints the menu once, then loops reading a choice (1-9) until the user
    picks 9. The rows returned by the most recent search are remembered so
    that options 7 (visualize) and 8 (export) can act on them. Numeric
    prompts are repeated until they receive a valid number, so a typo does
    not abort the session.

    Args:
        search_engine: Object exposing ``df``, ``category_columns``,
            ``property_columns`` and the ``search_by_*`` /
            ``get_database_stats`` methods used below.
    """
    df = search_engine.df

    print("\n🎛️  ENHANCED LIPID SEARCH INTERFACE")
    print("="*50)
    print("🔍 Multiple Search Methods Available:")
    print("   1. Name/ID Search")
    print("   2. Category/Classification Search") 
    print("   3. Property Range Search")
    print("   4. Structural Search (SMARTS)")
    print("   5. Similarity Search")
    print("   6. Database Statistics")
    print("   7. Visualize Results")
    print("   8. Export Results")
    print("   9. Exit")

    current_results = None

    while True:
        print("\n" + "─" * 50)
        choice = input("Choose search method (1-9): ").strip()

        if choice == '1':
            search_term = input("Enter name/ID to search: ").strip()
            current_results = search_engine.search_by_name(search_term)
            if len(current_results) > 0:
                display_enhanced_results(current_results)

        elif choice == '2':
            category_columns = search_engine.category_columns
            if category_columns:
                print("Available category columns:")
                for i, col in enumerate(category_columns, 1):
                    unique_vals = df[col].nunique()
                    print(f"   {i}. {col} ({unique_vals} categories)")

                col_choice = input("Choose category column (number): ").strip()
                if col_choice.isdigit() and 1 <= int(col_choice) <= len(category_columns):
                    category_col = category_columns[int(col_choice)-1]

                    # Show top categories
                    top_cats = df[category_col].value_counts().head(10)
                    print("Top categories:")
                    for i, (cat, count) in enumerate(top_cats.items(), 1):
                        print(f"   {i}. {cat} ({count} lipids)")

                    category_value = input("Enter category to search: ").strip()
                    current_results = search_engine.search_by_category(category_value, category_col)
                    if len(current_results) > 0:
                        display_enhanced_results(current_results)
                else:
                    # Tell the user why nothing happened instead of silently
                    # returning to the menu.
                    print("❌ Invalid choice")
            else:
                print("❌ No category columns available")

        elif choice == '3':
            property_columns = search_engine.property_columns
            print("Property search - available properties:")
            for i, prop in enumerate(property_columns, 1):
                print(f"   {i}. {prop}")

            prop_filters = []
            while True:
                prop_choice = input("Select property (number) or 'done': ").strip()
                if prop_choice.lower() == 'done':
                    break
                if not (prop_choice.isdigit() and 1 <= int(prop_choice) <= len(property_columns)):
                    print("❌ Invalid choice")
                    continue
                prop = property_columns[int(prop_choice)-1]

                search_type = input("Search by (1) range or (2) exact value? ").strip()
                if search_type == '1':
                    min_val = _prompt_number(f"Minimum {prop}: ", float)
                    max_val = _prompt_number(f"Maximum {prop}: ", float)
                    prop_filters.append({'property': prop, 'min': min_val, 'max': max_val})
                else:
                    exact_val = _prompt_number(f"Exact {prop}: ", float)
                    prop_filters.append({'property': prop, 'exact': exact_val})

            if prop_filters:
                current_results = search_engine.search_by_properties(prop_filters)
                if len(current_results) > 0:
                    display_enhanced_results(current_results)

        elif choice == '4':
            smarts = input("Enter SMARTS pattern: ").strip()
            max_results = _prompt_number("Max results (default 100): ", int, default=100)
            current_results = search_engine.search_by_structure(smarts, max_results)
            if len(current_results) > 0:
                display_enhanced_results(current_results)

        elif choice == '5':
            smiles = input("Enter query SMILES: ").strip()
            threshold = _prompt_number("Similarity threshold (0-1, default 0.7): ", float, default=0.7)
            max_results = _prompt_number("Max results (default 20): ", int, default=20)
            current_results = search_engine.search_by_similarity(smiles, threshold, max_results)
            if len(current_results) > 0:
                display_enhanced_results(current_results)

        elif choice == '6':
            stats = search_engine.get_database_stats()
            print("\n📊 DATABASE STATISTICS:")
            for key, value in stats.items():
                print(f"   {key}: {value}")

        elif choice == '7':
            if current_results is not None and len(current_results) > 0:
                visualize_search_results(current_results)
            else:
                print("❌ No results to visualize. Perform a search first.")

        elif choice == '8':
            if current_results is not None and len(current_results) > 0:
                filename = input("Enter export filename (without extension): ").strip()
                if filename:
                    export_search_results(current_results, filename)
            else:
                print("❌ No results to export. Perform a search first.")

        elif choice == '9':
            print("👋 Exiting search interface")
            break

        else:
            print("❌ Invalid choice")

def display_enhanced_results(results, max_display=10):
    """Summarise search hits: header line, table preview, structure grid.

    Args:
        results: DataFrame of search hits. A 'Molecule' column, when present,
            supplies the objects drawn in the structure grid.
        max_display: Maximum number of rows shown in the table preview.
    """
    print(f"\n📄 SEARCH RESULTS ({len(results)} found):")

    # Table preview: every non-molecule column that holds at least one value.
    visible_columns = [
        column for column in results.columns
        if column != 'Molecule' and results[column].notna().any()
    ]
    display(results[visible_columns].head(max_display))

    # The structure grid is only attempted when molecule objects are present.
    if 'Molecule' not in results.columns or not results['Molecule'].notna().any():
        return

    print("\n🖼️  MOLECULAR STRUCTURES:")
    preview = results.head(6)
    grid_molecules = preview['Molecule'].tolist()

    # Caption each structure with the first populated identifier-like field.
    label_fields = ('ID', 'NAME', 'COMMON_NAME', 'ABBREVIATION')
    captions = []
    for _, record in preview.iterrows():
        caption = next(
            (str(record[field])[:20]
             for field in label_fields
             if field in record and pd.notna(record[field])),
            "Unknown",
        )
        captions.append(caption)

    try:
        grid_image = Draw.MolsToGridImage(
            grid_molecules,
            molsPerRow=3,
            subImgSize=(300, 300),
            legends=captions,
        )
        display(grid_image)
    except Exception as e:
        print(f"⚠️  Could not display molecules: {e}")

## 📊 Enhanced Visualization System


In [None]:
def visualize_search_results(results):
    """Plot a 2x2 overview of a result set and print per-property ranges.

    Panels (each drawn only when its columns exist): molecular-weight
    histogram, LogP histogram, MW-vs-LogP scatter, and a bar chart of the
    eight most frequent values of the first category-like column.

    Args:
        results: DataFrame of search hits.
    """
    if not len(results):
        print("❌ No results to visualize")
        return

    print("📊 CREATING SEARCH RESULT VISUALIZATIONS...")

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Search Results Analysis', fontsize=16, fontweight='bold')
    mw_ax, logp_ax = axes[0, 0], axes[0, 1]
    space_ax, category_ax = axes[1, 0], axes[1, 1]

    # Panels 1 and 2: the two histograms differ only in column, label and colour.
    histogram_panels = (
        (mw_ax, 'Molecular_Weight', 'Molecular Weight', 'skyblue'),
        (logp_ax, 'LogP', 'LogP', 'lightgreen'),
    )
    for panel, column, label, fill in histogram_panels:
        if column not in results.columns:
            continue
        panel.hist(results[column].dropna(), bins=20, alpha=0.7,
                   color=fill, edgecolor='black')
        panel.set_xlabel(label)
        panel.set_ylabel('Frequency')
        panel.set_title(f'{label} Distribution')
        panel.grid(True, alpha=0.3)

    # Panel 3: chemical-space scatter, needs both descriptors.
    if {'Molecular_Weight', 'LogP'}.issubset(results.columns):
        space_ax.scatter(results['Molecular_Weight'], results['LogP'],
                         alpha=0.6, s=30)
        space_ax.set_xlabel('Molecular Weight')
        space_ax.set_ylabel('LogP')
        space_ax.set_title('Chemical Space (MW vs LogP)')
        space_ax.grid(True, alpha=0.3)

    # Panel 4: first column whose name looks like a classification field.
    keywords = ('category', 'class', 'type')
    category_like = [
        column for column in results.columns
        if any(keyword in str(column).lower() for keyword in keywords)
    ]
    if category_like:
        counts = results[category_like[0]].value_counts().head(8)
        positions = range(len(counts))
        category_ax.bar(positions, counts.values, color='gold', alpha=0.7)
        category_ax.set_xlabel('Category')
        category_ax.set_ylabel('Count')
        category_ax.set_title('Category Distribution')
        category_ax.set_xticks(positions)
        category_ax.set_xticklabels([str(label)[:15] for label in counts.index],
                                    rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

    # Textual summary: min/max/mean of each descriptor that has data.
    print("\n📈 SEARCH RESULT STATISTICS:")
    print(f"   Total lipids found: {len(results)}")

    for column in ('Molecular_Weight', 'LogP', 'TPSA', 'HBD', 'HBA'):
        if column not in results.columns:
            continue
        values = results[column].dropna()
        if len(values) > 0:
            print(f"   {column}: {values.min():.1f} - {values.max():.1f} (avg: {values.mean():.1f})")

## 💾 Enhanced Export System


In [None]:
def export_search_results(results, filename_prefix):
    """Export search results to CSV, to SDF when structures exist, and to a text summary.

    Up to three files sharing one timestamp are written:
    ``<prefix>_<timestamp>.csv``, ``<prefix>_<timestamp>.sdf`` (only when the
    'Molecule' column holds at least one value) and
    ``<prefix>_<timestamp>_summary.txt``.

    Args:
        results: DataFrame of search hits. A 'Molecule' column, if present,
            supplies the structures for the SDF file and is left out of the CSV.
        filename_prefix: Path prefix (without extension) for the output files.

    Returns:
        None. Nothing is written when ``results`` is empty.
    """
    if len(results) == 0:
        print("❌ No results to export")
        return

    import datetime
    now = datetime.datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")

    # ---- CSV: tabular data without the molecule objects --------------------
    csv_filename = f"{filename_prefix}_{timestamp}.csv"
    export_df = results.drop('Molecule', axis=1, errors='ignore')

    # Identifier columns first, then descriptors, then everything else.
    preferred = ['ID', 'NAME', 'COMMON_NAME', 'SYSTEMATIC_NAME',
                 'Molecular_Weight', 'LogP', 'TPSA', 'HBD', 'HBA']
    leading = [col for col in preferred if col in export_df.columns]
    trailing = [col for col in export_df.columns if col not in leading]
    export_df = export_df[leading + trailing]

    export_df.to_csv(csv_filename, index=False)
    print(f"💾 Results exported to CSV: {csv_filename}")

    # ---- SDF: only when structures are available ---------------------------
    # Both names are initialised up front so the summary below also works for
    # result sets that carry no molecules.
    sdf_filename = None
    saved_count = 0
    if 'Molecule' in results.columns and results['Molecule'].notna().any():
        sdf_filename = f"{filename_prefix}_{timestamp}.sdf"
        property_cols = [col for col in results.columns if col != 'Molecule']
        skipped_props = 0

        writer = Chem.SDWriter(sdf_filename)
        try:
            for _, row in results.iterrows():
                source_mol = row['Molecule']
                # Skips None as well as NaN placeholders.
                if not isinstance(source_mol, Chem.Mol):
                    continue
                # Work on a copy so the caller's molecules are not modified.
                mol = Chem.Mol(source_mol)
                for col in property_cols:
                    if pd.notna(row[col]):
                        try:
                            mol.SetProp(str(col), str(row[col]))
                        except Exception:
                            # Best effort: a property that cannot be stored
                            # must not abort the export; it is counted instead.
                            skipped_props += 1
                writer.write(mol)
                saved_count += 1
        finally:
            # Flush and release the file even if a write fails.
            writer.close()

        print(f"💾 Molecular structures exported to SDF: {sdf_filename} ({saved_count} molecules)")
        if skipped_props:
            print(f"⚠️  {skipped_props} property values could not be attached to molecules")

    # ---- Console summary ---------------------------------------------------
    print("\n📋 EXPORT SUMMARY:")
    print(f"   Total results exported: {len(results)}")
    print(f"   Columns in CSV: {len(export_df.columns)}")
    print(f"   Molecules in SDF: {saved_count}")

    # ---- Summary file: lists only the files that were actually created -----
    created_files = [f"{csv_filename} (CSV data)"]
    if sdf_filename is not None:
        created_files.append(f"{sdf_filename} (SDF structures)")

    summary_filename = f"{filename_prefix}_{timestamp}_summary.txt"
    with open(summary_filename, 'w', encoding='utf-8') as f:
        f.write("LIPID MAPS Search Results Export\n")
        f.write(f"Generated: {now}\n")
        f.write(f"Total lipids: {len(results)}\n")
        f.write("Files created:\n")
        f.writelines(f"  - {entry}\n" for entry in created_files)

    print(f"💾 Summary report: {summary_filename}")

## 🚀 Complete Usage Example


In [None]:
# COMPLETE USAGE EXAMPLE
# ======================
# Assumes earlier cells defined `lipid_searcher` (with its database loaded
# into `.df`), `EnhancedLipidSearchEngine` and `enhanced_interactive_search`.

# 1. Initialize the enhanced search engine
print("🚀 INITIALIZING ENHANCED LIPID SEARCH ENGINE...")
search_engine = EnhancedLipidSearchEngine(lipid_searcher.df)

# 2. Announce readiness and list the features BEFORE entering the interface.
#    The interactive interface keeps prompting until the user exits, so
#    anything printed after the call would only appear once the session is over.
print("\n✅ Enhanced Lipid Search System Ready!")
print("🔍 Features available:")
print("   - Name/ID search across multiple columns")
print("   - Category/classification search")
print("   - Property range search with multiple filters")
print("   - Structural search using SMARTS patterns")
print("   - Similarity search with fingerprint comparison")
print("   - Interactive visualization of results")
print("   - Multi-format export (CSV + SDF)")

# 3. Start interactive search interface (returns when the user chooses to exit)
print("\n🎛️  STARTING INTERACTIVE SEARCH INTERFACE...")
enhanced_interactive_search(search_engine)

## 🎯 Quick Search Examples to Try
1. Search by Name/ID:
Try searching for: "PC" (phosphatidylcholines)

Or: "cholesterol"

Or specific LM_IDs like: "LMGP0101"

2. Search by Category:
Use the exact strings shown in the top categories list

Example: "Gangliosides [SP0601]" or "Triacylglycerols [GL0301]"

3. Search by Properties:
Find lipids with specific molecular weight ranges

Search for high LogP (hydrophobic) or low LogP (hydrophilic) compounds

4. Structural Searches:
Try SMARTS patterns like:

"C(=O)O" for carboxyl groups (matches free acids and esters alike; use "C(=O)[OH]" to match free carboxylic acids only)

"P(=O)(O)O" for phosphate groups

"c1ccccc1" for aromatic rings

5. Similarity Searches:
Use SMILES strings of known lipids to find similar compounds

📊 When You Get Results:
Use option 7 to visualize the chemical space of your results

Use option 8 to export your findings for further analysis

The system is now ready for comprehensive lipid exploration! Is there a specific type of lipid or search you'd like to try first?

