<a href="https://colab.research.google.com/github/Mohaammed-Fouad/Ligand-Based-Virtual-Screening/blob/main/VIRTUAL_SCREENING_SPRINT_SCRIPT_(SMILES).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================================
# VIRTUAL SCREENING SPRINT SCRIPT (SMILES)
# SELECTION COUNT: 500
# OUTPUTS: selection .xlsx, PhysChem .xlsx, filtered lead-like .xlsx, and summary .txt
# ==========================================================
# 1. INSTALLATION & SETUP

!pip install rdkit pandas tqdm openpyxl
from google.colab import drive, files
import pandas as pd
import io
import os
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, Draw, Descriptors, rdMolDescriptors
from rdkit.ML.Cluster import Butina
from tqdm import tqdm

# Mount Google Drive for persistent backup.
# SAVE_TO_DRIVE records whether the mount succeeded; later code checks it
# before writing backup copies under /content/drive.
try:
    drive.mount('/content/drive')
    SAVE_TO_DRIVE = True
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell as the user expects.
    print("‚ö†Ô∏è Drive not mounted. Results will only be saved locally.")
    print(f"   Reason: {err}")
    SAVE_TO_DRIVE = False

def calculate_physchem(mol):
    """Build a physicochemical profile and two drug-likeness flags for `mol`.

    Args:
        mol: Molecule object handed to the RDKit descriptor functions.

    Returns:
        dict with keys, in this order: 'MW', 'LogP', 'HBD', 'HBA', 'TPSA',
        'Rotatable_Bonds', 'Lipinski_Pass', 'Lead_Like_Pass'. MW, LogP and
        TPSA are rounded to two decimals; the two flags are booleans.
    """
    weight = Descriptors.MolWt(mol)
    partition_coeff = Descriptors.MolLogP(mol)
    donors = rdMolDescriptors.CalcNumHBD(mol)
    acceptors = rdMolDescriptors.CalcNumHBA(mol)
    polar_area = Descriptors.TPSA(mol)
    rot_bonds = rdMolDescriptors.CalcNumRotatableBonds(mol)

    # Lipinski's Rule of Five: every one of the four limits must hold.
    passes_ro5 = all((
        weight <= 500,
        partition_coeff <= 5,
        donors <= 5,
        acceptors <= 10,
    ))

    # Lead-like / Veber criteria are applied on top of the Lipinski result.
    passes_lead_like = passes_ro5 and all((rot_bonds <= 10, polar_area <= 140))

    profile = dict(
        MW=round(weight, 2),
        LogP=round(partition_coeff, 2),
        HBD=donors,
        HBA=acceptors,
        TPSA=round(polar_area, 2),
        Rotatable_Bonds=rot_bonds,
        Lipinski_Pass=passes_ro5,
        Lead_Like_Pass=passes_lead_like,
    )
    return profile

def load_uploaded_smi(prompt_text):
    """Ask for a file upload in Colab and parse it into named RDKit molecules.

    The uploaded file is read as a delimited table with the delimiter
    auto-detected by pandas. The SMILES column is the first column whose
    header contains "smiles" (case-insensitive); if none matches, the first
    column is used. Rows whose SMILES cannot be parsed are skipped.

    Each molecule's ``_Name`` property is taken from the second column, or
    from the first column when the second column is the SMILES column itself.
    If the table has a single column, or the name cell is empty, the name
    falls back to ``ID_<row index>``.

    Args:
        prompt_text: Heading printed before the upload prompt.

    Returns:
        ``(mols, filename)`` where ``mols`` is the list of parsed molecules
        and ``filename`` is the name of the first uploaded file, or
        ``(None, None)`` when nothing was uploaded or parsing failed.
    """
    print(f"\n--- {prompt_text} ---")
    uploaded = files.upload()
    if not uploaded:
        return None, None

    # Only the first uploaded file is processed.
    filename, content = next(iter(uploaded.items()))

    try:
        # NOTE(review): read_csv infers a header from the first row, so a
        # headerless .smi file would lose its first molecule to the column
        # names -- confirm the expected input format.
        df = pd.read_csv(io.BytesIO(content), sep=None, engine='python')

        smiles_matches = [c for c in df.columns if 'smiles' in str(c).lower()]
        smiles_col = smiles_matches[0] if smiles_matches else df.columns[0]

        # Pick the name column. The second column is preferred, but it must
        # never be the SMILES column, otherwise every molecule would be named
        # after its own SMILES string.
        name_col = None
        if len(df.columns) > 1:
            if df.columns[1] != smiles_col:
                name_col = df.columns[1]
            else:
                name_col = df.columns[0]

        mols = []
        for idx, row in df.iterrows():
            m = Chem.MolFromSmiles(str(row[smiles_col]))
            if not m:
                continue
            if name_col is not None and pd.notna(row[name_col]):
                name = str(row[name_col])
            else:
                # No usable name cell: fall back to a row-based identifier
                # instead of the literal string "nan".
                name = f"ID_{idx}"
            m.SetProp("_Name", name)
            mols.append(m)

        print(f"‚úÖ Loaded {len(mols)} valid molecules.")
        return mols, filename
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller stop.
        print(f"‚ùå Error parsing file: {e}")
        return None, None

# --- SCREENING PARAMETERS ---
# Collected here so the selection size and clustering settings can be changed
# in one place; with these defaults every message and file name is unchanged.
N_SELECT = 500         # number of compounds in the final selection
POOL_SIZE = 1000       # most-similar compounds passed on to clustering
BUTINA_CUTOFF = 0.35   # distance threshold given to Butina.ClusterData
FP_RADIUS = 2          # Morgan fingerprint radius
FP_BITS = 2048         # Morgan fingerprint length in bits
N_PREVIEW = 12         # lead-like hits drawn in the final grid image

# --- STEP 1: UPLOAD KNOWN ACTIVES ---
known_mols, _ = load_uploaded_smi("STEP 1: UPLOAD KNOWN ACTIVES")
if known_mols:
    known_fps = [
        AllChem.GetMorganFingerprintAsBitVect(m, FP_RADIUS, nBits=FP_BITS)
        for m in known_mols
    ]

    # --- STEP 2: UPLOAD BLIND DATASET ---
    blind_mols, _ = load_uploaded_smi("STEP 2: UPLOAD BLIND DATASET")

    if blind_mols:
        # --- STEP 3: SIMILARITY SEARCH ---
        # Score each blind compound by its highest Tanimoto similarity to any
        # known active.
        print("\nSTEP 3: Running Tanimoto Similarity Search...")
        blind_results = []
        for m in tqdm(blind_mols):
            fp = AllChem.GetMorganFingerprintAsBitVect(m, FP_RADIUS, nBits=FP_BITS)
            sims = DataStructs.BulkTanimotoSimilarity(fp, known_fps)
            blind_results.append({
                'SMILES': Chem.MolToSmiles(m),
                'ID': m.GetProp("_Name"),
                'mol_obj': m,
                'fp': fp,
                'similarity': max(sims),
            })

        df = pd.DataFrame(blind_results)
        df = df.sort_values(by='similarity', ascending=False).reset_index(drop=True)

        # --- STEP 4: DIVERSITY CLUSTERING ---
        print(f"\nSTEP 4: Selecting {N_SELECT} Diverse Candidates...")
        top_pool = df.head(POOL_SIZE).copy()
        fps_to_cluster = list(top_pool['fp'])

        # Flattened lower-triangle distance list (1 - Tanimoto), built row by
        # row and passed to Butina.ClusterData with isDistData=True.
        dists = []
        nfps = len(fps_to_cluster)
        for i in range(1, nfps):
            sims = DataStructs.BulkTanimotoSimilarity(fps_to_cluster[i], fps_to_cluster[:i])
            dists.extend(1 - x for x in sims)

        clusters = Butina.ClusterData(dists, nfps, BUTINA_CUTOFF, isDistData=True)

        # Take the first member of each cluster, up to N_SELECT of them.
        selected_indices = [cluster[0] for cluster in clusters[:N_SELECT]]

        # Fewer clusters than requested: top up with the remaining pool
        # members in similarity order.
        if len(selected_indices) < N_SELECT:
            already_picked = set(selected_indices)
            for i in range(nfps):
                if len(selected_indices) == N_SELECT:
                    break
                if i not in already_picked:
                    selected_indices.append(i)

        final_selection = top_pool.iloc[selected_indices].copy()

        # --- STEP 5: PHYSICOCHEMICAL ANALYSIS ---
        print("\nSTEP 5: Calculating Properties & Filtering Lead-Like Hits...")
        physchem_list = [calculate_physchem(m) for m in final_selection['mol_obj']]

        physchem_df = pd.DataFrame(physchem_list)
        extended_df = pd.concat([final_selection.reset_index(drop=True), physchem_df], axis=1)
        # Molecule and fingerprint objects are not written to the spreadsheets.
        extended_df = extended_df.drop(columns=['mol_obj', 'fp'])

        # Keep only the compounds that pass the lead-like criteria.
        lead_hits_df = extended_df[extended_df['Lead_Like_Pass']].copy()

        # --- STEP 6: EXPORT ALL FILES ---
        excel_fn = f"final_{N_SELECT}_selection.xlsx"
        physchem_fn = f"final_{N_SELECT}_physchem_full.xlsx"
        filtered_fn = "final_top_lead_like_hits.xlsx"
        txt_fn = f"selection_summary_{N_SELECT}.txt"

        # Export Excel files
        final_selection[['SMILES', 'ID', 'similarity']].to_excel(excel_fn, index=False)
        extended_df.to_excel(physchem_fn, index=False)
        lead_hits_df.to_excel(filtered_fn, index=False)

        # The pass rate is relative to the number of compounds actually
        # selected, which is below N_SELECT when the blind set is small.
        n_selected = len(extended_df)
        if n_selected:
            lipinski_rate = extended_df['Lipinski_Pass'].sum() / n_selected * 100
        else:
            lipinski_rate = 0.0

        # Export text summary
        with open(txt_fn, 'w', encoding='utf-8') as f:
            f.write(f"=== VIRTUAL SCREENING SUMMARY (TOP {N_SELECT}) ===\n")
            f.write(f"Total Initial Pool: {n_selected}\n")
            f.write(f"Lead-Like Hits (Filtered): {len(lead_hits_df)}\n")
            f.write(f"Lipinski Pass Rate: {lipinski_rate:.1f}%\n\n")
            f.write(extended_df[['ID', 'similarity', 'MW', 'LogP', 'Lead_Like_Pass']].head(20).to_string(index=False))

        # Backup to Drive (only the PhysChem and filtered spreadsheets)
        if SAVE_TO_DRIVE:
            for d, name in [(extended_df, physchem_fn), (lead_hits_df, filtered_fn)]:
                d.to_excel(f"/content/drive/MyDrive/{name}", index=False)

        # Download files
        for f_name in [excel_fn, physchem_fn, filtered_fn, txt_fn]:
            files.download(f_name)

        print(f"\n‚úÖ PROCESS COMPLETE. Created 4 files.")
        print(f"üíé Filtered 'Lead-Like' Hits Found: {len(lead_hits_df)}")

        # Visual summary: draw the first few lead-like hits, if there are any.
        preview = lead_hits_df.head(N_PREVIEW)
        if not preview.empty:
            img = Draw.MolsToGridImage(
                [Chem.MolFromSmiles(s) for s in preview['SMILES']],
                molsPerRow=4,
                legends=list(preview['ID']),
            )
            display(img)