<a href="https://colab.research.google.com/github/Mohaammed-Fouad/Ligand-Based-Virtual-Screening/blob/main/Ligand_Based_Virtual_Screening.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install rdkit scikit-learn pandas

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the input files read further down
# live under "/content/drive/My Drive/CADD_Test/".
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.svm import OneClassSVM

def load_mols_from_txt(file_path):
    """Parse a tab-separated text file of SMILES strings and compound IDs.

    Each usable line holds the SMILES in the first column and the compound ID
    in the second; any further columns are ignored. Blank lines are skipped.
    Lines with fewer than two columns, or whose SMILES RDKit cannot parse,
    are dropped, and one warning reports how many were dropped so compounds
    do not silently disappear from the screen.

    Args:
        file_path: Path of the text file to read. It is decoded as UTF-8 and
            a leading byte-order mark, if present, is discarded.

    Returns:
        A ``(mols, ids)`` tuple of two equal-length lists: the parsed RDKit
        molecules and their IDs, in file order.
    """
    import warnings  # function-scope import keeps this definition self-contained

    mols = []
    ids = []
    dropped = 0
    # 'utf-8-sig' reads plain UTF-8 too, but also strips a BOM that would
    # otherwise be glued to the first SMILES and make it unparseable.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            # Strip each field so stray spaces around the tab do not end up
            # inside the SMILES or the ID.
            parts = [field.strip() for field in stripped.split('\t')]
            if len(parts) < 2:
                dropped += 1
                continue
            smiles, mol_id = parts[0], parts[1]
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                dropped += 1
                continue
            mols.append(mol)
            ids.append(mol_id)
    if dropped:
        warnings.warn(
            f"{file_path}: skipped {dropped} line(s) that were malformed "
            "or had an unparseable SMILES",
            stacklevel=2,
        )
    return mols, ids

def get_fingerprints(mols, radius=2, n_bits=2048):
    """Convert molecules to Morgan bit-vector fingerprints (ECFP4 by default).

    Uses RDKit's fingerprint-generator API in place of the deprecated
    ``AllChem.GetMorganFingerprintAsBitVect``, and builds the generator once
    for the whole batch.

    Args:
        mols: Iterable of RDKit molecules.
        radius: Morgan radius; the default of 2 corresponds to ECFP4.
        n_bits: Length of the folded bit vector.

    Returns:
        A list with one bit-vector fingerprint per molecule, in input order.
    """
    mols = list(mols)
    if not mols:
        return []
    # Imported here so the function works without touching the file's
    # top-level import cell.
    from rdkit.Chem import rdFingerprintGenerator

    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    return [generator.GetFingerprint(m) for m in mols]

# 1. Load the data
# Both inputs go through load_mols_from_txt, which expects tab-separated
# lines with the SMILES first and the compound ID second.
print("Loading files...")
active_mols, active_ids = load_mols_from_txt("/content/drive/My Drive/CADD_Test/Actives.txt")
blind_mols, blind_ids = load_mols_from_txt("/content/drive/My Drive/CADD_Test/Blind.txt")

# Stop here with a clear message if either set came back empty. Otherwise
# the similarity search below calls max() on an empty list (no actives), or
# the SVM step is handed an empty matrix, and the resulting error does not
# point back at the input files.
if not active_mols:
    raise ValueError("No valid molecules were loaded from Actives.txt")
if not blind_mols:
    raise ValueError("No valid molecules were loaded from Blind.txt")

# 2. Generate Fingerprints
active_fps = get_fingerprints(active_mols)
blind_fps = get_fingerprints(blind_mols)

# 3. Method A: Tanimoto Similarity Search
# Score every blind compound by its nearest neighbour among the actives,
# i.e. the highest Tanimoto coefficient against any active fingerprint.
print("Calculating Tanimoto Similarities...")
max_similarities = [
    max(DataStructs.BulkTanimotoSimilarity(query_fp, active_fps))
    for query_fp in blind_fps
]

# 4. Method B: One-Class SVM (Machine Learning)
# Fit a boundary around the actives only, then check which blind compounds
# fall inside it.
print("Training One-Class SVM...")
# Expand each bit vector into a row of 0/1 values: one row per molecule.
X_train = np.array(list(map(list, active_fps)))
X_blind = np.array(list(map(list, blind_fps)))

# RBF kernel; nu is the estimated share of outliers among the training actives.
clf = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)
clf.fit(X_train)

# Score: higher values mean the molecule sits deeper inside the active cluster.
ml_scores = clf.score_samples(X_blind)
# Label: 1 = active-like, -1 = outlier / inactive-like.
ml_predictions = clf.predict(X_blind)

# 5. Compile and Rank Results
# One row per blind compound, combining the similarity search (Method A)
# with the One-Class SVM output (Method B).
results = pd.DataFrame({
    'ID': blind_ids,
    'SMILES': [Chem.MolToSmiles(m) for m in blind_mols],
    'Max_Tanimoto_Sim': max_similarities,
    'ML_Cluster_Score': ml_scores,
    # The SVM labels inliers 1 and outliers -1; store that as a boolean.
    'Is_Active_Like': np.asarray(ml_predictions) == 1,
})

# Rank before selecting the golden hits, so the hit subset (and anything
# exported from it later) follows the same best-first order as the full table.
results = results.sort_values(by='Max_Tanimoto_Sim', ascending=False)

# Identify 'Golden Hits' (High similarity AND positive ML prediction)
# Generally, Tanimoto > 0.7 is considered highly similar
TANIMOTO_THRESHOLD = 0.7
golden_hits = results[
    (results['Max_Tanimoto_Sim'] > TANIMOTO_THRESHOLD) & results['Is_Active_Like']
]

# Save results
results.to_csv('Blind_Activity_Predictions.csv', index=False)

print("\nAnalysis Complete!")
print(f"Total blind compounds analyzed: {len(blind_ids)}")
print(f"Potential 'Golden Hits' found: {len(golden_hits)}")
print("Results saved to 'Blind_Activity_Predictions.csv'")

In [None]:
# Write the golden-hit subset to its own CSV, without the DataFrame index column.
golden_csv = 'Golden_Hits.csv'
golden_hits.to_csv(golden_csv, index=False)
print(f"Golden hits saved to '{golden_csv}'")

Golden hits saved to 'Golden_Hits.csv'


In [None]:
from rdkit import Chem

# Export the golden hits as an SD file: one record per hit, named by its ID
# and carrying the screening scores as SD properties.
sdf_file = 'Golden_Hits.sdf'

# The context manager closes (and flushes) the writer even if a record
# raises part-way through, so a failure cannot leave a dangling open file.
with Chem.SDWriter(sdf_file) as writer:
    for _, row in golden_hits.iterrows():
        mol = Chem.MolFromSmiles(row['SMILES'])
        if mol is None:
            # Skip rows whose SMILES cannot be rebuilt into a molecule.
            continue
        # SetProp only accepts strings, so every value is converted explicitly.
        mol.SetProp('_Name', str(row['ID']))
        mol.SetProp('Max_Tanimoto_Sim', str(row['Max_Tanimoto_Sim']))
        mol.SetProp('ML_Cluster_Score', str(row['ML_Cluster_Score']))
        mol.SetProp('Is_Active_Like', str(row['Is_Active_Like']))
        writer.write(mol)

print(f"Golden hits successfully converted and saved to '{sdf_file}'")