In [None]:
from chembl_webresource_client.new_client import new_client
from chembl_webresource_client.settings import Settings
import pandas as pd

# ── SETUP & SAFETY NETS ────────────────────────────────────────────────
# Configure the ChEMBL client before any query is built: caching is switched
# off and TIMEOUT is raised to 120 (presumably seconds -- confirm against the
# client settings). The original author added the timeout to stop hung
# requests from leaving zombie processes behind.
Settings.Instance().CACHING = False
Settings.Instance().TIMEOUT = 120

# ChEMBL target identifier used to filter the activity query in STEP 1.
target_real = 'CHEMBL287'

# Endpoint handles used by the fetch steps below.
activity, molecule, mechanism = (
    new_client.activity,
    new_client.molecule,
    new_client.mechanism,
)

# ── STEP 1: Get all quantitative binding records ──────────────────────────
# Query every Ki / IC50 activity for the target that has a pChEMBL value,
# requesting only the fields listed in `activity_fields`.
print("1. Fetching quantitative binding records...")
activity_fields = [
    'molecule_chembl_id', 'molecule_pref_name', 'parent_molecule_chembl_id',
    'pchembl_value', 'standard_type', 'standard_value', 'standard_units',
]
activities = activity.filter(
    target_chembl_id=target_real,
    standard_type__in=['Ki', 'IC50'],
    pchembl_value__isnull=False,
).only(activity_fields)

activities_df = pd.DataFrame(list(activities))

# An empty result produces a frame with no columns, so the column lookup below
# would raise KeyError. Fall back to an empty frame that still carries the
# requested columns (same pattern as the mechanism step).
if activities_df.empty:
    print(f"WARNING: no activity records returned for target {target_real}")
    activities_df = pd.DataFrame(columns=activity_fields)

# Coerce pChEMBL to numeric; unparseable values become NaN.
activities_df['pchembl_value'] = pd.to_numeric(activities_df['pchembl_value'], errors='coerce')
activities_df.to_csv('gemini_activities3.csv', index=False)
print(f"Total activity records: {len(activities_df)}")

# ── STEP 2: Get unique parent IDs and their mean pChEMBL ─────────────────
print("2. Aggregating by Parent ID...")

# Distinct, non-null parent compound IDs; these drive the batched lookups in
# the molecule and mechanism steps.
parent_id_column = activities_df['parent_molecule_chembl_id']
parent_ids = parent_id_column.dropna().unique().tolist()

# One row per parent compound: mean and max pChEMBL plus the number of
# non-null pChEMBL values that went into them.
agg_df = activities_df.groupby('parent_molecule_chembl_id').agg(
    mean_pchembl=pd.NamedAgg(column='pchembl_value', aggfunc='mean'),
    best_pchembl=pd.NamedAgg(column='pchembl_value', aggfunc='max'),
    assay_count=pd.NamedAgg(column='pchembl_value', aggfunc='count'),
)
agg_df = agg_df.reset_index()
# Only the two pChEMBL summaries are rounded; the count is left untouched.
agg_df = agg_df.round({'mean_pchembl': 2, 'best_pchembl': 2})

# ── STEP 3: Fetch molecule metadata (Phase, name) ─────────────────────────
# Look up max_phase and pref_name for every parent ID, `batch_size` IDs per
# request so each `__in` filter stays short.
print("3. Fetching molecule metadata in batches...")
all_mols   = []
batch_size = 100  # also reused by the mechanism step
molecule_fields = ['molecule_chembl_id', 'max_phase', 'pref_name']

for i in range(0, len(parent_ids), batch_size):
    batch  = parent_ids[i : i + batch_size]
    result = molecule.filter(molecule_chembl_id__in=batch).only(molecule_fields)
    all_mols.extend(list(result))

mol_df = pd.DataFrame(all_mols)

# With no records the frame has no columns and the later merge on the renamed
# 'molecule_chembl_id' key would raise KeyError. Keep the expected columns so
# downstream steps see an empty-but-well-formed table (mirrors the mechanism
# step's guard).
if mol_df.empty:
    mol_df = pd.DataFrame(columns=molecule_fields)

mol_df.to_csv('gemini_molecule3.csv', index=False)


# ── STEP 4: Fetch mechanism labels ──────────────────────────────────────
print("4. Fetching mechanism labels in batches...")
all_mec = []

# The mechanism endpoint is filtered on molecule_chembl_id using the parent
# IDs. The original author reported that requesting parent_molecule_chembl_id
# here failed.
# NOTE(review): confirm against the current ChEMBL mechanism schema.
# NOTE(review): this query is not restricted to the target, so the action
# types returned may refer to other targets of the same molecule -- confirm
# that this is intended.
for start in range(0, len(parent_ids), batch_size):
    id_chunk = parent_ids[start : start + batch_size]
    mec_query = mechanism.filter(molecule_chembl_id__in=id_chunk)
    all_mec += list(mec_query.only(['molecule_chembl_id', 'action_type']))

mec_df = pd.DataFrame(all_mec)

# No records at all: keep the two expected columns so the rename and the
# column selection below still work.
if mec_df.empty:
    mec_df = pd.DataFrame(columns=['molecule_chembl_id', 'action_type'])

# Align the key name with agg_df so the merge in the next step can join on it.
mec_df = mec_df.rename(columns={'molecule_chembl_id': 'parent_molecule_chembl_id'})
mec_df.to_csv('gemini_mechanism3.csv', index=False)

# One action_type per molecule: only the first record encountered is kept.
mec_clean = mec_df.loc[:, ['parent_molecule_chembl_id', 'action_type']].drop_duplicates(
    subset='parent_molecule_chembl_id'
)
print(f"Mechanism records fetched: {len(mec_clean)}")

# ── STEP 5: Merge everything together ─────────────────────────────────────
print("5. Merging datasets...")

# Molecule metadata is keyed by molecule_chembl_id; rename it so all three
# tables share the same join key.
mol_by_parent = mol_df.rename(columns={'molecule_chembl_id': 'parent_molecule_chembl_id'})

# Left joins keep every aggregated compound even when it has no metadata or
# mechanism record.
merged_df = pd.merge(agg_df, mol_by_parent, on='parent_molecule_chembl_id', how='left')
merged_df = pd.merge(merged_df, mec_clean, on='parent_molecule_chembl_id', how='left')

# Normalise the columns the filters in the next steps compare against.
merged_df['max_phase']   = pd.to_numeric(merged_df['max_phase'], errors='coerce')
merged_df['action_type'] = merged_df['action_type'].str.upper().str.strip()

# ── STEP 6: Filter to AGONISTS and MODULATORS only ───────────────────────
agonist_types = ['AGONIST', 'MODULATOR', 'POSITIVE ALLOSTERIC MODULATOR', 'PARTIAL AGONIST']
is_agonist_like = merged_df['action_type'].isin(agonist_types)
agonists_df = merged_df.loc[is_agonist_like].copy()

# ── STEP 7: Filter to Phase 4 only ───────────────────────────────────────
# Keep rows whose max_phase equals 4, ranked by mean pChEMBL (highest first).
is_phase4 = agonists_df['max_phase'].eq(4)
phase4_agonists = agonists_df.loc[is_phase4]
phase4_agonists = phase4_agonists.sort_values('mean_pchembl', ascending=False)
phase4_agonists = phase4_agonists.reset_index(drop=True)

# Provenance columns; the same two columns exist on the knowledge-guided rows.
phase4_agonists = phase4_agonists.assign(
    selection_basis='Identified via ChEMBL quantitative binding pipeline',
    key_reference='ChEMBL Database',
)

# ── KNOWLEDGE-GUIDED CANDIDATES ───────────────────────────────────────────
# Hand-curated compounds added regardless of what the ChEMBL pipeline found.
# NOTE(review): the rationale and citation strings below are taken as given;
# verify them against the primary literature before publishing.
print("6. Injecting Knowledge-Guided Candidates...")
kg_columns = [
    'pref_name', 'parent_molecule_chembl_id', 'max_phase',
    'action_type', 'selection_basis', 'key_reference',
]
kg_rows = [
    (
        'PENTAZOCINE', 'CHEMBL60542', 4, 'AGONIST',
        'Prototypical sigma-1R reference agonist; human crystal structure solved (PDB: 6DK1)',
        'Huang et al., 2017, Cell',
    ),
    (
        'DEXTROMETHORPHAN', 'CHEMBL52440', 4, 'AGONIST',
        'Sigma-1R agonism mediates neuroprotection',
        'Nguyen et al., 2014, Trends Pharmacol Sci',
    ),
    (
        'DONEPEZIL', 'CHEMBL502', 4, 'AGONIST',
        'IC50=14.6nM at sigma-1R; 93% receptor occupancy',
        'Meunier et al., 2006, Br J Pharmacol',
    ),
    (
        'FLUVOXAMINE', 'CHEMBL814', 4, 'AGONIST',
        'Highest sigma-1R affinity among all SSRIs (Ki=36nM)',
        'Hashimoto et al., 2009, Biol Psychiatry',
    ),
]
knowledge_guided = pd.DataFrame(kg_rows, columns=kg_columns)

# Attach the measured pChEMBL summary where the compound appears in agg_df;
# compounds without activity records keep NaN in these three columns.
pchembl_lookup = agg_df.set_index('parent_molecule_chembl_id').loc[
    :, ['mean_pchembl', 'best_pchembl', 'assay_count']
]
knowledge_guided = pd.merge(
    knowledge_guided, pchembl_lookup, on='parent_molecule_chembl_id', how='left'
)

# ── THE GRAND MERGE & ELIMINATION ─────────────────────────────────────────
print("7. Generating Final Master List & Extracting Top 5 Docking Candidates...")

# Knowledge-guided rows are listed first so that, when a compound appears in
# both tables, drop_duplicates(keep='first') retains the curated row.
master_df = pd.concat([knowledge_guided, phase4_agonists], ignore_index=True)
master_df = master_df.drop_duplicates(subset='parent_molecule_chembl_id', keep='first')
master_df = master_df.sort_values('mean_pchembl', ascending=False).reset_index(drop=True)

# Rows whose reference is a literature citation ("... et al ...") are the
# curated ones; everything else came from the ChEMBL pipeline.
master_df['source'] = master_df['key_reference'].apply(
    lambda x: 'Knowledge Guided' if 'et al' in str(x) else 'Pipeline Derived'
)
master_df.to_csv('Isoradix_Master_Candidates.csv', index=False)

# Selection parameters for the docking shortlist.
n_docking_candidates = 5   # the shortlist is the "Final 5"
min_mean_pchembl = 6
# Compounds excluded by name; the reason is not recorded in this script.
excluded_names = ['FENTANYL', 'BREXPIPRAZOLE', 'TESTOSTERONE', 'LASMIDITAN', 'RAMELTEON']
# Pentazocine is held out so it can serve as the isolated positive control.
positive_control_names = ['PENTAZOCINE', '(+)-PENTAZOCINE']

# Isolate the Final 5. The original took head(6), which returned six rows and
# contradicted the banner, the variable name and the output file name.
final_5 = master_df[
    (master_df['mean_pchembl'] >= min_mean_pchembl) &
    (~master_df['pref_name'].isin(excluded_names)) &
    (~master_df['pref_name'].isin(positive_control_names))
].sort_values('mean_pchembl', ascending=False).head(n_docking_candidates).reset_index(drop=True)

print("\n" + "="*60)
print("FINAL 5 DOCKING CANDIDATES")
print("="*60)
print(final_5[['pref_name', 'mean_pchembl', 'action_type', 'source']].to_string(index=False))

final_5.to_csv('Isoradix_Final_5_Docking.csv', index=False)
print("\n✅ Pipeline Complete! 'Isoradix_Final_5_Docking.csv' saved.")

  __version__ = __import__('pkg_resources').get_distribution('chembl_webresource_client').version


1. Fetching quantitative binding records...
