# This script fetches kinase inhibitor data from the ChEMBL database and saves it as a CSV file

In [6]:
#Import Packages
import requests
import pandas as pd
from tqdm import tqdm
import os

In [None]:
def fetch_kinase_inhibitors(limit=300, timeout=30):
    """
    Fetch kinase inhibitor data from ChEMBL database

    Pages through the ChEMBL target listing, keeps targets whose preferred
    name contains "kinase", downloads IC50/Ki/Kd activities for each of them,
    and writes one row per unique molecule to
    ``data/step1_kinase_inhibitors_raw.csv`` (the ``data`` directory is
    created if needed).

    Args:
        limit: Stop searching for targets once at least this many kinase
            targets have been collected. Pages are processed whole, so the
            final count may exceed ``limit``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A pandas DataFrame with columns ``target_chembl_id``, ``target_name``,
        ``molecule_chembl_id``, ``activity_type``, ``activity_value`` and
        ``activity_units``, or ``None`` when no matching activity was found.
    """
    print("Starting kinase inhibitor data collection...")

    # Step 1: Search for kinase targets by keyword
    kinase_targets = _search_kinase_targets(limit, timeout)
    print(f"Found {len(kinase_targets)} kinase targets")

    # Step 2: Get activities for kinase targets
    molecule_records = []
    for target in tqdm(kinase_targets, desc="Fetching activities"):
        if not target.get('target_chembl_id'):
            continue
        molecule_records.extend(_fetch_target_activities(target, timeout))

    # Step 3: Create DataFrame and remove duplicates
    if not molecule_records:
        print("No kinase inhibitor data found.")
        return None

    df = pd.DataFrame(molecule_records)
    # Only the first record seen for each molecule is kept; measurements of
    # the same molecule against later targets are discarded here.
    df = df.drop_duplicates(subset=["molecule_chembl_id"])

    print(f"Retrieved {len(df)} unique kinase inhibitor records.")

    # Save to CSV
    os.makedirs("data", exist_ok=True)
    df.to_csv("data/step1_kinase_inhibitors_raw.csv", index=False)

    return df


def _search_kinase_targets(limit, timeout, page_size=100, max_offset=2000):
    """Page through ChEMBL targets and return those named like a kinase."""
    kinase_targets = []
    offset = 0

    # max_offset bounds the scan so a sparse result set cannot loop forever.
    while len(kinase_targets) < limit and offset < max_offset:
        target_url = (
            "https://www.ebi.ac.uk/chembl/api/data/target.json"
            f"?limit={page_size}&offset={offset}"
        )
        print(f"Searching targets... offset: {offset}")

        try:
            response = requests.get(target_url, timeout=timeout)
            if response.status_code != 200:
                print(f"Error fetching targets: {response.status_code}")
                break
            targets = response.json().get('targets', [])
        except (requests.RequestException, ValueError) as e:
            # Network failure or a body that is not valid JSON.
            print(f"Error at offset {offset}: {e}")
            break

        if not targets:
            break

        # pref_name can be present but null, so normalise it before lowering;
        # otherwise one unnamed target would abort the whole search.
        kinase_targets.extend(
            target for target in targets
            if 'kinase' in (target.get('pref_name') or '').lower()
        )
        offset += page_size

    return kinase_targets


def _fetch_target_activities(target, timeout, page_size=100, max_records=100):
    """Return IC50/Ki/Kd activity records for one target, fetched page by page."""
    target_chembl_id = target['target_chembl_id']
    target_name = target.get('pref_name') or 'Unknown'
    wanted_types = ("IC50", "Ki", "Kd")
    records = []
    act_offset = 0

    # The cap is checked between pages, so a target can contribute more than
    # max_records rows when the final page pushes it over.
    while len(records) < max_records:
        activity_url = (
            "https://www.ebi.ac.uk/chembl/api/data/activity.json"
            f"?target_chembl_id={target_chembl_id}"
            f"&limit={page_size}&offset={act_offset}"
        )

        try:
            acts_response = requests.get(activity_url, timeout=timeout)
            if acts_response.status_code != 200:
                print(
                    f"Error fetching activities for {target_chembl_id}: "
                    f"{acts_response.status_code}"
                )
                break
            activities = acts_response.json().get("activities", [])
        except (requests.RequestException, ValueError) as e:
            print(f"Error fetching activities for {target_chembl_id}: {e}")
            break

        if not activities:
            break

        for act in activities:
            if (act.get("molecule_chembl_id")
                    and act.get("standard_type") in wanted_types
                    and act.get("standard_value") is not None):
                records.append({
                    "target_chembl_id": target_chembl_id,
                    "target_name": target_name,
                    "molecule_chembl_id": act["molecule_chembl_id"],
                    "activity_type": act.get("standard_type"),
                    "activity_value": act.get("standard_value"),
                    "activity_units": act.get("standard_units"),
                })

        act_offset += page_size

    return records

# Run the collection when executed directly; `result` is left at module
# scope so that later code can inspect it.
if __name__ == "__main__":
    result = fetch_kinase_inhibitors(limit=300)

    if result is None:
        # The fetch found no usable activity records.
        print("\n Failed to retrieve data")
    else:
        print(f"\n Successfully retrieved {len(result)} kinase inhibitor records")
        print(" Data saved to 'data/step1_kinase_inhibitors_raw.csv'")

Starting kinase inhibitor data collection...
Searching targets... offset: 0
Searching targets... offset: 100
Searching targets... offset: 100
Searching targets... offset: 200
Searching targets... offset: 200
Searching targets... offset: 300
Searching targets... offset: 300
Searching targets... offset: 400
Searching targets... offset: 400
Searching targets... offset: 500
Searching targets... offset: 500
Searching targets... offset: 600
Searching targets... offset: 600
Searching targets... offset: 700
Searching targets... offset: 700
Searching targets... offset: 800
Searching targets... offset: 800
Searching targets... offset: 900
Searching targets... offset: 900
Searching targets... offset: 1000
Searching targets... offset: 1000
Searching targets... offset: 1100
Searching targets... offset: 1100
Searching targets... offset: 1200
Searching targets... offset: 1200
Searching targets... offset: 1300
Searching targets... offset: 1300
Searching targets... offset: 1400
Searching targets... off

Fetching activities: 100%|██████████| 224/224 [10:23<00:00,  2.79s/it]

Retrieved 10587 unique kinase inhibitor records.

✓ Successfully retrieved 10587 kinase inhibitor records
✓ Data saved to 'data/step1_kinase_inhibitors_raw.csv'





In [None]:
# Summarise the collected dataset: shape, columns, a preview, and basic
# distributions, then confirm that the CSV exists on disk.
if result is None:
    print("No data retrieved.")
else:
    print(f"Dataset shape: {result.shape}")
    print(f"Columns: {list(result.columns)}")
    print("\nFirst 5 rows:")
    print(result.head(5))

    print("\nActivity types distribution:")
    type_counts = result['activity_type'].value_counts()
    print(type_counts)

    print("\nTarget names (first 10):")
    target_counts = result['target_name'].value_counts()
    print(target_counts.head(10))

    print("\nActivity value statistics:")
    # Convert the column to numeric in place; values that cannot be parsed
    # become NaN because of errors='coerce'.
    result['activity_value'] = pd.to_numeric(result['activity_value'], errors='coerce')
    print(result['activity_value'].describe())

    # Check if file was saved
    import os
    print(
        "\n Data successfully saved to 'data/step1_kinase_inhibitors_raw.csv'"
        if os.path.exists("data/step1_kinase_inhibitors_raw.csv")
        else "\n Data file not found"
    )

Dataset shape: (10587, 6)
Columns: ['target_chembl_id', 'target_name', 'molecule_chembl_id', 'activity_type', 'activity_value', 'activity_units']

First 5 rows:
  target_chembl_id                  target_name molecule_chembl_id  \
0       CHEMBL1862  Tyrosine-protein kinase ABL       CHEMBL281470   
1       CHEMBL1862  Tyrosine-protein kinase ABL        CHEMBL13462   
3       CHEMBL1862  Tyrosine-protein kinase ABL       CHEMBL414123   
5       CHEMBL1862  Tyrosine-protein kinase ABL       CHEMBL413629   
7       CHEMBL1862  Tyrosine-protein kinase ABL       CHEMBL301845   

  activity_type activity_value activity_units  
0          IC50       725000.0             nM  
1          IC50         4000.0             nM  
3          IC50        15000.0             nM  
5          IC50         1800.0             nM  
7          IC50        15000.0             nM  

Activity types distribution:
activity_type
IC50    8810
Ki      1290
Kd       487
Name: count, dtype: int64

Target names (first 