In [None]:
# From the Step 1 output (step1_kinase_inhibitors_raw.csv) we already have
# the target_chembl_id of every kinase target.
# In this step we:
# 1. Map each target_chembl_id to its UniProt accession
# 2. Fetch the protein sequence (FASTA) for that accession from UniProt
# 3. Save the results as both a .csv and a .fasta file

In [2]:
#Import Packages
import requests
import pandas as pd
import time
from tqdm import tqdm
import os

In [7]:
# Function to fetch FASTA sequences using UniProt
def fetch_sequence_from_uniprot(accession):
    """Fetch the amino-acid sequence for a UniProt accession.

    Downloads the entry's FASTA record from the UniProt REST API, drops the
    header line and joins the wrapped sequence lines into a single string.

    Args:
        accession: UniProt accession (e.g. "P00519").

    Returns:
        The sequence as one string, or None when the accession is empty, the
        request fails, the server does not answer with HTTP 200, or the
        response body is not a FASTA record with at least one sequence line.
    """
    if not accession:
        # An empty accession cannot identify an entry; skip the network call.
        return None

    # Documented REST endpoint (the legacy www.uniprot.org/uniprot/ path only
    # redirects here).
    url = f"https://rest.uniprot.org/uniprotkb/{accession}.fasta"
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Best effort: network problems mean "no sequence", but unlike a bare
        # `except:` this still lets KeyboardInterrupt stop a long batch run.
        return None

    if response.status_code != 200:
        return None

    lines = response.text.strip().splitlines()
    # Require a real FASTA record: a ">" header followed by sequence lines.
    # This rejects HTML/plain-text error pages served with status 200.
    if len(lines) < 2 or not lines[0].startswith(">"):
        return None

    # splitlines() + strip() also removes "\r" left by CRLF line endings.
    sequence = "".join(line.strip() for line in lines[1:])
    return sequence or None


def fetch_kinase_fasta_sequences(target_ids, target_name_dict=None, delay=0.5):
    """
    Fetch FASTA protein sequences for kinase targets using UniProt.

    For every ChEMBL target ID the target record is requested from the ChEMBL
    web services. Its components are scanned in order, and the first component
    whose UniProt accession yields a sequence longer than 50 residues supplies
    the row for that target.

    Args:
        target_ids: Sized iterable of ChEMBL target IDs (e.g. "CHEMBL1862").
        target_name_dict: Optional mapping of target ID -> target name.
            Targets missing from it (or every target when it is None) are
            named "Unknown".
        delay: Seconds to sleep after each target, whether or not it
            succeeded, to throttle requests to the remote services.

    Returns:
        pandas.DataFrame with the columns target_chembl_id, target_name,
        uniprot_accession, fasta_sequence and sequence_length; one row per
        target for which a sequence was retrieved. Targets that fail are
        skipped (never raised) and listed in the printed summary. The columns
        are present even when no rows were collected.
    """
    base_url = "https://www.ebi.ac.uk/chembl/api/data/target/{}"
    headers = {"Accept": "application/json"}
    columns = [
        "target_chembl_id",
        "target_name",
        "uniprot_accession",
        "fasta_sequence",
        "sequence_length",
    ]
    min_sequence_length = 50  # shorter hits are ignored
    name_lookup = target_name_dict or {}
    results = []
    skipped = []  # (target_id, reason) pairs, reported after the loop

    print(f"Fetching sequences via UniProt for {len(target_ids)} targets...")

    # Only build a progress bar when there is something to iterate over.
    if len(target_ids) > 0:
        progress = tqdm(target_ids, desc="Fetching via UniProt")
    else:
        progress = ()

    for target_id in progress:
        try:
            # Get target info from ChEMBL
            r = requests.get(base_url.format(target_id), headers=headers, timeout=10)
            if r.status_code != 200:
                skipped.append((target_id, f"ChEMBL HTTP {r.status_code}"))
                continue

            # `or []` covers a response where the key is present but null.
            components = r.json().get("target_components") or []

            record = None
            for comp in components:
                accession = comp.get("accession")
                if not accession or accession == "UNKNOWN":
                    continue
                # Fetch sequence from UniProt
                sequence = fetch_sequence_from_uniprot(accession)
                if sequence and len(sequence) > min_sequence_length:
                    record = {
                        "target_chembl_id": target_id,
                        "target_name": name_lookup.get(target_id, "Unknown"),
                        "uniprot_accession": accession,
                        "fasta_sequence": sequence,
                        "sequence_length": len(sequence),
                    }
                    break  # the first usable component is enough

            if record is None:
                skipped.append((target_id, "no usable UniProt sequence"))
            else:
                results.append(record)

        except Exception as exc:
            # Deliberately best-effort: one bad target must not abort the
            # batch, but the failure is recorded rather than silently dropped.
            skipped.append((target_id, f"{type(exc).__name__}: {exc}"))
        finally:
            # Throttle after failures too; the `continue` above lands here.
            time.sleep(delay)

    print(f"✓ Successfully fetched {len(results)} sequences via UniProt")
    if skipped:
        print(f"✗ Skipped {len(skipped)} targets:")
        for target_id, reason in skipped[:10]:
            print(f"    {target_id}: {reason}")
        if len(skipped) > 10:
            print(f"    ... and {len(skipped) - 10} more")

    return pd.DataFrame(results, columns=columns)


if __name__ == "__main__":
    # Driver: load the step-1 targets, fetch their sequences, then write the
    # results as a CSV table and a FASTA file under data/.
    os.makedirs("data", exist_ok=True)

    # Load unique kinase targets from step1 output
    print("Loading kinase targets from step1 data...")
    df = pd.read_csv("data/step1_kinase_inhibitors_raw.csv")

    # Create a mapping of target_chembl_id to target_name
    # (if an ID appears with several names, the last one read wins).
    target_mapping = df[['target_chembl_id', 'target_name']].drop_duplicates()
    target_name_dict = dict(zip(target_mapping['target_chembl_id'], target_mapping['target_name']))

    target_ids = df["target_chembl_id"].unique()
    print(f"Unique target_chembl_ids: {len(target_ids)}")
    print(f"Example IDs: {target_ids[:5]}")
    print(f"Example names: {[target_name_dict.get(tid, 'Unknown') for tid in target_ids[:5]]}")

    # Fetch FASTA sequences using UniProt
    print("\nFetching FASTA sequences via UniProt...")
    fasta_df = fetch_kinase_fasta_sequences(target_ids, target_name_dict)

    if len(fasta_df) > 0:
        # Save results
        fasta_df.to_csv("data/step3_kinase_target_fasta.csv", index=False)
        print(f"\n✓ Saved {len(fasta_df)} sequences to 'data/step3_kinase_target_fasta.csv'")

        # Save to FASTA format. The encoding is explicit so that target names
        # with non-ASCII characters are written the same way on every
        # platform, matching the UTF-8 default pandas uses for the CSV above.
        with open("data/step3_kinase_target_sequences.fasta", "w", encoding="utf-8") as f:
            for row in fasta_df.itertuples(index=False):
                # Header: >accession|chembl_id|name|Length:n, then the
                # sequence on one line.
                f.write(f">{row.uniprot_accession}|{row.target_chembl_id}|{row.target_name}|Length:{row.sequence_length}\n")
                f.write(f"{row.fasta_sequence}\n")

        print("✓ FASTA format saved to 'data/step3_kinase_target_sequences.fasta'")

        # Show results summary
        print("\nResults summary:")
        print(f"Total sequences: {len(fasta_df)}")
        print(f"Average sequence length: {fasta_df['sequence_length'].mean():.0f}")
        print(f"Sequence length range: {fasta_df['sequence_length'].min()} - {fasta_df['sequence_length'].max()}")
        print("\nFirst 5 targets:")
        print(fasta_df[['target_chembl_id', 'target_name', 'uniprot_accession', 'sequence_length']].head())

    else:
        print("\n✗ No sequences found.")


Loading kinase targets from step1 data...
Unique target_chembl_ids: 188
Example IDs: ['CHEMBL1862' 'CHEMBL1824' 'CHEMBL1820' 'CHEMBL3629' 'CHEMBL2160']
Example names: ['Tyrosine-protein kinase ABL', 'Receptor protein-tyrosine kinase erbB-2', 'Thymidine kinase', 'Casein kinase II alpha', 'Nucleoside diphosphate kinase 2']

Fetching FASTA sequences via UniProt...
Fetching sequences via UniProt for 188 targets...


Fetching via UniProt: 100%|██████████| 188/188 [07:02<00:00,  2.25s/it]

✓ Successfully fetched 188 sequences via UniProt

✓ Saved 188 sequences to 'data/step3_kinase_target_fasta.csv'
✓ FASTA format saved to 'data/step3_kinase_target_sequences.fasta'

Results summary:
Total sequences: 188
Average sequence length: 637
Sequence length range: 152 - 4128

First 5 targets:
  target_chembl_id                              target_name uniprot_accession  \
0       CHEMBL1862              Tyrosine-protein kinase ABL            P00519   
1       CHEMBL1824  Receptor protein-tyrosine kinase erbB-2            P04626   
2       CHEMBL1820                         Thymidine kinase            P06479   
3       CHEMBL3629                   Casein kinase II alpha            P68400   
4       CHEMBL2160          Nucleoside diphosphate kinase 2            P22392   

   sequence_length  
0             1130  
1             1255  
2              376  
3              391  
4              152  



