In [14]:
pip install chembl_webresource_client

Note: you may need to restart the kernel to use updated packages.


In [15]:
from chembl_webresource_client.new_client import new_client

In [16]:
import requests
import json
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

In [17]:
drugs = new_client.drug
#get the n° for step1.1
len(drugs)

15502

In [18]:
#ChEMBL ID name for step1.2
def get_chembl(drug: list):
    return drug['molecule_chembl_id']

get_chembl(drugs[0])

'CHEMBL2'

In [19]:
#year of first approval for step1.2
def get_FirstApproval(drug: list):
    return drug['first_approval']

get_FirstApproval(drugs[0])

1976

In [20]:
drugs[0]['molecule_synonyms']

[{'molecule_synonym': 'CP-12299',
  'syn_type': 'RESEARCH_CODE',
  'synonyms': 'CP-12299'},
 {'molecule_synonym': 'CP-122991',
  'syn_type': 'RESEARCH_CODE',
  'synonyms': 'CP-122991'},
 {'molecule_synonym': 'Prazosin', 'syn_type': 'FDA', 'synonyms': 'Prazosin'},
 {'molecule_synonym': 'Prazosin', 'syn_type': 'ATC', 'synonyms': 'PRAZOSIN'},
 {'molecule_synonym': 'Prazosin', 'syn_type': 'BAN', 'synonyms': 'PRAZOSIN'},
 {'molecule_synonym': 'Prazosin', 'syn_type': 'INN', 'synonyms': 'PRAZOSIN'},
 {'molecule_synonym': 'Prazosin',
  'syn_type': 'MERCK_INDEX',
  'synonyms': 'PRAZOSIN'},
 {'molecule_synonym': 'Prazosin', 'syn_type': 'OTHER', 'synonyms': 'PRAZOSIN'}]

In [21]:
#INN name to use if more than one value is available
def get_INN(drug: list):
    inn = [x['molecule_synonym'] for x in drug['molecule_synonyms'] if x['syn_type'] == 'INN']
    res = ';'.join(inn)
    return(res)

get_INN(drugs[0])

'Prazosin'

In [22]:
ids = [get_chembl(x) for x in drugs]
years = [get_FirstApproval(x) for x in drugs]
names = [get_INN(x) for x in drugs]

In [33]:
step1 = pd.DataFrame({'ID':ids, 'Name':names, 'FirstApproval':years})
step1.sort_values(by=['Name', 'FirstApproval'], axis=0, inplace=True)
print(step1)

                  ID               Name  FirstApproval
272         CHEMBL90                            1939.0
1079       CHEMBL821                            1939.0
2935       CHEMBL190                            1940.0
4656   CHEMBL1201649                            1942.0
4363   CHEMBL1201235                            1943.0
...              ...                ...            ...
1328     CHEMBL53904     Zuclopenthixol            NaN
12451  CHEMBL4105630         Zuranolone         2023.0
12919  CHEMBL4297637  Zuretinol acetate            NaN
14715  CHEMBL5095426     Zurletrectinib            NaN
12214  CHEMBL3989838       Zylofuramine            NaN

[15502 rows x 3 columns]


In [34]:
#Step2

In [35]:
approved_s2014_df = step1.query('FirstApproval >= 2014')\
                         .sort_values(by='FirstApproval', axis=0)
print(approved_s2014_df)

                  ID           Name  FirstApproval
2251    CHEMBL139367                        2014.0
8609   CHEMBL2107841    Albiglutide         2014.0
1060    CHEMBL441738  Afamelanotide         2014.0
6205   CHEMBL2103822    Tasimelteon         2014.0
10801  CHEMBL2216870     Idelalisib         2014.0
...              ...            ...            ...
11663  CHEMBL3643413     Leniolisib         2023.0
11914  CHEMBL3833321      Lecanemab         2023.0
10994  CHEMBL2397415     Zavegepant         2023.0
11927  CHEMBL3833346       Tofersen         2023.0
12451  CHEMBL4105630     Zuranolone         2023.0

[569 rows x 3 columns]


In [38]:
len(approved_s2014_df)

569

In [40]:
drugs_approved_s2024 = approved_s2014_df.ID[0:569]


comp2target = dict() #make empty dictionary
for d in drugs_approved_s2024:
    comp2target[d] = set() #dict will contain drug names as keys

# Process compounds in chunks so that we prevent huge queries
chunk_size = 50 #use larger chunk size
keys = list(comp2target.keys()) # keys (drug names) made in a list?

for i in range(0, len(keys), chunk_size):
    activities = new_client.activity.filter(molecule_chembl_id__in=keys[i:i + chunk_size]).only(
        ['molecule_chembl_id', 'target_chembl_id'])
    for act in activities:
        comp2target[act['molecule_chembl_id']].add(act['target_chembl_id']) #associates the target with the specific drug.

# We next translate the identifier of targets from ChEMBL IDs to UniProt IDs
for key, val in comp2target.items():
    lval = list(val)
    uniprots = set()
    for i in range(0, len(val), chunk_size):
        targets = new_client.target.filter(target_chembl_id__in=lval[i:i + chunk_size]).only(
            ['target_components'])
        uniprots |= {comp['accession'] for t in targets for comp in t['target_components']} 
    comp2target[key] = uniprots

KeyboardInterrupt: 

In [None]:
comp2target_df = pd.DataFrame([(i, uniprot) for i, j in comp2target.items() for uniprot in j], 
                  columns=['ID','UniProt'])
print(comp2target_df)
#for 569 drugs result is a 20205 rows dataframe -> 35.51 each

In [None]:
comp2target_df.to_csv('20230507-ChEMBL-drugs-since-2013-UniProt.tsv.gz', sep='\t')

In [None]:
#Step 3 For each protein with a UniProt accession number that you identified in step (2),retrieve UniProt keywords associated with it.

In [135]:
# Load the downloaded TSV file
#file_path = "keywords_all_2024_04_26.tsv.gz"
#df = pd.read_csv(file_path, sep="\t", compression="gzip")

In [30]:
step2_res = pd.read_csv('20230507-ChEMBL-drugs-since-2013-UniProt.tsv.gz', sep='\t', index_col=0)

In [31]:
uniq_uniprot = set(step2_res.UniProt)
sel_uniprot = list(uniq_uniprot)[:100]
sel_uniprot_queryid = ','.join(sel_uniprot) #'P24735,P30561,P16278,P34947,Q92633,P41145,O15146,Q14012,P15170,O60656,Q9HCR9,P0A3M1,P53667,Q8IWQ3,...

TypeError: sequence item 89: expected str instance, float found

In [25]:
## see UniProt API document here: https://www.ebi.ac.uk/proteins/api/doc/#!/proteins/search
url = 'https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession='+sel_uniprot_queryid
r = requests.get(url, headers={'Accept': 'application/json'})

NameError: name 'sel_uniprot_queryid' is not defined

In [26]:
## the keywords of the first protein can be found here
keywords = [list(k.values())[0] for k in r.json()[0]['keywords']]
print(keywords)

NameError: name 'r' is not defined

In [27]:
len(r.json()) #1201

NameError: name 'r' is not defined

In [28]:
## now let's try to fetch all keywords
def get_keywords(protein: list):
    res = [list(k.values())[0] for k in protein['keywords']]
    if res == None:
        res = []
    return(res)

In [156]:
all_keywords=[]
for jlist in r.json():
    all_keywords.extend(get_keywords(jlist))
    
keyword_counter = Counter(all_keywords)
print(keyword_counter)

KeyError: 'keywords'

In [None]:
most_common_keywords = keyword_counter.most_common(20)
y = [count for keyword, count in most_common_keywords]
x = [keyword for keyword, count in most_common_keywords]

plt.rcParams["figure.figsize"] = (10,7)
plt.rcParams.update({'font.size': 16})

plt.bar(x, y, color='lightblue')
plt.title("The most frequent keywords")
plt.ylabel("Frequency")
plt.xticks(rotation=90)
for i, (tag, count) in enumerate(most_common_keywords):
    plt.text(i, count, f' {count} ', rotation=90, size=16,
             ha='center', va='top' if i < 10 else 'bottom', color='white' if i < 10 else 'black')
plt.xlim(-0.6, len(x)-0.4) # optionally set tighter x lims
plt.tight_layout() # change the whitespace such that all labels fit nicely
plt.show()