In [4]:
#Here is a comment for installing chembl_webresource_client
pip install chembl_webresource_client

Note: you may need to restart the kernel to use updated packages.


In [2]:
#importing necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors,Lipinski,rdchem

In [3]:
#searching and selecting  as the drug target
target_query = new_client.target.search("Erbb1")
targets = pd.DataFrame(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P00533', 'xref_name': None, 'xre...",Homo sapiens,Epidermal growth factor receptor erbB1,15.0,False,CHEMBL203,"[{'accession': 'P00533', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,Protein cereblon/Epidermal growth factor receptor,14.0,False,CHEMBL4523680,"[{'accession': 'P00533', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
2,[],Homo sapiens,MER intracellular domain/EGFR extracellular do...,13.0,False,CHEMBL3137284,"[{'accession': 'P00533', 'component_descriptio...",CHIMERIC PROTEIN,9606
3,[],Homo sapiens,EGFR/PPP1CA,13.0,False,CHEMBL4523747,"[{'accession': 'P00533', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
4,[],Homo sapiens,VHL/EGFR,13.0,False,CHEMBL4523998,"[{'accession': 'P00533', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
5,"[{'xref_id': 'Q01279', 'xref_name': None, 'xre...",Mus musculus,Epidermal growth factor receptor erbB1,12.0,False,CHEMBL3608,"[{'accession': 'Q01279', 'component_descriptio...",SINGLE PROTEIN,10090
6,[],Homo sapiens,Epidermal growth factor receptor and ErbB2 (HE...,10.0,False,CHEMBL2111431,"[{'accession': 'P04626', 'component_descriptio...",PROTEIN FAMILY,9606
7,[],Homo sapiens,Baculoviral IAP repeat-containing protein 2/Ep...,10.0,False,CHEMBL4802031,"[{'accession': 'P00533', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
8,[],Homo sapiens,Epidermal growth factor receptor,7.0,False,CHEMBL2363049,"[{'accession': 'P04626', 'component_descriptio...",PROTEIN FAMILY,9606


In [4]:
selected_query = targets.target_chembl_id[0]
selected_query

'CHEMBL203'

In [5]:
#selecting the desired data set
activity = new_client.activity
erbb1_ic50 = activity.filter(target_chembl_id = selected_query).filter(standard_type = "IC50")
erbb1_df = pd.DataFrame(erbb1_ic50)

In [82]:
#saving data set as csv
erbb1_df.to_csv('raw_data_erbb1_ic50.csv', index = False)

In [7]:
#removing any missing values in the "standard value" column
erbb1_df_naremove = erbb1_df[erbb1_df.standard_value.notna()]
erbb1_df_naremove.shape

(15336, 46)

In [58]:
#removing missing cananonical smiles
erbb1_df_naremove_smiles = erbb1_df_naremove[erbb1_df_naremove.canonical_smiles.notna()]
erbb1_df_naremove_smiles.shape

(15329, 46)

In [40]:
#Getting a list of columns from the erbb1_df
erbb1_df_naremove_smiles.columns

Index(['action_type', 'activity_comment', 'activity_id', 'activity_properties',
       'assay_chembl_id', 'assay_description', 'assay_type',
       'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint',
       'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment',
       'data_validity_description', 'document_chembl_id', 'document_journal',
       'document_year', 'ligand_efficiency', 'molecule_chembl_id',
       'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value',
       'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id',
       'standard_flag', 'standard_relation', 'standard_text_value',
       'standard_type', 'standard_units', 'standard_upper_value',
       'standard_value', 'target_chembl_id', 'target_organism',
       'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type',
       'units', 'uo_units', 'upper_value', 'value'],
      dtype='object')

In [59]:
erbb1_df_naremove_smiles['data_validity_comment'].value_counts()

Outside typical range            314
Potential transcription error     53
Name: data_validity_comment, dtype: int64

In [65]:
#selecting desired columns
selected_columns = ['canonical_smiles', 'molecule_chembl_id','bao_label', 'standard_units', 'standard_value','data_validity_comment']
erbb1_df_select_columns = erbb1_df_naremove_smiles[selected_columns]
erbb1_df_select_columns.head()


Unnamed: 0,canonical_smiles,molecule_chembl_id,bao_label,standard_units,standard_value,data_validity_comment
0,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,CHEMBL68920,single protein format,nM,41.0,
1,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,CHEMBL68920,cell-based format,nM,300.0,
2,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,CHEMBL68920,cell-based format,nM,7820.0,
3,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,CHEMBL69960,single protein format,nM,170.0,
4,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,CHEMBL69960,cell-based format,nM,40.0,


In [66]:
#checking out the out of range values
erbb1_df_err = erbb1_df_select_columns[erbb1_df_select_columns['data_validity_comment'] == 'Outside typical range' ]
erbb1_df_err.head()

Unnamed: 0,canonical_smiles,molecule_chembl_id,bao_label,standard_units,standard_value,data_validity_comment
7,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,CHEMBL306988,single protein format,nM,500000.0,Outside typical range
8,O=C(O)/C=C/c1ccc(O)cc1,CHEMBL66879,single protein format,nM,3000000.0,Outside typical range
11,COc1cc(/C=C(\C#N)C(=O)O)cc(OC)c1O,CHEMBL76979,single protein format,nM,264000.0,Outside typical range
13,COc1cc(C=C(C#N)C#N)cc(OC)c1,CHEMBL77724,single protein format,nM,1250000.0,Outside typical range
14,N#CC(C#N)=Cc1ccc(N2CCOCC2)cc1,CHEMBL420385,single protein format,nM,625000.0,Outside typical range


In [96]:
#checking the max and min for the standard value of erbb1_df_err
erbb1_df_err['standard_value'] = pd.to_numeric(erbb1_df_err['standard_value'])
print(erbb1_df_err['standard_value'].max())
print(erbb1_df_err['standard_value'].min())

55000000.0
5.012e-09


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  erbb1_df_err['standard_value'] = pd.to_numeric(erbb1_df_err['standard_value'])


In [68]:
#filtering for data_validity_comment = None
erbb1_df_err_rm = erbb1_df_select_columns[erbb1_df_select_columns['data_validity_comment'].isnull()]
erbb1_df_err_rm.shape

(14962, 6)

In [101]:
#looking at range of standard_values
erbb1_df_err_rm['standard_value'] = pd.to_numeric(erbb1_df_err_rm['standard_value'])
print(erbb1_df_err_rm['standard_value'].max())
print(erbb1_df_err_rm['standard_value'].min())

100000.0
0.004


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  erbb1_df_err_rm['standard_value'] = pd.to_numeric(erbb1_df_err_rm['standard_value'])


In [102]:
#looking at the bao labels (experimental design) and looking at the number of observations for each bao label
erbb1_df_err_rm['bao_label'].unique()
erbb1_df_err_rm['bao_label'].value_counts()

single protein format    7131
cell-based format        4347
assay format             2866
protein format            537
cell membrane format       42
cell-free format           39
Name: bao_label, dtype: int64

In [103]:
#Selecting 'single protein format' boa labels
erbb1_df_spf = erbb1_df_err_rm[erbb1_df_err_rm['bao_label'] == 'single protein format']
erbb1_df_spf.shape

(7131, 6)

In [104]:
#Removing duplicate compounds (molecular_chembl_id)
erbb1_spf_noduplicates_df = erbb1_df_spf.drop_duplicates(subset=['canonical_smiles'])
erbb1_spf_noduplicates_df.shape

(5385, 6)

In [105]:
#Removing duplicate compounds (canonical SMILES)
erbb1_spf_noduplicates_df = erbb1_spf_noduplicates_df.drop_duplicates(subset=['canonical_smiles'])
erbb1_spf_noduplicates_df.shape

(5385, 6)

In [106]:
#converting 'standard_value' column to numeric
erbb1_spf_noduplicates_df['standard_value']

0           41.00
3          170.00
6         9300.00
9        96000.00
10        5310.00
           ...   
16696        3.10
16706        2.00
16710       20.00
16711       10.00
16712        0.47
Name: standard_value, Length: 5385, dtype: float64

In [107]:
#creating a function to take the -log of the molar value of nM
import math

#function to convert the nM of a compound to the -log10(m)
def logm(nm):
    m  = nm/1000000000
    m = -math.log10(m)
    return m
#creating a new column called '-log(M)' which contains the -log(M) of the 'standard_value' column
erbb1_spf_noduplicates_df['-log(M)'] = erbb1_spf_noduplicates_df['standard_value'].apply(logm)

In [108]:
#check to make sure we don't have NA's in our columns
erbb1_spf_noduplicates_df.isna().sum()

canonical_smiles            0
molecule_chembl_id          0
bao_label                   0
standard_units              0
standard_value              0
data_validity_comment    5385
-log(M)                     0
dtype: int64

In [109]:
#subsetting for canonical_smiles and -log(M)
final_df = erbb1_spf_noduplicates_df[['canonical_smiles', '-log(M)']]
final_df

Unnamed: 0,canonical_smiles,-log(M)
0,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,7.387216
3,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,6.769551
6,CN(c1ccccc1)c1ncnc2ccc(N/N=N/Cc3ccccn3)cc12,5.031517
9,N#CC(C#N)=Cc1cc(O)ccc1[N+](=O)[O-],4.017729
10,Cc1cc(C(=O)NCCN2CCOCC2)[nH]c1/C=C1\C(=O)N(C)c2...,5.274905
...,...,...
16696,CC(=O)N[C@H]1CC[C@@H](N2C(=O)N(c3ccccc3Cl)Cc3c...,8.508638
16706,Cc1ccc2c(-c3ccccc3)nc(=O)n(C(C)C)c2c1,8.698970
16710,COc1cc2ncnc(Oc3cccc(NC(=S)Nc4ccc(Cl)c(C(F)(F)F...,7.698970
16711,COc1cc2ncnc(Sc3cccc(NC(=S)Nc4ccc(Br)c(C(F)(F)F...,8.000000
