<a href="https://colab.research.google.com/github/KarimeZeraik/QSAR-and-ML/blob/main/QSAR_Trypanosoma_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**PROJECT:** Drug repurposing for trypanosomiasis.

MSc. Karime Z. A. Domingues (UFPR)

***Code by Alexandre de F. Cobre*** [GitHub](https://github.com/AlexandreCOBRE/code)





# Installing the required packages:

In [None]:
!pip install fastapi kaleido python-multipart uvicorn
!pip install chembl_webresource_client


Collecting fastapi
  Downloading fastapi-0.111.1-py3-none-any.whl.metadata (26 kB)
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.9-py3-none-any.whl.metadata (2.5 kB)
Collecting uvicorn
  Downloading uvicorn-0.30.4-py3-none-any.whl.metadata (6.6 kB)
Collecting starlette<0.38.0,>=0.37.2 (from fastapi)
  Downloading starlette-0.37.2-py3-none-any.whl.metadata (5.9 kB)
Collecting fastapi-cli>=0.0.2 (from fastapi)
  Downloading fastapi_cli-0.0.4-py3-none-any.whl.metadata (7.0 kB)
Collecting httpx>=0.23.0 (from fastapi)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting email_validator>=2.0.0 (from fastapi)
  Downloading email_validator-2.2.0-py3-none-any.whl.metadata (25 kB)
Collecting h11>=0.8 (from uvicorn)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Collecting dnspython>=2.0.0 (from email_validator>=2.0.0->fastapi)
  Downloading dnspytho

# Importing necessary libraries:

In [None]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

# **DATASET SELECTION:**

ChEMBL database (https://www.ebi.ac.uk/chembl/).

### Searching for target datasets: "*Trypanosoma*"

In [None]:
# Handle to the ChEMBL target endpoint
alvo = new_client.target
# Text search for every target entry matching "Trypanosoma"
pesquisa_alvo = alvo.search('Trypanosoma')
# Load the hits into a DataFrame; the captured output shows one row per target
ds = pd.DataFrame.from_dict(pesquisa_alvo)
ds

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Trypanosoma,Trypanosoma,16.0,True,CHEMBL612883,[],ORGANISM,5690
1,[],Trypanosoma brucei,Trypanosoma brucei,14.0,False,CHEMBL612849,[],ORGANISM,5691
2,[],Trypanosoma cruzi,Trypanosoma cruzi,14.0,False,CHEMBL368,[],ORGANISM,5693
3,[],Trypanosoma evansi,Trypanosoma evansi,14.0,False,CHEMBL1075362,[],ORGANISM,5697
4,[],Trypanosoma lewisi,Trypanosoma lewisi,14.0,False,CHEMBL2366781,[],ORGANISM,5695
...,...,...,...,...,...,...,...,...,...
72,[],Trypanosoma cruzi (strain CL Brener),"Tyrosyl-tRNA synthetase, putative",7.0,False,CHEMBL4662926,"[{'accession': 'Q4DSP6', 'component_descriptio...",SINGLE PROTEIN,353153
73,[],Trypanosoma cruzi,Proteasome subunit beta,7.0,False,CHEMBL4662930,"[{'accession': 'O96673', 'component_descriptio...",SINGLE PROTEIN,5693
74,[],Trypanosoma brucei equiperdum,Cleavage and polyadenylation specificity facto...,7.0,False,CHEMBL4665581,"[{'accession': 'A0A3L6L9G6', 'component_descri...",SINGLE PROTEIN,630700
75,[],Trypanosoma congolense,Trypanothione reductase,7.0,False,CHEMBL5291562,"[{'accession': 'P13110', 'component_descriptio...",SINGLE PROTEIN,5692


### Searching the dataset for a specific target:

In [None]:
# Collect the preferred name of every target ("alvos") found by the search
# and print them so the available options can be read at a glance
lista_alvos = ds.loc[:, 'pref_name'].to_list()
print(lista_alvos)

['Trypanosoma', 'Trypanosoma brucei', 'Trypanosoma cruzi', 'Trypanosoma evansi', 'Trypanosoma lewisi', 'Trypanosoma congolense', 'Trypanosoma vivax', 'Cell membrane', 'Trypanosoma brucei brucei', 'Trypanosoma brucei gambiense', 'Trypanosoma brucei rhodesiense', 'Trypanothione reductase', 'Trypanothione reductase', 'Cruzipain', 'Adenosine transporter 1', 'Hypoxanthine-guanine phosphoribosyltransferase', 'Rhodesain', 'Nucleoside transporter 2', 'Vacuolar-type proton translocating pyrophosphatase 1', 'Hexokinase', 'Farnesyl diphosphate synthase', 'Fructose-bisphosphate aldoloase, glycosomal', 'Deoxyuridine triphosphatase', 'Trans-sialidase', 'Trypanothione reductase', 'Farnesyltransferase, putative', 'Pteridine reductase, putative', 'Dihydrofolate reductase-thymidylate synthase', '6-phospho-1-fructokinase', 'Glyceraldehyde-3-phosphate dehydrogenase, glycosomal', 'IAG-nucleoside hydrolase', 'Glucose-6-phosphate 1-dehydrogenase', 'Cathepsin B-like cysteine protease', 'Farnesyl synthetase, p

In [None]:
# Term to look for in the preferred target names
alvo = "Trypanosoma"

# Boolean mask of the rows whose pref_name contains the term.
# regex=False matches the term literally, so a term containing characters such
# as "(" or "+" is not parsed as a regular expression.
# na=False treats a missing name as "no match", which keeps the mask strictly
# boolean and therefore usable for row selection below.
contains_alvo = ds['pref_name'].str.contains(alvo, regex=False, na=False)

# Row labels of the targets whose name matched
indices_com_alvo = ds[contains_alvo].index.tolist()

if contains_alvo.any():
    print(f"At least one element contains the term: {alvo}")
    print(f"Indices of the rows with the term'{alvo}': {indices_com_alvo}")
else:
    print(f"No element contains the term: {alvo}")

At least one element contains the term: Trypanosoma
Indices of the rows with the term'Trypanosoma': [0, 1, 2, 3, 4, 5, 6, 8, 9, 10]


### Converting IC50 values to a standard concentration unit (molar - M) and generating a single dataframe:

In [None]:
# ChEMBL IDs of the targets whose IC50 data will be retrieved
ids_alvos_selecionados = [
    'CHEMBL368',
    'CHEMBL612849',
    'CHEMBL612851',
    'CHEMBL612348',
    'CHEMBL613768',
]

# Keep only the rows of the search result that belong to those targets
ensaios = ds.loc[ds['target_chembl_id'].isin(ids_alvos_selecionados)]


In [None]:
# Row labels of the selected targets; the retrieval loop uses them to look up
# each target_chembl_id in ds
indices_com_ensaio = ensaios.index
indices_com_ensaio

Index([1, 2, 8, 9, 10], dtype='int64')

In [None]:
# Published IC50 units to retrieve, mapped to the factor that rescales them to
# molar (M). Insertion order fixes the order in which the per-unit frames are
# collected for each target (nM, uM, mM, M).
FATORES_PARA_MOLAR = {"nM": 1e-9, "uM": 1e-6, "mM": 1e-3, "M": 1.0}

# The activity endpoint does not depend on the target, so it is obtained once
# instead of on every loop iteration.
atividade = new_client.activity


def buscar_ic50_em_molar(target_chembl_id, unidade, fator):
    """Fetch the IC50 activities of one target reported in `unidade`, with `value` in M.

    Parameters
    ----------
    target_chembl_id : str
        ChEMBL ID of the target whose activities are requested.
    unidade : str
        Value the activity `units` field must equal (e.g. "nM").
    fator : float
        Multiplier that converts a concentration expressed in `unidade` to molar.

    Returns
    -------
    pandas.DataFrame
        The records returned by the query. `value` is cast to float and
        multiplied by `fator`; the frame is returned as-is when the query
        yields no rows or has no `value` column.
    """
    resultado = (
        atividade
        .filter(target_chembl_id=target_chembl_id)
        .filter(standard_type="IC50")
        .filter(units=unidade)
    )
    df_unidade = pd.DataFrame.from_dict(resultado)
    if not df_unidade.empty and 'value' in df_unidade:
        df_unidade['value'] = df_unidade['value'].astype(float) * fator
    return df_unidade


# List that stores the individual DataFrames (one per target/unit pair)
dfs = []

# Iterating over the selected targets and, for each one, over the supported units:
for i in indices_com_ensaio:
    ds_selecionado_i = ds.target_chembl_id[i]
    for unidade, fator in FATORES_PARA_MOLAR.items():
        dfs.append(buscar_ic50_em_molar(ds_selecionado_i, unidade, fator))

# Concatenating the individual DataFrames into a single DataFrame:
df_assays = pd.concat(dfs, ignore_index=True)

# Every `value` was rescaled to molar above, so the unit label is overwritten to match
df_assays['units'] = 'M'

# Displaying the final DataFrame:
display(df_assays)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,773784,[],CHEMBL814775,Compound was evaluated for anti-parasitic acti...,F,,,BAO_0000190,...,Trypanosoma brucei,Trypanosoma brucei,5691,,,IC50,M,UO_0000065,,3.500000e-08
1,,,1185841,[],CHEMBL719973,In vitro inhibitory concentration against mice...,F,,,BAO_0000190,...,Trypanosoma brucei,Trypanosoma brucei,5691,,,IC50,M,UO_0000065,,5.800000e-08
2,,,1185842,[],CHEMBL719974,In vitro inhibitory concentration against mice...,F,,,BAO_0000190,...,Trypanosoma brucei,Trypanosoma brucei,5691,,,IC50,M,UO_0000065,,1.600000e-08
3,,,1185844,[],CHEMBL719976,In vitro inhibitory concentration against mous...,F,,,BAO_0000190,...,Trypanosoma brucei,Trypanosoma brucei,5691,,,IC50,M,UO_0000065,,1.650000e-08
4,,,1185893,[],CHEMBL719973,In vitro inhibitory concentration against mice...,F,,,BAO_0000190,...,Trypanosoma brucei,Trypanosoma brucei,5691,,,IC50,M,UO_0000065,,7.100000e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21639,,,25105234,[],CHEMBL5263053,Antiprotozoal activity against Trypanosoma bru...,F,,,BAO_0000190,...,Trypanosoma brucei rhodesiense,Trypanosoma brucei rhodesiense,31286,,,IC50,M,UO_0000065,,5.900000e-06
21640,,,25105235,[],CHEMBL5263053,Antiprotozoal activity against Trypanosoma bru...,F,,,BAO_0000190,...,Trypanosoma brucei rhodesiense,Trypanosoma brucei rhodesiense,31286,,,IC50,M,UO_0000065,,1.600000e-07
21641,,,25105309,[],CHEMBL5263092,Antiprotozoal activity against Trypanosoma bru...,F,,,BAO_0000190,...,Trypanosoma brucei rhodesiense,Trypanosoma brucei rhodesiense,31286,,,IC50,M,UO_0000065,,1.200000e-06
21642,,,25105346,[],CHEMBL5263119,Antiprotozoal activity against Trypanosoma bru...,F,,,BAO_0000190,...,Trypanosoma brucei rhodesiense,Trypanosoma brucei rhodesiense,31286,,,IC50,M,UO_0000065,,6.000000e-07


In [None]:
# Number of activity records that carry no IC50 `value`
df_assays["value"].isna().sum()

34

In [None]:
# Remove, in place, the activity records that have no IC50 `value`.
# The original row labels are kept, so the index has gaps after this step.
df_assays.dropna(subset=['value'], inplace=True)

In [None]:
# Display df_assays again to inspect the remaining records
df_assays

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,773784,[],CHEMBL814775,Compound was evaluated for anti-parasitic acti...,F,,,BAO_0000190,...,Trypanosoma brucei,Trypanosoma brucei,5691,,,IC50,M,UO_0000065,,3.500000e-08
1,,,1185841,[],CHEMBL719973,In vitro inhibitory concentration against mice...,F,,,BAO_0000190,...,Trypanosoma brucei,Trypanosoma brucei,5691,,,IC50,M,UO_0000065,,5.800000e-08
2,,,1185842,[],CHEMBL719974,In vitro inhibitory concentration against mice...,F,,,BAO_0000190,...,Trypanosoma brucei,Trypanosoma brucei,5691,,,IC50,M,UO_0000065,,1.600000e-08
3,,,1185844,[],CHEMBL719976,In vitro inhibitory concentration against mous...,F,,,BAO_0000190,...,Trypanosoma brucei,Trypanosoma brucei,5691,,,IC50,M,UO_0000065,,1.650000e-08
4,,,1185893,[],CHEMBL719973,In vitro inhibitory concentration against mice...,F,,,BAO_0000190,...,Trypanosoma brucei,Trypanosoma brucei,5691,,,IC50,M,UO_0000065,,7.100000e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21639,,,25105234,[],CHEMBL5263053,Antiprotozoal activity against Trypanosoma bru...,F,,,BAO_0000190,...,Trypanosoma brucei rhodesiense,Trypanosoma brucei rhodesiense,31286,,,IC50,M,UO_0000065,,5.900000e-06
21640,,,25105235,[],CHEMBL5263053,Antiprotozoal activity against Trypanosoma bru...,F,,,BAO_0000190,...,Trypanosoma brucei rhodesiense,Trypanosoma brucei rhodesiense,31286,,,IC50,M,UO_0000065,,1.600000e-07
21641,,,25105309,[],CHEMBL5263092,Antiprotozoal activity against Trypanosoma bru...,F,,,BAO_0000190,...,Trypanosoma brucei rhodesiense,Trypanosoma brucei rhodesiense,31286,,,IC50,M,UO_0000065,,1.200000e-06
21642,,,25105346,[],CHEMBL5263119,Antiprotozoal activity against Trypanosoma bru...,F,,,BAO_0000190,...,Trypanosoma brucei rhodesiense,Trypanosoma brucei rhodesiense,31286,,,IC50,M,UO_0000065,,6.000000e-07


In [None]:
# Assigning the activity class of each compound from its IC50 in molar (M).
#
# The thresholds are molar concentrations, so they must be compared with the
# `value` column, which the unit-conversion cell rescaled to M. `standard_value`
# is NOT in M (the saved outputs show standard_value 35.0 for a record whose
# converted value is 3.5e-08), so comparing it with these thresholds labelled
# such potent compounds as "Inactive".
LIMIAR_INATIVO_M = 1e-4  # IC50 above this (100 µM) -> "Inactive"
LIMIAR_ATIVO_M = 1e-5    # IC50 at or below this (10 µM) -> "Active"


def classificar_ic50(ic50_molar):
    """Return "Inactive", "Active" or "Intermediate" for an IC50 expressed in M.

    Values strictly above LIMIAR_INATIVO_M are "Inactive", values at or below
    LIMIAR_ATIVO_M are "Active", and everything in between is "Intermediate".
    """
    ic50_molar = float(ic50_molar)
    if ic50_molar > LIMIAR_INATIVO_M:
        return "Inactive"
    if ic50_molar <= LIMIAR_ATIVO_M:
        return "Active"
    return "Intermediate"


# One label per row of df_assays, in row order (kept as a list for the later zip)
classe_bioatividade = [classificar_ic50(ic50) for ic50 in df_assays['value']]

In [None]:
# ChEMBL molecule identifier of every activity record
df_assays['molecule_chembl_id']

Unnamed: 0,molecule_chembl_id
0,CHEMBL55
1,CHEMBL9126
2,CHEMBL9126
3,CHEMBL9126
4,CHEMBL267154
5,CHEMBL267154
6,CHEMBL267154
7,CHEMBL55
8,CHEMBL55
9,CHEMBL55


In [None]:
# Materialise the ChEMBL molecule IDs as a plain Python list, in row order
mol_cid = list(df_assays.molecule_chembl_id)

In [None]:
# Preview the first entries of mol_cid; echoing the whole list writes one line
# per activity record into the saved notebook
mol_cid[:10]

['CHEMBL55',
 'CHEMBL9126',
 'CHEMBL9126',
 'CHEMBL9126',
 'CHEMBL267154',
 'CHEMBL267154',
 'CHEMBL267154',
 'CHEMBL55',
 'CHEMBL55',
 'CHEMBL55',
 'CHEMBL9341',
 'CHEMBL9341',
 'CHEMBL9341',
 'CHEMBL9025',
 'CHEMBL9025',
 'CHEMBL9025',
 'CHEMBL276449',
 'CHEMBL276449',
 'CHEMBL276449',
 'CHEMBL9531',
 'CHEMBL9531',
 'CHEMBL9531',
 'CHEMBL9516',
 'CHEMBL9516',
 'CHEMBL9516',
 'CHEMBL2311124',
 'CHEMBL2311124',
 'CHEMBL2311124',
 'CHEMBL9479',
 'CHEMBL9479',
 'CHEMBL9479',
 'CHEMBL166',
 'CHEMBL9053',
 'CHEMBL9053',
 'CHEMBL9053',
 'CHEMBL25526',
 'CHEMBL25526',
 'CHEMBL25526',
 'CHEMBL9121',
 'CHEMBL9121',
 'CHEMBL9121',
 'CHEMBL269032',
 'CHEMBL269032',
 'CHEMBL269032',
 'CHEMBL9185',
 'CHEMBL9185',
 'CHEMBL9185',
 'CHEMBL266808',
 'CHEMBL266808',
 'CHEMBL266808',
 'CHEMBL9490',
 'CHEMBL9490',
 'CHEMBL9490',
 'CHEMBL9391',
 'CHEMBL9391',
 'CHEMBL9391',
 'CHEMBL8892',
 'CHEMBL8892',
 'CHEMBL8892',
 'CHEMBL55',
 'CHEMBL35241',
 'CHEMBL265502',
 'CHEMBL192069',
 'CHEMBL1254447',
 'CHEMB

In [None]:
# Canonical SMILES of every activity record as a plain Python list, in row order
canonical_smiles = [smiles for smiles in df_assays.canonical_smiles]

In [None]:
# standard_value of every activity record as a plain Python list, in row order
standard_value = list(df_assays['standard_value'])

In [None]:
# Zip the four parallel lists row by row and build the table from those tuples
colunas_df3 = [
    'molecule_chembl_id',
    'canonical_smiles',
    'classe_bioatividade',
    'standard_value',
]
dados_tupla = list(
    zip(mol_cid, canonical_smiles, classe_bioatividade, standard_value)
)
df3 = pd.DataFrame(dados_tupla, columns=colunas_df3)

In [None]:
# Preview of the assembled table
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,classe_bioatividade,standard_value
0,CHEMBL55,N=C(N)c1ccc(OCCCCCOc2ccc(C(=N)N)cc2)cc1,Inactive,35.0
1,CHEMBL9126,N=C(N)c1ccc(N2CCN(c3ccc(C(=N)N)cc3)CC2)cc1,Inactive,58.0
2,CHEMBL9126,N=C(N)c1ccc(N2CCN(c3ccc(C(=N)N)cc3)CC2)cc1,Inactive,16.0
3,CHEMBL9126,N=C(N)c1ccc(N2CCN(c3ccc(C(=N)N)cc3)CC2)cc1,Inactive,16.5
4,CHEMBL267154,C(=C1/CCCN(c2ccc(C3=NCCN3)cc2)C1)\c1ccc(C2=NCC...,Inactive,71.0
...,...,...,...,...
21605,CHEMBL2314239,O=C(Cc1ccc(O)c(O)c1)NCCCCNCCCNC(=O)Cc1ccc(O)c(...,Inactive,5900.0
21606,CHEMBL41629,C(CCCNCCCNCC1CCCCCC1)CCCNCCCNCC1CCCCCC1,Inactive,160.0
21607,CHEMBL5274443,CN(C)CCCNc1cc(Cl)ccc1Sc1ccc(C(F)(F)F)cc1,Inactive,1200.0
21608,CHEMBL5285392,Br.C[N+](C)(CCCNc1cc(Cl)ccc1Sc1ccccc1)Cc1ccc([...,Inactive,600.0


In [None]:

# Write the pre-processed table to disk; index=False leaves the row index out of the file
caminho_saida = 'dados_preprocessados.csv'
df3.to_csv(caminho_saida, index=False)
