# **Combining the compound fingerprint descriptors dataset and the biological activity dataset (pIC50)**

In [1]:
## Tasks to be performed:
## Step 1: Import the descriptors dataset (independent variables)
## Step 2: Treat the descriptors dataset
## Step 3: Import the dataset of "pIC50" values (dependent variable)
## Step 4: Treat the dependent variable dataset
## Step 5: Combine the two datasets (independent and dependent variables)
## Step 6: Save the final dataset

## **Step 1: import the descriptors dataset (independent variables)**

In [2]:
# Open Colab's file-upload dialog for the descriptor CSV.
# `uploaded` is not referenced again in the visible cells; the file is later
# read from disk by its filename.
# NOTE(review): google.colab is only importable inside a Colab runtime.
from google.colab import files
uploaded = files.upload()

Saving eNOS-pubchem_LEPRA.csv to eNOS-pubchem_LEPRA.csv


In [3]:
## 1.1. Viewing the imported dataset

import pandas as pd
# The path must match the name of the file uploaded in the previous cell.
df1 = pd.read_csv("eNOS-pubchem_LEPRA.csv")
# Bare last expression so the notebook renders the frame as the cell output.
df1

Unnamed: 0.1,Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,0,CHEMBL320553,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,CHEMBL149676,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,CHEMBL344282,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,CHEMBL278590,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,CHEMBL424133,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1892,1892,CHEMBL278501,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1893,1893,CHEMBL265334,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1894,1894,CHEMBL16428,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1895,1895,CHEMBL360583,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **Step 2: Treat the descriptors dataset**

In [4]:
## 2.1. Eliminating uninformative variables: "Unnamed: 0" and "Name"
# Both columns are removed in a single call; df1 is rebound to the reduced frame.
df1 = df1.drop(columns=["Unnamed: 0", "Name"])

In [5]:
## 2.2. Visualizing the descriptor dataset after removing the non-informative variables.

# Bare expression: renders the current df1 as the cell output.
df1

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1892,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1893,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1894,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1895,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## **Step 3: import the dataset of "pIC50" values (dependent variable)**

In [6]:
## 3.1: Importing the dataset containing the pIC50 values (dependent variable)
# Opens Colab's upload dialog again, this time for the bioactivity CSV;
# `uploaded` is rebound to the result of this second upload.
from google.colab import files
uploaded = files.upload()

Saving PART0 3 LEPRA_3classes.csv to PART0 3 LEPRA_3classes.csv


In [7]:
## 3.2. Viewing the dependent variable data
# The filename (including its spaces) must match the uploaded file exactly.
df2 = pd.read_csv("PART0 3 LEPRA_3classes.csv")
display(df2)

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL320553,Cc1oc(-c2ccccc2)nc1CCOc1ccc(C[C@](C)(Oc2ccccc2...,Active,457.526,5.73632,1.0,5.0,7.853872
1,1,CHEMBL149676,Cc1oc(-c2ccccc2)nc1CCOc1ccc(CC(C)(Oc2ccccc2)C(...,Intermediate,457.526,5.73632,1.0,5.0,7.744727
2,2,CHEMBL344282,Cc1oc(-c2ccccc2)nc1CCOc1ccc(CC(Oc2ccccc2)C(=O)...,Intermediate,443.499,5.34622,1.0,5.0,6.892790
3,3,CHEMBL278590,Cc1oc(C2CCCCC2)nc1CCOc1ccc(C[C@](C)(Oc2ccccc2)...,Active,463.574,6.11702,1.0,5.0,7.823909
4,4,CHEMBL424133,Cc1oc(-c2cccs2)nc1CCOc1ccc(C[C@](C)(Oc2ccccc2)...,Intermediate,463.555,5.79782,1.0,6.0,8.000000
...,...,...,...,...,...,...,...,...,...
1892,1892,CHEMBL278501,COc1ccccc1CCC1(O)C(C)=C[C@@H](OC(C)=O)[C@@]2(C...,Inactive,710.861,5.30690,1.0,11.0,6.847712
1893,1893,CHEMBL265334,CCOc1ccccc1CCC1(O)C(C)=C[C@@H](OC(C)=O)[C@@]2(...,Inactive,724.888,5.69700,1.0,11.0,7.468521
1894,1894,CHEMBL16428,CCOc1ccccc1CCC1(O)C(C)=C[C@@H](OC(C)=O)[C@@]2(...,Inactive,724.888,5.69700,1.0,11.0,7.096910
1895,1895,CHEMBL360583,COc1cccc(CCC2(O)C(C)=C[C@@H](OC(C)=O)[C@@]3(C)...,Intermediate,710.861,5.30690,1.0,11.0,7.619789


## **Step 4: Treat the dependent variable dataset**



In [8]:
## 4.1. Keep only the dependent variable
# Rebinds df2 from the full table to its single "pIC50" column.
df2 = df2.loc[:, "pIC50"]

In [9]:
## 4.2. Visualizing the dependent variable.

# df2 now holds only the "pIC50" column selected in the previous cell.
display(df2)

Unnamed: 0,pIC50
0,7.853872
1,7.744727
2,6.892790
3,7.823909
4,8.000000
...,...
1892,6.847712
1893,7.468521
1894,7.096910
1895,7.619789


## **Step 5: Combine the two datasets (independent and dependent variables)**

In [10]:
## 5.1. Observing whether the number of lines (samples) of both variables are equal
# Displays a tuple of both shapes; compare the first entry of each by eye.
df1.shape, df2.shape

((1897, 881), (1897,))

In [11]:
## 5.2. Combining the independent and dependent variables

In [12]:
# Append df2 to the right of df1's columns; pandas pairs the rows by index label.
df3 = pd.concat(objs=[df1, df2], axis="columns")

In [13]:
## 5.3. Viewing the combined data
# Bare expression: renders the concatenated frame as the cell output.
df3

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.853872
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.744727
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.892790
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.823909
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1892,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.847712
1893,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.468521
1894,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.096910
1895,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.619789


In [14]:
# Removing rows with missing values.

# dropna() returns a new frame, so df3 itself is left unchanged;
# the rows without missing values live in df3_cleaned.
df3_cleaned = df3.dropna()

In [15]:
# Inspect the result of the dropna step (df3_cleaned) rather than the
# pre-cleaning df3, so the effect of removing incomplete rows is visible.
df3_cleaned

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.853872
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.744727
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.892790
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.823909
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1892,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.847712
1893,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.468521
1894,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.096910
1895,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.619789


## **Step 6: Save the final dataset**

In [16]:
# Persist the cleaned frame. df3 still holds any rows with missing values;
# the dropna step removed them only from df3_cleaned, so saving df3 would
# silently discard that cleaning.
# The index is still written (to_csv default), so the file keeps an unnamed
# first column exactly as before.
df3_cleaned.to_csv("eNOS-dataset_final_LEPRA.csv")