#### Imports

In [1]:
import os
import json
import pandas as pd
import pubchempy as pcp

#### Process the dataframe

In [2]:
drugs = pd.read_csv(os.path.join('drugbank_dataframe.csv'), low_memory=False)

Selection of the columns of interest

In [3]:
sel_cols = ['H Bond Acceptor Count', 'H Bond Donor Count', 'Molecular Weight', 'logP', 'Rule of Five', 'SMILES', 'atc_code']

In [5]:
drugs_dataset = drugs[sel_cols]
drugs_dataset

Unnamed: 0,H Bond Acceptor Count,H Bond Donor Count,Molecular Weight,logP,Rule of Five,SMILES,atc_code
0,37.0,28.0,2180.2853,-0.76,0.0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,B01AE06
1,16.0,16.0,1209.3983,1.04,0.0,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,L02AE51
2,18.0,17.0,1269.4105,0.30,0.0,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,L02AE03
3,16.0,20.0,1811.2530,4.38,0.0,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,R02AB30
4,15.0,14.0,1069.2200,-1.00,0.0,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,H01BA02
...,...,...,...,...,...,...,...
12222,,,,,,,
12223,,,431.5040,,,CN1CCN(CC2=CC=C(NC(=O)C3=NNC=C3NC3=C4C=CNC4=NC...,
12224,,,394.8600,,,CC1=C2N=C(C3=CC=CC=C3Cl)C3=C(NC2=NN1)C=C(N=C3)...,
12225,,,850.7100,,,CC[C@@]1(OC(=O)C(C)ON=C2C3=C(C4=C2C=C(C=C4[N+]...,


WE make a function that obtaines the CID number of a molecule from its InChI code and map it to obtain a new column with the CID

In [6]:
def get_cid_from_inchi(inchi):
    """Function that obtains the CID of a molecule from its InChI
    Input: molecule's InChi
    Output: molecule's CID
    """
    try:
        comp = pcp.get_compounds(inchi, 'inchi')
    except:
        print('Something went wrong obtaining the CID')
        return None
    return comp[0].cid

In [20]:
# This step can take up to 2 hours
drugs_dataset['CID']= drugs['InChI'].map(get_cid_from_inchi)

Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obta

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugs_dataset_atc['CID']= drugs_atc['InChI'].map(get_cid_from_inchi)


Changing the names and order of the columns to be the same in every dataset

In [7]:
with open(os.path.join('..','dataframes_resources','dataframes_props.json')) as file:
    res = json.load(file)
column_order = res['column_order']

In [8]:
new_column_names = sorted(column_order)
new_column_names = new_column_names[0:4] + new_column_names[5:] + new_column_names[4:5]

columns = sorted(drugs_dataset.columns.to_list(), key=str.lower)


In [9]:
column_names_dict = {columns[i]: new_column_names[i] for i in range(len(columns))}

In [10]:
# Renaming
drugs_dataset = drugs_dataset.rename(columns=column_names_dict)

In [11]:
# Reorder
drugs_dataset = drugs_dataset[column_order]

KeyError: "['IsomericSMILES'] not in index"

Converting some data types

In [172]:
drugs_dataset[['RuleFive', 'CID']] = drugs_dataset[['RuleFive', 'CID']].astype('int32')

In [None]:
drugs_dataset['ATC_Code'] = drugs_dataset['ATC_Code'].str[0]

Split the data into labeled and unlabeled datasets

In [13]:
drugs_dataset_atc = drugs[drugs['ATC_Code'].isna() == False]
drugs_dataset_no_atc = drugs[drugs['ATC_Code'].isna()] 

Droping NaN

In [12]:
drugs_dataset_atc = drugs_dataset_atc.dropna()

drugs_dataset_no_atc = drugs_dataset_no_atc[
    ["CID", "HBondAcceptorCount", "HBondDonorCount", "MolecularWeight", "LogP", "RuleFive", "IsomericSMILES"]
    ].dropna(how='any')
drugs_dataset_no_atc['ATC_Code'] = None

NameError: name 'drugs_dataset_atc' is not defined

In [173]:
display(drugs_dataset_atc.info(), drugs_dataset_atc.head(3))
display(drugs_dataset_no_atc.info(), drugs_dataset_no_atc.head(3))

<class 'pandas.core.frame.DataFrame'>
Index: 2901 entries, 0 to 3023
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CID                 2901 non-null   int32  
 1   HBondAcceptorCount  2901 non-null   float64
 2   HBondDonorCount     2901 non-null   float64
 3   MolecularWeight     2901 non-null   float64
 4   LogP                2901 non-null   float64
 5   RuleFive            2901 non-null   int32  
 6   IsomericSMILES      2901 non-null   object 
 7   ATC_code            2901 non-null   object 
dtypes: float64(4), int32(2), object(2)
memory usage: 181.3+ KB


None

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,RuleFive,IsomericSMILES,ATC_code
0,101041682,37.0,28.0,2180.2853,-0.76,0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,B01AE06
1,657181,16.0,16.0,1209.3983,1.04,0,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,L02AE51
2,5311128,18.0,17.0,1269.4105,0.3,0,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,L02AE03


<class 'pandas.core.frame.DataFrame'>
Index: 8457 entries, 0 to 9197
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CID                 8457 non-null   int32  
 1   HBondAcceptorCount  8457 non-null   float64
 2   HBondDonorCount     8457 non-null   float64
 3   MolecularWeight     8457 non-null   float64
 4   LogP                8457 non-null   float64
 5   RuleFive            8457 non-null   int32  
 6   IsomericSMILES      8457 non-null   object 
 7   ATC_code            0 non-null      object 
dtypes: float64(4), int32(2), object(2)
memory usage: 528.6+ KB


None

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,RuleFive,IsomericSMILES,ATC_code
0,11979316,16.0,14.0,2140.46,-1.4,0,NCCCCC(NC(=O)C1CCCN1C(=O)C1CSSCC(N)C(=O)NC(CC2...,
1,135444742,12.0,8.0,445.4292,-0.96,0,NC1=NC(=O)C2=C(NCC(CNC3=CC=C(C=C3)C(=O)N[C@@H]...,
2,6274,4.0,3.0,155.1546,-2.7,1,N[C@@H](CC1=CNC=N1)C(O)=O,


Saving the datasets

In [174]:
drugs_dataset_no_atc.to_csv('drugbank_dataset_nolabel_clean.csv', index=False)
drugs_dataset_atc.to_csv('drugbank_dataset_label_clean.csv', index=False)